Diffstat (limited to 'libs/pixman-0.40.0/pixman')
69 files changed, 58220 insertions, 0 deletions
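Note on the Makefile.am hunk further below: each SIMD backend is compiled as a noinst libtool convenience library with its own CFLAGS and then folded into libpixman-1.la. A minimal Automake sketch of that pattern (the "example" names are placeholders; USE_SSE2 and SSE2_CFLAGS are the conditional and flags variable defined in configure.ac, as in pixman's build):

# Sketch only -- "libexample" and example*.c are hypothetical stand-ins.
# The conditional, per-backend CFLAGS and LIBADD wiring mirror the
# Makefile.am hunk below; USE_SSE2 is an AM_CONDITIONAL from configure.ac.
lib_LTLIBRARIES = libexample-1.la
libexample_1_la_SOURCES = example.c
libexample_1_la_LIBADD =
noinst_LTLIBRARIES =

if USE_SSE2
noinst_LTLIBRARIES += libexample-sse2.la
libexample_sse2_la_SOURCES = example-sse2.c
libexample_sse2_la_CFLAGS = $(SSE2_CFLAGS)
libexample_1_la_LIBADD += libexample-sse2.la
endif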
diff --git a/libs/pixman-0.40.0/pixman/CMakeLists.txt b/libs/pixman-0.40.0/pixman/CMakeLists.txt
new file mode 100644
index 0000000..8b3ab2f
--- /dev/null
+++ b/libs/pixman-0.40.0/pixman/CMakeLists.txt
@@ -0,0 +1,121 @@
+set(SOURCES
+    pixman.c
+    pixman-access.c
+    pixman-access-accessors.c
+    pixman-bits-image.c
+    pixman-combine32.c
+    pixman-combine-float.c
+    pixman-conical-gradient.c
+    pixman-filter.c
+    pixman-x86.c
+    pixman-mips.c
+    pixman-arm.c
+    pixman-ppc.c
+    pixman-edge.c
+    pixman-edge-accessors.c
+    pixman-fast-path.c
+    pixman-glyph.c
+    pixman-general.c
+    pixman-gradient-walker.c
+    pixman-image.c
+    pixman-implementation.c
+    pixman-linear-gradient.c
+    pixman-matrix.c
+    pixman-noop.c
+    pixman-radial-gradient.c
+    pixman-region16.c
+    pixman-region32.c
+    pixman-solid-fill.c
+    pixman-timer.c
+    pixman-trap.c
+    pixman-utils.c
+)
+
+set(HEADERS
+    pixman.h
+    pixman-accessor.h
+    pixman-combine32.h
+    pixman-compiler.h
+    pixman-edge-imp.h
+    pixman-inlines.h
+    pixman-private.h
+)
+
+# processor optimizations
+if(ARM)
+    if (ARM_NEON)
+        add_definitions(-DUSE_ARM_NEON)
+        list(APPEND SOURCES pixman-arm-neon.c
+                            pixman-arm-neon-asm.S
+                            pixman-arm-neon-asm-bilinear.S
+        )
+    endif (ARM_NEON)
+    if (ARM_SIMD)
+        add_definitions(-DUSE_ARM_SIMD)
+        list(APPEND SOURCES pixman-arm-simd.c
+                            pixman-arm-simd-asm.S
+                            pixman-arm-simd-asm-scaled.S
+        )
+    endif (ARM_SIMD)
+    if (ARM_IWMMXT)
+        add_definitions(-DUSE_ARM_IWMMXT)
+        list(APPEND SOURCES "pixman-mmx.c")
+    endif()
+endif(ARM)
+
+if(MIPS)
+    if (MIPS_DSPR2)
+        add_definitions(-DUSE_MIPS_DSPR2)
+        list(APPEND SOURCES pixman-mips-dspr2.c
+                            pixman-mips-dspr2-asm.S
+                            pixman-mips-memcpy-asm.S
+        )
+    endif (MIPS_DSPR2)
+    if (MIPS_LOONGSON_MMI)
+        add_definitions(-DUSE_LOONGSON_MMI)
+        list(APPEND SOURCES "pixman-mmx.c")
+    endif()
+endif(MIPS)
+
+if(PPC)
+    if (PPC_VMX)
+        add_definitions(-DUSE_VMX)
+        list(APPEND SOURCES "pixman-vmx.c")
+    endif (PPC_VMX)
+endif(PPC)
+
+if(X86)
+    if (X86_MMX)
+        add_definitions(-DUSE_X86_MMX)
+        list(APPEND SOURCES "pixman-mmx.c")
+    endif (X86_MMX)
+    if (X86_SSE2)
+        add_definitions(-DUSE_SSE2)
+        list(APPEND SOURCES "pixman-sse2.c")
+    endif (X86_SSE2)
+    if (X86_SSSE3)
+        add_definitions(-DUSE_SSSE3)
+        list(APPEND SOURCES "pixman-ssse3.c")
+    endif(X86_SSSE3)
+endif(X86)
+
+IF(UNIX)
+    set(CMAKE_C_FLAGS -fPIC)
+ENDIF(UNIX)
+
+add_library (pixman-1_core OBJECT ${SOURCES} ${HEADERS})
+
+if(BUILD_SHARED)
+    add_library(pixman-1 SHARED $<TARGET_OBJECTS:pixman-1_core>)
+    install(TARGETS pixman-1 EXPORT PixmanTargets RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib)
+endif()
+
+if(BUILD_STATIC)
+    add_library(pixman-1_static STATIC $<TARGET_OBJECTS:pixman-1_core>)
+    if(NOT MSVC)
+        set_target_properties(pixman-1_static PROPERTIES OUTPUT_NAME pixman-1)
+    endif(NOT MSVC)
+    install(TARGETS pixman-1_static EXPORT PixmanTargets RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib)
+endif(BUILD_STATIC)
+
+install(FILES pixman.h ${CMAKE_CURRENT_BINARY_DIR}/pixman-version.h DESTINATION include/pixman-1)
diff --git 
a/libs/pixman-0.40.0/pixman/Makefile.am b/libs/pixman-0.40.0/pixman/Makefile.am new file mode 100644 index 0000000..8f780a1 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/Makefile.am @@ -0,0 +1,143 @@ +include $(top_srcdir)/pixman/Makefile.sources + +lib_LTLIBRARIES = libpixman-1.la + +libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined @PTHREAD_LDFLAGS@  +libpixman_1_la_LIBADD = @PTHREAD_LIBS@ -lm +libpixman_1_la_SOURCES = $(libpixman_sources) $(libpixman_headers) + +libpixmanincludedir = $(includedir)/pixman-1 +libpixmaninclude_HEADERS = pixman.h pixman-version.h +noinst_LTLIBRARIES =  + +EXTRA_DIST =				\ +	Makefile.win32			\ +	dither/make-blue-noise.c	\ +	pixman-region.c			\ +	solaris-hwcap.mapfile		\ +	meson.build			\ +	$(NULL) + +# mmx code +if USE_X86_MMX +noinst_LTLIBRARIES += libpixman-mmx.la +libpixman_mmx_la_SOURCES = \ +	pixman-mmx.c +libpixman_mmx_la_CFLAGS = $(MMX_CFLAGS) +libpixman_1_la_LDFLAGS += $(MMX_LDFLAGS) +libpixman_1_la_LIBADD += libpixman-mmx.la + +ASM_CFLAGS_mmx=$(MMX_CFLAGS) +endif + +# vmx code +if USE_VMX +noinst_LTLIBRARIES += libpixman-vmx.la +libpixman_vmx_la_SOURCES = \ +	pixman-vmx.c \ +	pixman-combine32.h +libpixman_vmx_la_CFLAGS = $(VMX_CFLAGS) +libpixman_1_la_LIBADD += libpixman-vmx.la + +ASM_CFLAGS_vmx=$(VMX_CFLAGS) +endif + +# sse2 code +if USE_SSE2 +noinst_LTLIBRARIES += libpixman-sse2.la +libpixman_sse2_la_SOURCES = \ +	pixman-sse2.c +libpixman_sse2_la_CFLAGS = $(SSE2_CFLAGS) +libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS) +libpixman_1_la_LIBADD += libpixman-sse2.la + +ASM_CFLAGS_sse2=$(SSE2_CFLAGS) +endif + +# ssse3 code +if USE_SSSE3 +noinst_LTLIBRARIES += libpixman-ssse3.la +libpixman_ssse3_la_SOURCES = \ +	pixman-ssse3.c +libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS) +libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS) +libpixman_1_la_LIBADD += libpixman-ssse3.la + +ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS) +endif + +# arm simd code +if USE_ARM_SIMD +noinst_LTLIBRARIES += libpixman-arm-simd.la +libpixman_arm_simd_la_SOURCES = \ +	pixman-arm-simd.c	\ +	pixman-arm-common.h	\ +	pixman-arm-simd-asm.S   \ +	pixman-arm-simd-asm-scaled.S \ +	pixman-arm-asm.h	\ +	pixman-arm-simd-asm.h +libpixman_1_la_LIBADD += libpixman-arm-simd.la + +ASM_CFLAGS_arm_simd= +endif + +# arm neon code +if USE_ARM_NEON +noinst_LTLIBRARIES += libpixman-arm-neon.la +libpixman_arm_neon_la_SOURCES = \ +        pixman-arm-neon.c	\ +        pixman-arm-common.h	\ +        pixman-arm-neon-asm.S	\ +		pixman-arm-neon-asm-bilinear.S \ +        pixman-arm-asm.h	\ +        pixman-arm-neon-asm.h +libpixman_1_la_LIBADD += libpixman-arm-neon.la + +ASM_CFLAGS_arm_neon= +endif + +# iwmmxt code +if USE_ARM_IWMMXT +libpixman_iwmmxt_la_SOURCES = pixman-mmx.c +noinst_LTLIBRARIES += libpixman-iwmmxt.la +libpixman_1_la_LIBADD += libpixman-iwmmxt.la + +libpixman_iwmmxt_la-pixman-mmx.lo: pixman-mmx.c +	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(AM_CPPFLAGS) $(AM_CPPFLAGS) $(CPPFLAGS) $(CFLAGS) $(IWMMXT_CFLAGS) -MT libpixman_iwmmxt_la-pixman-mmx.lo -MD -MP -MF $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Tpo -c -o libpixman_iwmmxt_la-pixman-mmx.lo `test -f 'pixman-mmx.c' || echo '$(srcdir)/'`pixman-mmx.c +	$(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Tpo $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Plo + +libpixman_iwmmxt_la_DEPENDENCIES = $(am__DEPENDENCIES_1) +libpixman_iwmmxt_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ +        $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ +	$(CFLAGS) $(IWMMXT_CFLAGS) 
$(AM_LDFLAGS) \ +	$(LDFLAGS) -o $@ + +libpixman-iwmmxt.la: libpixman_iwmmxt_la-pixman-mmx.lo $(libpixman_iwmmxt_la_DEPENDENCIES)  +	$(AM_V_CCLD)$(libpixman_iwmmxt_la_LINK) libpixman_iwmmxt_la-pixman-mmx.lo $(libpixman_iwmmxt_la_LIBADD) $(LIBS) +endif + +# mips dspr2 code +if USE_MIPS_DSPR2 +noinst_LTLIBRARIES += libpixman-mips-dspr2.la +libpixman_mips_dspr2_la_SOURCES = \ +        pixman-mips-dspr2.c \ +        pixman-mips-dspr2.h \ +        pixman-mips-dspr2-asm.S \ +        pixman-mips-dspr2-asm.h \ +        pixman-mips-memcpy-asm.S +libpixman_1_la_LIBADD += libpixman-mips-dspr2.la + +ASM_CFLAGS_mips_dspr2= +endif + +# loongson code +if USE_LOONGSON_MMI +noinst_LTLIBRARIES += libpixman-loongson-mmi.la +libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h +libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS) +libpixman_1_la_LDFLAGS += $(LS_LDFLAGS) +libpixman_1_la_LIBADD += libpixman-loongson-mmi.la +endif + +.c.s : $(libpixmaninclude_HEADERS) +	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $< diff --git a/libs/pixman-0.40.0/pixman/Makefile.in b/libs/pixman-0.40.0/pixman/Makefile.in new file mode 100644 index 0000000..47139be --- /dev/null +++ b/libs/pixman-0.40.0/pixman/Makefile.in @@ -0,0 +1,1274 @@ +# Makefile.in generated by automake 1.16.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2018 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + + +VPATH = @srcdir@ +am__is_gnu_make = { \ +  if test -z '$(MAKELEVEL)'; then \ +    false; \ +  elif test -n '$(MAKE_HOST)'; then \ +    true; \ +  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ +    true; \ +  else \ +    false; \ +  fi; \ +} +am__make_running_with_option = \ +  case $${target_option-} in \ +      ?) 
;; \ +      *) echo "am__make_running_with_option: internal error: invalid" \ +              "target option '$${target_option-}' specified" >&2; \ +         exit 1;; \ +  esac; \ +  has_opt=no; \ +  sane_makeflags=$$MAKEFLAGS; \ +  if $(am__is_gnu_make); then \ +    sane_makeflags=$$MFLAGS; \ +  else \ +    case $$MAKEFLAGS in \ +      *\\[\ \	]*) \ +        bs=\\; \ +        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ +          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \ +    esac; \ +  fi; \ +  skip_next=no; \ +  strip_trailopt () \ +  { \ +    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ +  }; \ +  for flg in $$sane_makeflags; do \ +    test $$skip_next = yes && { skip_next=no; continue; }; \ +    case $$flg in \ +      *=*|--*) continue;; \ +        -*I) strip_trailopt 'I'; skip_next=yes;; \ +      -*I?*) strip_trailopt 'I';; \ +        -*O) strip_trailopt 'O'; skip_next=yes;; \ +      -*O?*) strip_trailopt 'O';; \ +        -*l) strip_trailopt 'l'; skip_next=yes;; \ +      -*l?*) strip_trailopt 'l';; \ +      -[dEDm]) skip_next=yes;; \ +      -[JT]) skip_next=yes;; \ +    esac; \ +    case $$flg in \ +      *$$target_option*) has_opt=yes; break;; \ +    esac; \ +  done; \ +  test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ + +# mmx code +@USE_X86_MMX_TRUE@am__append_1 = libpixman-mmx.la +@USE_X86_MMX_TRUE@am__append_2 = $(MMX_LDFLAGS) +@USE_X86_MMX_TRUE@am__append_3 = libpixman-mmx.la + +# vmx code +@USE_VMX_TRUE@am__append_4 = libpixman-vmx.la +@USE_VMX_TRUE@am__append_5 = libpixman-vmx.la + +# sse2 code +@USE_SSE2_TRUE@am__append_6 = libpixman-sse2.la +@USE_SSE2_TRUE@am__append_7 = $(SSE2_LDFLAGS) +@USE_SSE2_TRUE@am__append_8 = libpixman-sse2.la + +# ssse3 code +@USE_SSSE3_TRUE@am__append_9 = libpixman-ssse3.la +@USE_SSSE3_TRUE@am__append_10 = $(SSSE3_LDFLAGS) +@USE_SSSE3_TRUE@am__append_11 = libpixman-ssse3.la + +# arm simd code +@USE_ARM_SIMD_TRUE@am__append_12 = libpixman-arm-simd.la +@USE_ARM_SIMD_TRUE@am__append_13 = libpixman-arm-simd.la + +# arm neon code +@USE_ARM_NEON_TRUE@am__append_14 = libpixman-arm-neon.la +@USE_ARM_NEON_TRUE@am__append_15 = libpixman-arm-neon.la +@USE_ARM_IWMMXT_TRUE@am__append_16 = libpixman-iwmmxt.la +@USE_ARM_IWMMXT_TRUE@am__append_17 = libpixman-iwmmxt.la + +# mips dspr2 code +@USE_MIPS_DSPR2_TRUE@am__append_18 = libpixman-mips-dspr2.la +@USE_MIPS_DSPR2_TRUE@am__append_19 = libpixman-mips-dspr2.la + +# loongson code +@USE_LOONGSON_MMI_TRUE@am__append_20 = libpixman-loongson-mmi.la +@USE_LOONGSON_MMI_TRUE@am__append_21 = $(LS_LDFLAGS) +@USE_LOONGSON_MMI_TRUE@am__append_22 = libpixman-loongson-mmi.la +subdir = pixman +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ +	$(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(libpixmaninclude_HEADERS) \ +	$(am__DIST_COMMON) 
+mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = pixman-version.h +CONFIG_CLEAN_VPATH_FILES = +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ +    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ +    *) f=$$p;; \ +  esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ +  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ +  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ +  for p in $$list; do echo "$$p $$p"; done | \ +  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ +  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ +    if (++n[$$2] == $(am__install_max)) \ +      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ +    END { for (dir in files) print dir, files[dir] }' +am__base_list = \ +  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ +  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir = { \ +  test -z "$$files" \ +    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ +    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ +         $(am__cd) "$$dir" && rm -f $$files; }; \ +  } +am__installdirs = "$(DESTDIR)$(libdir)" \ +	"$(DESTDIR)$(libpixmanincludedir)" +LTLIBRARIES = $(lib_LTLIBRARIES) $(noinst_LTLIBRARIES) +libpixman_1_la_DEPENDENCIES = $(am__append_3) $(am__append_5) \ +	$(am__append_8) $(am__append_11) $(am__append_13) \ +	$(am__append_15) $(am__append_17) $(am__append_19) \ +	$(am__append_22) +am__objects_1 = pixman.lo pixman-access.lo pixman-access-accessors.lo \ +	pixman-bits-image.lo pixman-combine32.lo \ +	pixman-combine-float.lo pixman-conical-gradient.lo \ +	pixman-filter.lo pixman-x86.lo pixman-mips.lo pixman-arm.lo \ +	pixman-ppc.lo pixman-edge.lo pixman-edge-accessors.lo \ +	pixman-fast-path.lo pixman-glyph.lo pixman-general.lo \ +	pixman-gradient-walker.lo pixman-image.lo \ +	pixman-implementation.lo pixman-linear-gradient.lo \ +	pixman-matrix.lo pixman-noop.lo pixman-radial-gradient.lo \ +	pixman-region16.lo pixman-region32.lo pixman-solid-fill.lo \ +	pixman-timer.lo pixman-trap.lo pixman-utils.lo +am__objects_2 = +am_libpixman_1_la_OBJECTS = $(am__objects_1) $(am__objects_2) +libpixman_1_la_OBJECTS = $(am_libpixman_1_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 =  +libpixman_1_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ +	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ +	$(AM_CFLAGS) $(CFLAGS) $(libpixman_1_la_LDFLAGS) $(LDFLAGS) -o \ +	$@ +libpixman_arm_neon_la_LIBADD = +am__libpixman_arm_neon_la_SOURCES_DIST = pixman-arm-neon.c \ +	pixman-arm-common.h pixman-arm-neon-asm.S \ +	pixman-arm-neon-asm-bilinear.S pixman-arm-asm.h \ +	pixman-arm-neon-asm.h +@USE_ARM_NEON_TRUE@am_libpixman_arm_neon_la_OBJECTS =  \ +@USE_ARM_NEON_TRUE@	pixman-arm-neon.lo pixman-arm-neon-asm.lo \ +@USE_ARM_NEON_TRUE@	pixman-arm-neon-asm-bilinear.lo +libpixman_arm_neon_la_OBJECTS = $(am_libpixman_arm_neon_la_OBJECTS) +@USE_ARM_NEON_TRUE@am_libpixman_arm_neon_la_rpath = +libpixman_arm_simd_la_LIBADD = +am__libpixman_arm_simd_la_SOURCES_DIST = pixman-arm-simd.c \ +	pixman-arm-common.h pixman-arm-simd-asm.S \ +	pixman-arm-simd-asm-scaled.S pixman-arm-asm.h \ +	pixman-arm-simd-asm.h +@USE_ARM_SIMD_TRUE@am_libpixman_arm_simd_la_OBJECTS =  \ 
+@USE_ARM_SIMD_TRUE@	pixman-arm-simd.lo pixman-arm-simd-asm.lo \ +@USE_ARM_SIMD_TRUE@	pixman-arm-simd-asm-scaled.lo +libpixman_arm_simd_la_OBJECTS = $(am_libpixman_arm_simd_la_OBJECTS) +@USE_ARM_SIMD_TRUE@am_libpixman_arm_simd_la_rpath = +libpixman_iwmmxt_la_LIBADD = +am__libpixman_iwmmxt_la_SOURCES_DIST = pixman-mmx.c +@USE_ARM_IWMMXT_TRUE@am_libpixman_iwmmxt_la_OBJECTS = pixman-mmx.lo +libpixman_iwmmxt_la_OBJECTS = $(am_libpixman_iwmmxt_la_OBJECTS) +@USE_ARM_IWMMXT_TRUE@am_libpixman_iwmmxt_la_rpath = +libpixman_loongson_mmi_la_LIBADD = +am__libpixman_loongson_mmi_la_SOURCES_DIST = pixman-mmx.c \ +	loongson-mmintrin.h +@USE_LOONGSON_MMI_TRUE@am_libpixman_loongson_mmi_la_OBJECTS = libpixman_loongson_mmi_la-pixman-mmx.lo +libpixman_loongson_mmi_la_OBJECTS =  \ +	$(am_libpixman_loongson_mmi_la_OBJECTS) +libpixman_loongson_mmi_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ +	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ +	$(libpixman_loongson_mmi_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ +	$(LDFLAGS) -o $@ +@USE_LOONGSON_MMI_TRUE@am_libpixman_loongson_mmi_la_rpath = +libpixman_mips_dspr2_la_LIBADD = +am__libpixman_mips_dspr2_la_SOURCES_DIST = pixman-mips-dspr2.c \ +	pixman-mips-dspr2.h pixman-mips-dspr2-asm.S \ +	pixman-mips-dspr2-asm.h pixman-mips-memcpy-asm.S +@USE_MIPS_DSPR2_TRUE@am_libpixman_mips_dspr2_la_OBJECTS =  \ +@USE_MIPS_DSPR2_TRUE@	pixman-mips-dspr2.lo \ +@USE_MIPS_DSPR2_TRUE@	pixman-mips-dspr2-asm.lo \ +@USE_MIPS_DSPR2_TRUE@	pixman-mips-memcpy-asm.lo +libpixman_mips_dspr2_la_OBJECTS =  \ +	$(am_libpixman_mips_dspr2_la_OBJECTS) +@USE_MIPS_DSPR2_TRUE@am_libpixman_mips_dspr2_la_rpath = +libpixman_mmx_la_LIBADD = +am__libpixman_mmx_la_SOURCES_DIST = pixman-mmx.c +@USE_X86_MMX_TRUE@am_libpixman_mmx_la_OBJECTS =  \ +@USE_X86_MMX_TRUE@	libpixman_mmx_la-pixman-mmx.lo +libpixman_mmx_la_OBJECTS = $(am_libpixman_mmx_la_OBJECTS) +libpixman_mmx_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ +	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ +	$(libpixman_mmx_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \ +	-o $@ +@USE_X86_MMX_TRUE@am_libpixman_mmx_la_rpath = +libpixman_sse2_la_LIBADD = +am__libpixman_sse2_la_SOURCES_DIST = pixman-sse2.c +@USE_SSE2_TRUE@am_libpixman_sse2_la_OBJECTS =  \ +@USE_SSE2_TRUE@	libpixman_sse2_la-pixman-sse2.lo +libpixman_sse2_la_OBJECTS = $(am_libpixman_sse2_la_OBJECTS) +libpixman_sse2_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ +	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ +	$(libpixman_sse2_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \ +	-o $@ +@USE_SSE2_TRUE@am_libpixman_sse2_la_rpath = +libpixman_ssse3_la_LIBADD = +am__libpixman_ssse3_la_SOURCES_DIST = pixman-ssse3.c +@USE_SSSE3_TRUE@am_libpixman_ssse3_la_OBJECTS =  \ +@USE_SSSE3_TRUE@	libpixman_ssse3_la-pixman-ssse3.lo +libpixman_ssse3_la_OBJECTS = $(am_libpixman_ssse3_la_OBJECTS) +libpixman_ssse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ +	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ +	$(libpixman_ssse3_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ +	$(LDFLAGS) -o $@ +@USE_SSSE3_TRUE@am_libpixman_ssse3_la_rpath = +libpixman_vmx_la_LIBADD = +am__libpixman_vmx_la_SOURCES_DIST = pixman-vmx.c pixman-combine32.h +@USE_VMX_TRUE@am_libpixman_vmx_la_OBJECTS =  \ +@USE_VMX_TRUE@	libpixman_vmx_la-pixman-vmx.lo +libpixman_vmx_la_OBJECTS = $(am_libpixman_vmx_la_OBJECTS) +libpixman_vmx_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ +	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ +	$(libpixman_vmx_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \ +	-o $@ +@USE_VMX_TRUE@am_libpixman_vmx_la_rpath 
= +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo "  GEN     " $@; +am__v_GEN_1 =  +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 =  +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__maybe_remake_depfiles = depfiles +am__depfiles_remade =  \ +	./$(DEPDIR)/libpixman_loongson_mmi_la-pixman-mmx.Plo \ +	./$(DEPDIR)/libpixman_mmx_la-pixman-mmx.Plo \ +	./$(DEPDIR)/libpixman_sse2_la-pixman-sse2.Plo \ +	./$(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Plo \ +	./$(DEPDIR)/libpixman_vmx_la-pixman-vmx.Plo \ +	./$(DEPDIR)/pixman-access-accessors.Plo \ +	./$(DEPDIR)/pixman-access.Plo \ +	./$(DEPDIR)/pixman-arm-neon-asm-bilinear.Plo \ +	./$(DEPDIR)/pixman-arm-neon-asm.Plo \ +	./$(DEPDIR)/pixman-arm-neon.Plo \ +	./$(DEPDIR)/pixman-arm-simd-asm-scaled.Plo \ +	./$(DEPDIR)/pixman-arm-simd-asm.Plo \ +	./$(DEPDIR)/pixman-arm-simd.Plo ./$(DEPDIR)/pixman-arm.Plo \ +	./$(DEPDIR)/pixman-bits-image.Plo \ +	./$(DEPDIR)/pixman-combine-float.Plo \ +	./$(DEPDIR)/pixman-combine32.Plo \ +	./$(DEPDIR)/pixman-conical-gradient.Plo \ +	./$(DEPDIR)/pixman-edge-accessors.Plo \ +	./$(DEPDIR)/pixman-edge.Plo ./$(DEPDIR)/pixman-fast-path.Plo \ +	./$(DEPDIR)/pixman-filter.Plo ./$(DEPDIR)/pixman-general.Plo \ +	./$(DEPDIR)/pixman-glyph.Plo \ +	./$(DEPDIR)/pixman-gradient-walker.Plo \ +	./$(DEPDIR)/pixman-image.Plo \ +	./$(DEPDIR)/pixman-implementation.Plo \ +	./$(DEPDIR)/pixman-linear-gradient.Plo \ +	./$(DEPDIR)/pixman-matrix.Plo \ +	./$(DEPDIR)/pixman-mips-dspr2-asm.Plo \ +	./$(DEPDIR)/pixman-mips-dspr2.Plo \ +	./$(DEPDIR)/pixman-mips-memcpy-asm.Plo \ +	./$(DEPDIR)/pixman-mips.Plo ./$(DEPDIR)/pixman-mmx.Plo \ +	./$(DEPDIR)/pixman-noop.Plo ./$(DEPDIR)/pixman-ppc.Plo \ +	./$(DEPDIR)/pixman-radial-gradient.Plo \ +	./$(DEPDIR)/pixman-region16.Plo \ +	./$(DEPDIR)/pixman-region32.Plo \ +	./$(DEPDIR)/pixman-solid-fill.Plo ./$(DEPDIR)/pixman-timer.Plo \ +	./$(DEPDIR)/pixman-trap.Plo ./$(DEPDIR)/pixman-utils.Plo \ +	./$(DEPDIR)/pixman-x86.Plo ./$(DEPDIR)/pixman.Plo +am__mv = mv -f +CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ +	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS) +LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ +	$(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \ +	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ +	$(AM_CCASFLAGS) $(CCASFLAGS) +AM_V_CPPAS = $(am__v_CPPAS_@AM_V@) +am__v_CPPAS_ = $(am__v_CPPAS_@AM_DEFAULT_V@) +am__v_CPPAS_0 = @echo "  CPPAS   " $@; +am__v_CPPAS_1 =  +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ +	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ +	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ +	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ +	$(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo "  CC      " $@; +am__v_CC_1 =  +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ +	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ +	$(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo "  CCLD    " $@; +am__v_CCLD_1 =  +SOURCES = $(libpixman_1_la_SOURCES) $(libpixman_arm_neon_la_SOURCES) \ +	$(libpixman_arm_simd_la_SOURCES) \ +	
$(libpixman_iwmmxt_la_SOURCES) \ +	$(libpixman_loongson_mmi_la_SOURCES) \ +	$(libpixman_mips_dspr2_la_SOURCES) $(libpixman_mmx_la_SOURCES) \ +	$(libpixman_sse2_la_SOURCES) $(libpixman_ssse3_la_SOURCES) \ +	$(libpixman_vmx_la_SOURCES) +DIST_SOURCES = $(libpixman_1_la_SOURCES) \ +	$(am__libpixman_arm_neon_la_SOURCES_DIST) \ +	$(am__libpixman_arm_simd_la_SOURCES_DIST) \ +	$(am__libpixman_iwmmxt_la_SOURCES_DIST) \ +	$(am__libpixman_loongson_mmi_la_SOURCES_DIST) \ +	$(am__libpixman_mips_dspr2_la_SOURCES_DIST) \ +	$(am__libpixman_mmx_la_SOURCES_DIST) \ +	$(am__libpixman_sse2_la_SOURCES_DIST) \ +	$(am__libpixman_ssse3_la_SOURCES_DIST) \ +	$(am__libpixman_vmx_la_SOURCES_DIST) +am__can_run_installinfo = \ +  case $$AM_UPDATE_INFO_DIR in \ +    n|no|NO) false;; \ +    *) (install-info --version) >/dev/null 2>&1;; \ +  esac +HEADERS = $(libpixmaninclude_HEADERS) +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates.  Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ +  BEGIN { nonempty = 0; } \ +  { items[$$0] = 1; nonempty = 1; } \ +  END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique.  This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ +  list='$(am__tagged_files)'; \ +  unique=`for i in $$list; do \ +    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ +  done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/pixman-version.h.in \ +	$(top_srcdir)/depcomp $(top_srcdir)/pixman/Makefile.sources +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +GTK_CFLAGS = @GTK_CFLAGS@ +GTK_LIBS = @GTK_LIBS@ +HAVE_LIBPNG = @HAVE_LIBPNG@ +HAVE_PTHREADS = @HAVE_PTHREADS@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +IWMMXT_CFLAGS = @IWMMXT_CFLAGS@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LS_CFLAGS = @LS_CFLAGS@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +LT_VERSION_INFO = @LT_VERSION_INFO@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +MMX_CFLAGS = @MMX_CFLAGS@ +MMX_LDFLAGS = @MMX_LDFLAGS@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OPENMP_CFLAGS = @OPENMP_CFLAGS@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = 
@PATH_SEPARATOR@ +PIXMAN_GNUPLOT = @PIXMAN_GNUPLOT@ +PIXMAN_TIMERS = @PIXMAN_TIMERS@ +PIXMAN_VERSION_MAJOR = @PIXMAN_VERSION_MAJOR@ +PIXMAN_VERSION_MICRO = @PIXMAN_VERSION_MICRO@ +PIXMAN_VERSION_MINOR = @PIXMAN_VERSION_MINOR@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +PNG_CFLAGS = @PNG_CFLAGS@ +PNG_LIBS = @PNG_LIBS@ +PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ +PTHREAD_LDFLAGS = @PTHREAD_LDFLAGS@ +PTHREAD_LIBS = @PTHREAD_LIBS@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +SSE2_CFLAGS = @SSE2_CFLAGS@ +SSE2_LDFLAGS = @SSE2_LDFLAGS@ +SSSE3_CFLAGS = @SSSE3_CFLAGS@ +STRIP = @STRIP@ +TESTPROGS_EXTRA_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@ +TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR = @TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR@ +TOOLCHAIN_SUPPORTS__THREAD = @TOOLCHAIN_SUPPORTS__THREAD@ +VERSION = @VERSION@ +VMX_CFLAGS = @VMX_CFLAGS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +libpixman_sources = \ +	pixman.c			\ +	pixman-access.c			\ +	pixman-access-accessors.c	\ +	pixman-bits-image.c		\ +	pixman-combine32.c		\ +	pixman-combine-float.c		\ +	pixman-conical-gradient.c	\ +	pixman-filter.c			\ +	pixman-x86.c			\ +	pixman-mips.c			\ +	pixman-arm.c			\ +	pixman-ppc.c			\ +	pixman-edge.c			\ +	pixman-edge-accessors.c		\ +	pixman-fast-path.c		\ +	pixman-glyph.c			\ +	pixman-general.c		\ +	pixman-gradient-walker.c	\ +	pixman-image.c			\ +	pixman-implementation.c		\ +	pixman-linear-gradient.c	\ +	pixman-matrix.c			\ +	pixman-noop.c			\ +	pixman-radial-gradient.c	\ +	pixman-region16.c		\ +	pixman-region32.c		\ +	pixman-solid-fill.c		\ +	pixman-timer.c			\ +	pixman-trap.c			\ +	pixman-utils.c			\ +	$(NULL) + +libpixman_headers = \ +	dither/blue-noise-64x64.h	\ +	pixman.h			\ +	pixman-accessor.h		\ +	pixman-combine32.h		\ +	pixman-compiler.h		\ +	pixman-edge-imp.h		\ +	pixman-inlines.h		\ +	pixman-private.h		\ +	$(NULL) + +lib_LTLIBRARIES = libpixman-1.la +libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) \ +	-no-undefined @PTHREAD_LDFLAGS@ $(am__append_2) \ +	$(am__append_7) $(am__append_10) $(am__append_21) +libpixman_1_la_LIBADD = @PTHREAD_LIBS@ -lm $(am__append_3) \ +	$(am__append_5) $(am__append_8) $(am__append_11) \ +	$(am__append_13) 
$(am__append_15) $(am__append_17) \ +	$(am__append_19) $(am__append_22) +libpixman_1_la_SOURCES = $(libpixman_sources) $(libpixman_headers) +libpixmanincludedir = $(includedir)/pixman-1 +libpixmaninclude_HEADERS = pixman.h pixman-version.h +noinst_LTLIBRARIES = $(am__append_1) $(am__append_4) $(am__append_6) \ +	$(am__append_9) $(am__append_12) $(am__append_14) \ +	$(am__append_16) $(am__append_18) $(am__append_20) +EXTRA_DIST = \ +	Makefile.win32			\ +	dither/make-blue-noise.c	\ +	pixman-region.c			\ +	solaris-hwcap.mapfile		\ +	meson.build			\ +	$(NULL) + +@USE_X86_MMX_TRUE@libpixman_mmx_la_SOURCES = \ +@USE_X86_MMX_TRUE@	pixman-mmx.c + +@USE_X86_MMX_TRUE@libpixman_mmx_la_CFLAGS = $(MMX_CFLAGS) +@USE_X86_MMX_TRUE@ASM_CFLAGS_mmx = $(MMX_CFLAGS) +@USE_VMX_TRUE@libpixman_vmx_la_SOURCES = \ +@USE_VMX_TRUE@	pixman-vmx.c \ +@USE_VMX_TRUE@	pixman-combine32.h + +@USE_VMX_TRUE@libpixman_vmx_la_CFLAGS = $(VMX_CFLAGS) +@USE_VMX_TRUE@ASM_CFLAGS_vmx = $(VMX_CFLAGS) +@USE_SSE2_TRUE@libpixman_sse2_la_SOURCES = \ +@USE_SSE2_TRUE@	pixman-sse2.c + +@USE_SSE2_TRUE@libpixman_sse2_la_CFLAGS = $(SSE2_CFLAGS) +@USE_SSE2_TRUE@ASM_CFLAGS_sse2 = $(SSE2_CFLAGS) +@USE_SSSE3_TRUE@libpixman_ssse3_la_SOURCES = \ +@USE_SSSE3_TRUE@	pixman-ssse3.c + +@USE_SSSE3_TRUE@libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS) +@USE_SSSE3_TRUE@ASM_CFLAGS_ssse3 = $(SSSE3_CFLAGS) +@USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_SOURCES = \ +@USE_ARM_SIMD_TRUE@	pixman-arm-simd.c	\ +@USE_ARM_SIMD_TRUE@	pixman-arm-common.h	\ +@USE_ARM_SIMD_TRUE@	pixman-arm-simd-asm.S   \ +@USE_ARM_SIMD_TRUE@	pixman-arm-simd-asm-scaled.S \ +@USE_ARM_SIMD_TRUE@	pixman-arm-asm.h	\ +@USE_ARM_SIMD_TRUE@	pixman-arm-simd-asm.h + +@USE_ARM_SIMD_TRUE@ASM_CFLAGS_arm_simd =  +@USE_ARM_NEON_TRUE@libpixman_arm_neon_la_SOURCES = \ +@USE_ARM_NEON_TRUE@        pixman-arm-neon.c	\ +@USE_ARM_NEON_TRUE@        pixman-arm-common.h	\ +@USE_ARM_NEON_TRUE@        pixman-arm-neon-asm.S	\ +@USE_ARM_NEON_TRUE@		pixman-arm-neon-asm-bilinear.S \ +@USE_ARM_NEON_TRUE@        pixman-arm-asm.h	\ +@USE_ARM_NEON_TRUE@        pixman-arm-neon-asm.h + +@USE_ARM_NEON_TRUE@ASM_CFLAGS_arm_neon =  + +# iwmmxt code +@USE_ARM_IWMMXT_TRUE@libpixman_iwmmxt_la_SOURCES = pixman-mmx.c +@USE_ARM_IWMMXT_TRUE@libpixman_iwmmxt_la_DEPENDENCIES = $(am__DEPENDENCIES_1) +@USE_ARM_IWMMXT_TRUE@libpixman_iwmmxt_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ +@USE_ARM_IWMMXT_TRUE@        $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ +@USE_ARM_IWMMXT_TRUE@	$(CFLAGS) $(IWMMXT_CFLAGS) $(AM_LDFLAGS) \ +@USE_ARM_IWMMXT_TRUE@	$(LDFLAGS) -o $@ + +@USE_MIPS_DSPR2_TRUE@libpixman_mips_dspr2_la_SOURCES = \ +@USE_MIPS_DSPR2_TRUE@        pixman-mips-dspr2.c \ +@USE_MIPS_DSPR2_TRUE@        pixman-mips-dspr2.h \ +@USE_MIPS_DSPR2_TRUE@        pixman-mips-dspr2-asm.S \ +@USE_MIPS_DSPR2_TRUE@        pixman-mips-dspr2-asm.h \ +@USE_MIPS_DSPR2_TRUE@        pixman-mips-memcpy-asm.S + +@USE_MIPS_DSPR2_TRUE@ASM_CFLAGS_mips_dspr2 =  +@USE_LOONGSON_MMI_TRUE@libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h +@USE_LOONGSON_MMI_TRUE@libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS) +all: all-am + +.SUFFIXES: +.SUFFIXES: .S .c .lo .o .obj .s +$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am $(top_srcdir)/pixman/Makefile.sources $(am__configure_deps) +	@for dep in $?; do \ +	  case '$(am__configure_deps)' in \ +	    *$$dep*) \ +	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ +	        && { if test -f $@; then exit 0; else break; fi; }; \ +	      exit 1;; \ +	  esac; \ +	done; \ +	echo ' cd $(top_srcdir) && 
$(AUTOMAKE) --foreign pixman/Makefile'; \ +	$(am__cd) $(top_srcdir) && \ +	  $(AUTOMAKE) --foreign pixman/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status +	@case '$?' in \ +	  *config.status*) \ +	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ +	  *) \ +	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ +	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ +	esac; +$(top_srcdir)/pixman/Makefile.sources $(am__empty): + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) +	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure:  $(am__configure_deps) +	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4):  $(am__aclocal_m4_deps) +	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): +pixman-version.h: $(top_builddir)/config.status $(srcdir)/pixman-version.h.in +	cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ + +install-libLTLIBRARIES: $(lib_LTLIBRARIES) +	@$(NORMAL_INSTALL) +	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ +	list2=; for p in $$list; do \ +	  if test -f $$p; then \ +	    list2="$$list2 $$p"; \ +	  else :; fi; \ +	done; \ +	test -z "$$list2" || { \ +	  echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \ +	  $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \ +	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \ +	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \ +	} + +uninstall-libLTLIBRARIES: +	@$(NORMAL_UNINSTALL) +	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ +	for p in $$list; do \ +	  $(am__strip_dir) \ +	  echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \ +	  $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \ +	done + +clean-libLTLIBRARIES: +	-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES) +	@list='$(lib_LTLIBRARIES)'; \ +	locs=`for p in $$list; do echo $$p; done | \ +	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ +	      sort -u`; \ +	test -z "$$locs" || { \ +	  echo rm -f $${locs}; \ +	  rm -f $${locs}; \ +	} + +clean-noinstLTLIBRARIES: +	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) +	@list='$(noinst_LTLIBRARIES)'; \ +	locs=`for p in $$list; do echo $$p; done | \ +	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ +	      sort -u`; \ +	test -z "$$locs" || { \ +	  echo rm -f $${locs}; \ +	  rm -f $${locs}; \ +	} + +libpixman-1.la: $(libpixman_1_la_OBJECTS) $(libpixman_1_la_DEPENDENCIES) $(EXTRA_libpixman_1_la_DEPENDENCIES)  +	$(AM_V_CCLD)$(libpixman_1_la_LINK) -rpath $(libdir) $(libpixman_1_la_OBJECTS) $(libpixman_1_la_LIBADD) $(LIBS) + +libpixman-arm-neon.la: $(libpixman_arm_neon_la_OBJECTS) $(libpixman_arm_neon_la_DEPENDENCIES) $(EXTRA_libpixman_arm_neon_la_DEPENDENCIES)  +	$(AM_V_CCLD)$(LINK) $(am_libpixman_arm_neon_la_rpath) $(libpixman_arm_neon_la_OBJECTS) $(libpixman_arm_neon_la_LIBADD) $(LIBS) + +libpixman-arm-simd.la: $(libpixman_arm_simd_la_OBJECTS) $(libpixman_arm_simd_la_DEPENDENCIES) $(EXTRA_libpixman_arm_simd_la_DEPENDENCIES)  +	$(AM_V_CCLD)$(LINK) $(am_libpixman_arm_simd_la_rpath) $(libpixman_arm_simd_la_OBJECTS) $(libpixman_arm_simd_la_LIBADD) $(LIBS) + 
+@USE_ARM_IWMMXT_FALSE@libpixman-iwmmxt.la: $(libpixman_iwmmxt_la_OBJECTS) $(libpixman_iwmmxt_la_DEPENDENCIES) $(EXTRA_libpixman_iwmmxt_la_DEPENDENCIES)  +@USE_ARM_IWMMXT_FALSE@	$(AM_V_GEN)$(libpixman_iwmmxt_la_LINK) $(am_libpixman_iwmmxt_la_rpath) $(libpixman_iwmmxt_la_OBJECTS) $(libpixman_iwmmxt_la_LIBADD) $(LIBS) + +libpixman-loongson-mmi.la: $(libpixman_loongson_mmi_la_OBJECTS) $(libpixman_loongson_mmi_la_DEPENDENCIES) $(EXTRA_libpixman_loongson_mmi_la_DEPENDENCIES)  +	$(AM_V_CCLD)$(libpixman_loongson_mmi_la_LINK) $(am_libpixman_loongson_mmi_la_rpath) $(libpixman_loongson_mmi_la_OBJECTS) $(libpixman_loongson_mmi_la_LIBADD) $(LIBS) + +libpixman-mips-dspr2.la: $(libpixman_mips_dspr2_la_OBJECTS) $(libpixman_mips_dspr2_la_DEPENDENCIES) $(EXTRA_libpixman_mips_dspr2_la_DEPENDENCIES)  +	$(AM_V_CCLD)$(LINK) $(am_libpixman_mips_dspr2_la_rpath) $(libpixman_mips_dspr2_la_OBJECTS) $(libpixman_mips_dspr2_la_LIBADD) $(LIBS) + +libpixman-mmx.la: $(libpixman_mmx_la_OBJECTS) $(libpixman_mmx_la_DEPENDENCIES) $(EXTRA_libpixman_mmx_la_DEPENDENCIES)  +	$(AM_V_CCLD)$(libpixman_mmx_la_LINK) $(am_libpixman_mmx_la_rpath) $(libpixman_mmx_la_OBJECTS) $(libpixman_mmx_la_LIBADD) $(LIBS) + +libpixman-sse2.la: $(libpixman_sse2_la_OBJECTS) $(libpixman_sse2_la_DEPENDENCIES) $(EXTRA_libpixman_sse2_la_DEPENDENCIES)  +	$(AM_V_CCLD)$(libpixman_sse2_la_LINK) $(am_libpixman_sse2_la_rpath) $(libpixman_sse2_la_OBJECTS) $(libpixman_sse2_la_LIBADD) $(LIBS) + +libpixman-ssse3.la: $(libpixman_ssse3_la_OBJECTS) $(libpixman_ssse3_la_DEPENDENCIES) $(EXTRA_libpixman_ssse3_la_DEPENDENCIES)  +	$(AM_V_CCLD)$(libpixman_ssse3_la_LINK) $(am_libpixman_ssse3_la_rpath) $(libpixman_ssse3_la_OBJECTS) $(libpixman_ssse3_la_LIBADD) $(LIBS) + +libpixman-vmx.la: $(libpixman_vmx_la_OBJECTS) $(libpixman_vmx_la_DEPENDENCIES) $(EXTRA_libpixman_vmx_la_DEPENDENCIES)  +	$(AM_V_CCLD)$(libpixman_vmx_la_LINK) $(am_libpixman_vmx_la_rpath) $(libpixman_vmx_la_OBJECTS) $(libpixman_vmx_la_LIBADD) $(LIBS) + +mostlyclean-compile: +	-rm -f *.$(OBJEXT) + +distclean-compile: +	-rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_loongson_mmi_la-pixman-mmx.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_mmx_la-pixman-mmx.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_sse2_la-pixman-sse2.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_vmx_la-pixman-vmx.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-access-accessors.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-access.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-arm-neon-asm-bilinear.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-arm-neon-asm.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-arm-neon.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-arm-simd-asm-scaled.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-arm-simd-asm.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-arm-simd.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/pixman-arm.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-bits-image.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-combine-float.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-combine32.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-conical-gradient.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-edge-accessors.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-edge.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-fast-path.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-filter.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-general.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-glyph.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-gradient-walker.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-image.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-implementation.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-linear-gradient.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-matrix.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-mips-dspr2-asm.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-mips-dspr2.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-mips-memcpy-asm.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-mips.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-mmx.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-noop.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-ppc.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-radial-gradient.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-region16.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-region32.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-solid-fill.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-timer.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-trap.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-utils.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-x86.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman.Plo@am__quote@ # am--include-marker + +$(am__depfiles_remade): +	@$(MKDIR_P) $(@D) +	@echo '# dummy' >$@-t && $(am__mv) $@-t $@ + +am--depfiles: $(am__depfiles_remade) + +.S.o: +@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo 
$(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ $< + +.S.obj: +@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.S.lo: +@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(LTCPPASCOMPILE) -c -o $@ $< + +.c.o: +@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $< + +.c.obj: +@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@	$(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< + +libpixman_loongson_mmi_la-pixman-mmx.lo: pixman-mmx.c +@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_loongson_mmi_la_CFLAGS) $(CFLAGS) -MT libpixman_loongson_mmi_la-pixman-mmx.lo -MD -MP -MF $(DEPDIR)/libpixman_loongson_mmi_la-pixman-mmx.Tpo -c -o libpixman_loongson_mmi_la-pixman-mmx.lo `test -f 'pixman-mmx.c' || echo '$(srcdir)/'`pixman-mmx.c +@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_loongson_mmi_la-pixman-mmx.Tpo $(DEPDIR)/libpixman_loongson_mmi_la-pixman-mmx.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pixman-mmx.c' object='libpixman_loongson_mmi_la-pixman-mmx.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) 
--tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_loongson_mmi_la_CFLAGS) $(CFLAGS) -c -o libpixman_loongson_mmi_la-pixman-mmx.lo `test -f 'pixman-mmx.c' || echo '$(srcdir)/'`pixman-mmx.c + +libpixman_mmx_la-pixman-mmx.lo: pixman-mmx.c +@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_mmx_la_CFLAGS) $(CFLAGS) -MT libpixman_mmx_la-pixman-mmx.lo -MD -MP -MF $(DEPDIR)/libpixman_mmx_la-pixman-mmx.Tpo -c -o libpixman_mmx_la-pixman-mmx.lo `test -f 'pixman-mmx.c' || echo '$(srcdir)/'`pixman-mmx.c +@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_mmx_la-pixman-mmx.Tpo $(DEPDIR)/libpixman_mmx_la-pixman-mmx.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pixman-mmx.c' object='libpixman_mmx_la-pixman-mmx.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_mmx_la_CFLAGS) $(CFLAGS) -c -o libpixman_mmx_la-pixman-mmx.lo `test -f 'pixman-mmx.c' || echo '$(srcdir)/'`pixman-mmx.c + +libpixman_sse2_la-pixman-sse2.lo: pixman-sse2.c +@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_sse2_la_CFLAGS) $(CFLAGS) -MT libpixman_sse2_la-pixman-sse2.lo -MD -MP -MF $(DEPDIR)/libpixman_sse2_la-pixman-sse2.Tpo -c -o libpixman_sse2_la-pixman-sse2.lo `test -f 'pixman-sse2.c' || echo '$(srcdir)/'`pixman-sse2.c +@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_sse2_la-pixman-sse2.Tpo $(DEPDIR)/libpixman_sse2_la-pixman-sse2.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pixman-sse2.c' object='libpixman_sse2_la-pixman-sse2.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_sse2_la_CFLAGS) $(CFLAGS) -c -o libpixman_sse2_la-pixman-sse2.lo `test -f 'pixman-sse2.c' || echo '$(srcdir)/'`pixman-sse2.c + +libpixman_ssse3_la-pixman-ssse3.lo: pixman-ssse3.c +@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_ssse3_la_CFLAGS) $(CFLAGS) -MT libpixman_ssse3_la-pixman-ssse3.lo -MD -MP -MF $(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Tpo -c -o libpixman_ssse3_la-pixman-ssse3.lo `test -f 'pixman-ssse3.c' || echo '$(srcdir)/'`pixman-ssse3.c +@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Tpo $(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pixman-ssse3.c' object='libpixman_ssse3_la-pixman-ssse3.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) 
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_ssse3_la_CFLAGS) $(CFLAGS) -c -o libpixman_ssse3_la-pixman-ssse3.lo `test -f 'pixman-ssse3.c' || echo '$(srcdir)/'`pixman-ssse3.c + +libpixman_vmx_la-pixman-vmx.lo: pixman-vmx.c +@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_vmx_la_CFLAGS) $(CFLAGS) -MT libpixman_vmx_la-pixman-vmx.lo -MD -MP -MF $(DEPDIR)/libpixman_vmx_la-pixman-vmx.Tpo -c -o libpixman_vmx_la-pixman-vmx.lo `test -f 'pixman-vmx.c' || echo '$(srcdir)/'`pixman-vmx.c +@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_vmx_la-pixman-vmx.Tpo $(DEPDIR)/libpixman_vmx_la-pixman-vmx.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pixman-vmx.c' object='libpixman_vmx_la-pixman-vmx.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_vmx_la_CFLAGS) $(CFLAGS) -c -o libpixman_vmx_la-pixman-vmx.lo `test -f 'pixman-vmx.c' || echo '$(srcdir)/'`pixman-vmx.c + +mostlyclean-libtool: +	-rm -f *.lo + +clean-libtool: +	-rm -rf .libs _libs +install-libpixmanincludeHEADERS: $(libpixmaninclude_HEADERS) +	@$(NORMAL_INSTALL) +	@list='$(libpixmaninclude_HEADERS)'; test -n "$(libpixmanincludedir)" || list=; \ +	if test -n "$$list"; then \ +	  echo " $(MKDIR_P) '$(DESTDIR)$(libpixmanincludedir)'"; \ +	  $(MKDIR_P) "$(DESTDIR)$(libpixmanincludedir)" || exit 1; \ +	fi; \ +	for p in $$list; do \ +	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ +	  echo "$$d$$p"; \ +	done | $(am__base_list) | \ +	while read files; do \ +	  echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libpixmanincludedir)'"; \ +	  $(INSTALL_HEADER) $$files "$(DESTDIR)$(libpixmanincludedir)" || exit $$?; \ +	done + +uninstall-libpixmanincludeHEADERS: +	@$(NORMAL_UNINSTALL) +	@list='$(libpixmaninclude_HEADERS)'; test -n "$(libpixmanincludedir)" || list=; \ +	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ +	dir='$(DESTDIR)$(libpixmanincludedir)'; $(am__uninstall_files_from_dir) + +ID: $(am__tagged_files) +	$(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) +	set x; \ +	here=`pwd`; \ +	$(am__define_uniq_tagged_files); \ +	shift; \ +	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ +	  test -n "$$unique" || unique=$$empty_fix; \ +	  if test $$# -gt 0; then \ +	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ +	      "$$@" $$unique; \ +	  else \ +	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ +	      $$unique; \ +	  fi; \ +	fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) +	$(am__define_uniq_tagged_files); \ +	test -z "$(CTAGS_ARGS)$$unique" \ +	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ +	     $$unique + +GTAGS: +	here=`$(am__cd) $(top_builddir) && pwd` \ +	  && $(am__cd) $(top_srcdir) \ +	  && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) +	list='$(am__tagged_files)'; \ +	case "$(srcdir)" in \ +	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ +	  *) sdir=$(subdir)/$(srcdir) ;; \ +	esac; \ +	for i in $$list; do \ +	  
if test -f "$$i"; then \ +	    echo "$(subdir)/$$i"; \ +	  else \ +	    echo "$$sdir/$$i"; \ +	  fi; \ +	done >> $(top_builddir)/cscope.files + +distclean-tags: +	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(BUILT_SOURCES) +	$(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) +	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ +	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ +	list='$(DISTFILES)'; \ +	  dist_files=`for file in $$list; do echo $$file; done | \ +	  sed -e "s|^$$srcdirstrip/||;t" \ +	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ +	case $$dist_files in \ +	  */*) $(MKDIR_P) `echo "$$dist_files" | \ +			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ +			   sort -u` ;; \ +	esac; \ +	for file in $$dist_files; do \ +	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ +	  if test -d $$d/$$file; then \ +	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ +	    if test -d "$(distdir)/$$file"; then \ +	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ +	    fi; \ +	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ +	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ +	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ +	    fi; \ +	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ +	  else \ +	    test -f "$(distdir)/$$file" \ +	    || cp -p $$d/$$file "$(distdir)/$$file" \ +	    || exit 1; \ +	  fi; \ +	done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) $(HEADERS) +installdirs: +	for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(libpixmanincludedir)"; do \ +	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \ +	done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am +	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: +	if test -z '$(STRIP)'; then \ +	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ +	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ +	      install; \ +	else \ +	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ +	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ +	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ +	fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: +	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) +	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: +	@echo "This command is intended for maintainers to use" +	@echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \ +	clean-noinstLTLIBRARIES mostlyclean-am + +distclean: distclean-am +		-rm -f ./$(DEPDIR)/libpixman_loongson_mmi_la-pixman-mmx.Plo +	-rm -f ./$(DEPDIR)/libpixman_mmx_la-pixman-mmx.Plo +	-rm -f ./$(DEPDIR)/libpixman_sse2_la-pixman-sse2.Plo +	-rm -f ./$(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Plo +	-rm -f ./$(DEPDIR)/libpixman_vmx_la-pixman-vmx.Plo +	-rm -f ./$(DEPDIR)/pixman-access-accessors.Plo +	-rm -f ./$(DEPDIR)/pixman-access.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-neon-asm-bilinear.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-neon-asm.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-neon.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-simd-asm-scaled.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-simd-asm.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-simd.Plo +	-rm -f ./$(DEPDIR)/pixman-arm.Plo +	-rm -f ./$(DEPDIR)/pixman-bits-image.Plo +	-rm -f ./$(DEPDIR)/pixman-combine-float.Plo +	-rm -f ./$(DEPDIR)/pixman-combine32.Plo +	-rm -f ./$(DEPDIR)/pixman-conical-gradient.Plo +	-rm -f ./$(DEPDIR)/pixman-edge-accessors.Plo +	-rm -f ./$(DEPDIR)/pixman-edge.Plo +	-rm -f ./$(DEPDIR)/pixman-fast-path.Plo +	-rm -f ./$(DEPDIR)/pixman-filter.Plo +	-rm -f ./$(DEPDIR)/pixman-general.Plo +	-rm -f ./$(DEPDIR)/pixman-glyph.Plo +	-rm -f ./$(DEPDIR)/pixman-gradient-walker.Plo +	-rm -f ./$(DEPDIR)/pixman-image.Plo +	-rm -f ./$(DEPDIR)/pixman-implementation.Plo +	-rm -f ./$(DEPDIR)/pixman-linear-gradient.Plo +	-rm -f ./$(DEPDIR)/pixman-matrix.Plo +	-rm -f ./$(DEPDIR)/pixman-mips-dspr2-asm.Plo +	-rm -f ./$(DEPDIR)/pixman-mips-dspr2.Plo +	-rm -f ./$(DEPDIR)/pixman-mips-memcpy-asm.Plo +	-rm -f ./$(DEPDIR)/pixman-mips.Plo +	-rm -f ./$(DEPDIR)/pixman-mmx.Plo +	-rm -f ./$(DEPDIR)/pixman-noop.Plo +	-rm -f ./$(DEPDIR)/pixman-ppc.Plo +	-rm -f ./$(DEPDIR)/pixman-radial-gradient.Plo +	-rm -f ./$(DEPDIR)/pixman-region16.Plo +	-rm -f ./$(DEPDIR)/pixman-region32.Plo +	-rm -f ./$(DEPDIR)/pixman-solid-fill.Plo +	-rm -f ./$(DEPDIR)/pixman-timer.Plo +	-rm -f ./$(DEPDIR)/pixman-trap.Plo +	-rm -f ./$(DEPDIR)/pixman-utils.Plo +	-rm -f ./$(DEPDIR)/pixman-x86.Plo +	-rm -f ./$(DEPDIR)/pixman.Plo +	-rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ +	distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: install-libpixmanincludeHEADERS + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: install-libLTLIBRARIES + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am +		-rm -f ./$(DEPDIR)/libpixman_loongson_mmi_la-pixman-mmx.Plo +	-rm -f ./$(DEPDIR)/libpixman_mmx_la-pixman-mmx.Plo +	-rm -f ./$(DEPDIR)/libpixman_sse2_la-pixman-sse2.Plo +	-rm -f ./$(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Plo +	-rm -f ./$(DEPDIR)/libpixman_vmx_la-pixman-vmx.Plo +	-rm -f ./$(DEPDIR)/pixman-access-accessors.Plo +	-rm -f ./$(DEPDIR)/pixman-access.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-neon-asm-bilinear.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-neon-asm.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-neon.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-simd-asm-scaled.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-simd-asm.Plo +	-rm -f ./$(DEPDIR)/pixman-arm-simd.Plo +	-rm -f ./$(DEPDIR)/pixman-arm.Plo +	-rm -f ./$(DEPDIR)/pixman-bits-image.Plo +	-rm -f ./$(DEPDIR)/pixman-combine-float.Plo +	-rm -f ./$(DEPDIR)/pixman-combine32.Plo +	-rm -f 
./$(DEPDIR)/pixman-conical-gradient.Plo +	-rm -f ./$(DEPDIR)/pixman-edge-accessors.Plo +	-rm -f ./$(DEPDIR)/pixman-edge.Plo +	-rm -f ./$(DEPDIR)/pixman-fast-path.Plo +	-rm -f ./$(DEPDIR)/pixman-filter.Plo +	-rm -f ./$(DEPDIR)/pixman-general.Plo +	-rm -f ./$(DEPDIR)/pixman-glyph.Plo +	-rm -f ./$(DEPDIR)/pixman-gradient-walker.Plo +	-rm -f ./$(DEPDIR)/pixman-image.Plo +	-rm -f ./$(DEPDIR)/pixman-implementation.Plo +	-rm -f ./$(DEPDIR)/pixman-linear-gradient.Plo +	-rm -f ./$(DEPDIR)/pixman-matrix.Plo +	-rm -f ./$(DEPDIR)/pixman-mips-dspr2-asm.Plo +	-rm -f ./$(DEPDIR)/pixman-mips-dspr2.Plo +	-rm -f ./$(DEPDIR)/pixman-mips-memcpy-asm.Plo +	-rm -f ./$(DEPDIR)/pixman-mips.Plo +	-rm -f ./$(DEPDIR)/pixman-mmx.Plo +	-rm -f ./$(DEPDIR)/pixman-noop.Plo +	-rm -f ./$(DEPDIR)/pixman-ppc.Plo +	-rm -f ./$(DEPDIR)/pixman-radial-gradient.Plo +	-rm -f ./$(DEPDIR)/pixman-region16.Plo +	-rm -f ./$(DEPDIR)/pixman-region32.Plo +	-rm -f ./$(DEPDIR)/pixman-solid-fill.Plo +	-rm -f ./$(DEPDIR)/pixman-timer.Plo +	-rm -f ./$(DEPDIR)/pixman-trap.Plo +	-rm -f ./$(DEPDIR)/pixman-utils.Plo +	-rm -f ./$(DEPDIR)/pixman-x86.Plo +	-rm -f ./$(DEPDIR)/pixman.Plo +	-rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ +	mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-libLTLIBRARIES \ +	uninstall-libpixmanincludeHEADERS + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ +	clean-generic clean-libLTLIBRARIES clean-libtool \ +	clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \ +	distclean-compile distclean-generic distclean-libtool \ +	distclean-tags distdir dvi dvi-am html html-am info info-am \ +	install install-am install-data install-data-am install-dvi \ +	install-dvi-am install-exec install-exec-am install-html \ +	install-html-am install-info install-info-am \ +	install-libLTLIBRARIES install-libpixmanincludeHEADERS \ +	install-man install-pdf install-pdf-am install-ps \ +	install-ps-am install-strip installcheck installcheck-am \ +	installdirs maintainer-clean maintainer-clean-generic \ +	mostlyclean mostlyclean-compile mostlyclean-generic \ +	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \ +	uninstall-am uninstall-libLTLIBRARIES \ +	uninstall-libpixmanincludeHEADERS + +.PRECIOUS: Makefile + + +@USE_ARM_IWMMXT_TRUE@libpixman_iwmmxt_la-pixman-mmx.lo: pixman-mmx.c +@USE_ARM_IWMMXT_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(AM_CPPFLAGS) $(AM_CPPFLAGS) $(CPPFLAGS) $(CFLAGS) $(IWMMXT_CFLAGS) -MT libpixman_iwmmxt_la-pixman-mmx.lo -MD -MP -MF $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Tpo -c -o libpixman_iwmmxt_la-pixman-mmx.lo `test -f 'pixman-mmx.c' || echo '$(srcdir)/'`pixman-mmx.c +@USE_ARM_IWMMXT_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Tpo $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Plo + +@USE_ARM_IWMMXT_TRUE@libpixman-iwmmxt.la: libpixman_iwmmxt_la-pixman-mmx.lo $(libpixman_iwmmxt_la_DEPENDENCIES)  +@USE_ARM_IWMMXT_TRUE@	$(AM_V_CCLD)$(libpixman_iwmmxt_la_LINK) libpixman_iwmmxt_la-pixman-mmx.lo $(libpixman_iwmmxt_la_LIBADD) $(LIBS) + +.c.s : $(libpixmaninclude_HEADERS) +	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $< + +# Tell versions [3.59,3.63) of GNU make 
to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/libs/pixman-0.40.0/pixman/Makefile.sources b/libs/pixman-0.40.0/pixman/Makefile.sources new file mode 100644 index 0000000..23d1d97 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/Makefile.sources @@ -0,0 +1,43 @@ +libpixman_sources =			\ +	pixman.c			\ +	pixman-access.c			\ +	pixman-access-accessors.c	\ +	pixman-bits-image.c		\ +	pixman-combine32.c		\ +	pixman-combine-float.c		\ +	pixman-conical-gradient.c	\ +	pixman-filter.c			\ +	pixman-x86.c			\ +	pixman-mips.c			\ +	pixman-arm.c			\ +	pixman-ppc.c			\ +	pixman-edge.c			\ +	pixman-edge-accessors.c		\ +	pixman-fast-path.c		\ +	pixman-glyph.c			\ +	pixman-general.c		\ +	pixman-gradient-walker.c	\ +	pixman-image.c			\ +	pixman-implementation.c		\ +	pixman-linear-gradient.c	\ +	pixman-matrix.c			\ +	pixman-noop.c			\ +	pixman-radial-gradient.c	\ +	pixman-region16.c		\ +	pixman-region32.c		\ +	pixman-solid-fill.c		\ +	pixman-timer.c			\ +	pixman-trap.c			\ +	pixman-utils.c			\ +	$(NULL) + +libpixman_headers =			\ +	dither/blue-noise-64x64.h	\ +	pixman.h			\ +	pixman-accessor.h		\ +	pixman-combine32.h		\ +	pixman-compiler.h		\ +	pixman-edge-imp.h		\ +	pixman-inlines.h		\ +	pixman-private.h		\ +	$(NULL) diff --git a/libs/pixman-0.40.0/pixman/Makefile.win32 b/libs/pixman-0.40.0/pixman/Makefile.win32 new file mode 100644 index 0000000..7b64033 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/Makefile.win32 @@ -0,0 +1,93 @@ +default: all + +top_srcdir = .. +include $(top_srcdir)/pixman/Makefile.sources +include $(top_srcdir)/Makefile.win32.common + +MMX_VAR = $(MMX) +ifeq ($(MMX_VAR),) +MMX_VAR=on +endif + +SSE2_VAR = $(SSE2) +ifeq ($(SSE2_VAR),) +SSE2_VAR=on +endif + +SSSE3_VAR = $(SSSE3) +ifeq ($(SSSE3_VAR),) +SSSE3_VAR=on +endif + +MMX_CFLAGS = -DUSE_X86_MMX -w14710 -w14714 +SSE2_CFLAGS = -DUSE_SSE2 +SSSE3_CFLAGS = -DUSE_SSSE3 + +# MMX compilation flags +ifeq ($(MMX_VAR),on) +PIXMAN_CFLAGS += $(MMX_CFLAGS) +libpixman_sources += pixman-mmx.c +endif + +# SSE2 compilation flags +ifeq ($(SSE2_VAR),on) +PIXMAN_CFLAGS += $(SSE2_CFLAGS) +libpixman_sources += pixman-sse2.c +endif + +# SSSE3 compilation flags +ifeq ($(SSSE3_VAR),on) +PIXMAN_CFLAGS += $(SSSE3_CFLAGS) +libpixman_sources += pixman-ssse3.c +endif + +OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libpixman_sources)) + +# targets +all: inform informMMX informSSE2 informSSSE3 $(CFG_VAR)/$(LIBRARY).lib + +informMMX: +ifneq ($(MMX),off) +ifneq ($(MMX),on) +ifneq ($(MMX),) +	@echo "Invalid specified MMX option : "$(MMX_VAR)"." +	@echo +	@echo "Possible choices for MMX are 'on' or 'off'" +	@exit 1 +endif +	@echo "Setting MMX flag to default value 'on'... (use MMX=on or MMX=off)" +endif +endif + +informSSE2: +ifneq ($(SSE2),off) +ifneq ($(SSE2),on) +ifneq ($(SSE2),) +	@echo "Invalid specified SSE option : "$(SSE2)"." +	@echo +	@echo "Possible choices for SSE2 are 'on' or 'off'" +	@exit 1 +endif +	@echo "Setting SSE2 flag to default value 'on'... (use SSE2=on or SSE2=off)" +endif +endif + +informSSSE3: +ifneq ($(SSSE3),off) +ifneq ($(SSSE3),on) +ifneq ($(SSSE3),) +	@echo "Invalid specified SSE option : "$(SSSE3)"." +	@echo +	@echo "Possible choices for SSSE3 are 'on' or 'off'" +	@exit 1 +endif +	@echo "Setting SSSE3 flag to default value 'on'... 
(use SSSE3=on or SSSE3=off)" +endif +endif + + +# pixman linking +$(CFG_VAR)/$(LIBRARY).lib: $(OBJECTS) +	@$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^ + +.PHONY: all informMMX informSSE2 informSSSE3 diff --git a/libs/pixman-0.40.0/pixman/dither/blue-noise-64x64.h b/libs/pixman-0.40.0/pixman/dither/blue-noise-64x64.h new file mode 100644 index 0000000..93c8805 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/dither/blue-noise-64x64.h @@ -0,0 +1,77 @@ +/* WARNING: This file is generated by make-blue-noise.c + * Please edit that file instead of this one. + */ + +#ifndef BLUE_NOISE_64X64_H +#define BLUE_NOISE_64X64_H + +#include <stdint.h> + +static const uint16_t dither_blue_noise_64x64[4096] = { +    3039, 1368, 3169, 103, 2211, 1248, 2981, 668, 2633, 37, 3963, 2903, 384, 2564, 3115, 1973, 3348, 830, 2505, 1293, 3054, 1060, 1505, 3268, 400, 1341, 593, 3802, 3384, 429, 4082, 1411, 2503, 3863, 126, 1292, 1887, 2855, 205, 2094, 2977, 1899, 3924, 356, 3088, 2500, 3942, 1409, 2293, 1734, 3732, 1291, 3227, 277, 2054, 786, 2871, 411, 2425, 1678, 3986, 455, 2879, 2288, +    388, 1972, 3851, 778, 2768, 3697, 944, 2123, 1501, 3533, 937, 1713, 1381, 3888, 156, 1242, 516, 2888, 1607, 3676, 632, 2397, 3804, 2673, 1898, 3534, 2593, 1777, 1170, 2299, 3013, 1838, 523, 3053, 1647, 3601, 3197, 959, 1520, 3633, 893, 2437, 3367, 2187, 1258, 137, 1965, 401, 3546, 643, 3087, 2498, 733, 2786, 3371, 4053, 1266, 1977, 3663, 183, 2570, 2107, 1183, 3708, +    907, 2473, 1151, 3363, 1527, 1902, 232, 3903, 3060, 496, 2486, 3206, 2165, 861, 2387, 3653, 2101, 3972, 132, 2162, 3437, 1827, 215, 895, 3114, 271, 969, 2932, 197, 1598, 878, 3696, 1140, 2120, 904, 2431, 302, 3846, 2675, 481, 3187, 66, 1440, 650, 3833, 2826, 3435, 901, 2936, 2111, 250, 1875, 3609, 1174, 1747, 162, 2346, 3420, 913, 3172, 1383, 752, 3298, 1735, +    3540, 2938, 249, 2324, 526, 3099, 2561, 1324, 2347, 1861, 1200, 3702, 257, 3442, 1514, 2999, 992, 1766, 2735, 1163, 478, 2943, 1279, 3635, 2177, 1464, 3672, 2386, 3871, 3340, 2690, 64, 3489, 2811, 3999, 633, 1948, 1243, 2269, 1807, 1143, 2750, 3729, 1790, 2363, 1053, 1537, 2636, 4065, 1076, 1476, 3869, 450, 2200, 2676, 658, 2979, 1548, 544, 1913, 2838, 3911, 116, 2698, +    517, 1295, 3997, 1739, 3665, 1083, 3509, 599, 3400, 118, 2956, 720, 2689, 1907, 567, 2523, 284, 3397, 711, 3219, 2450, 3985, 1665, 2549, 562, 3011, 1855, 729, 1355, 528, 1908, 2456, 1384, 337, 1540, 2654, 3138, 3513, 703, 4080, 3314, 2047, 855, 3037, 209, 3317, 577, 1828, 17, 2336, 3193, 2748, 962, 3441, 1450, 3246, 1075, 3878, 2615, 3497, 1033, 2310, 1442, 2183, +    1654, 3254, 2061, 738, 2832, 148, 2030, 1670, 909, 3850, 2109, 1533, 4046, 1085, 3098, 3897, 1378, 2248, 3829, 1495, 1966, 23, 797, 3427, 1124, 4057, 95, 2787, 2190, 3074, 3950, 742, 3194, 1999, 3386, 1113, 16, 1657, 2804, 201, 1543, 383, 2559, 1325, 3604, 2068, 2493, 3771, 1284, 3460, 710, 1716, 2447, 80, 3811, 2032, 347, 2227, 15, 1689, 397, 3084, 662, 3798, +    973, 43, 2608, 3143, 1459, 2423, 4066, 2770, 3191, 1283, 2630, 314, 3235, 2289, 72, 1822, 2840, 924, 350, 2653, 1057, 3715, 2235, 2775, 346, 2083, 1553, 3292, 1081, 274, 1686, 1188, 2327, 3743, 578, 2234, 3916, 2519, 1011, 3056, 2207, 3438, 3890, 537, 1617, 837, 3094, 373, 2795, 1980, 276, 3951, 1353, 3015, 844, 1724, 3651, 2923, 1316, 4092, 2504, 3627, 1936, 2854, +    2461, 3929, 1193, 421, 3746, 820, 1180, 286, 2261, 532, 3625, 1812, 802, 1327, 3527, 670, 3730, 2025, 3124, 3565, 529, 2960, 1769, 1390, 3196, 2494, 3756, 796, 3618, 2602, 3463, 2847, 166, 953, 1745, 2900, 438, 2070, 1418, 3741, 639, 1205, 1891, 2882, 
2282, 4012, 1182, 1696, 3630, 951, 2904, 2170, 3530, 375, 2320, 2742, 1132, 701, 3216, 2023, 847, 1230, 310, 3431, +    770, 1961, 3531, 1702, 2181, 3370, 1877, 3072, 1571, 3389, 1071, 2415, 3782, 2803, 1610, 2454, 1211, 182, 1655, 2322, 1282, 3372, 287, 3935, 704, 1232, 415, 1910, 2286, 1399, 556, 1964, 4068, 2444, 3605, 1272, 3345, 816, 3526, 256, 2402, 2777, 955, 345, 3289, 111, 2727, 635, 2396, 1488, 3331, 600, 1032, 1575, 4026, 515, 3507, 2433, 1605, 460, 3364, 2783, 1810, 1397, +    2334, 223, 2945, 688, 2533, 99, 2705, 624, 3944, 2073, 46, 2978, 508, 2132, 269, 3173, 3453, 2631, 4076, 694, 1892, 2586, 972, 2178, 3470, 1695, 2849, 3141, 77, 3884, 994, 3029, 1536, 673, 3083, 124, 2583, 1722, 2821, 1944, 4027, 1661, 3176, 3728, 1337, 1813, 3503, 2035, 3930, 157, 2537, 1865, 3096, 2646, 1941, 3252, 1449, 135, 2836, 3758, 2139, 84, 3678, 3106, +    3862, 1545, 3307, 1320, 3955, 1031, 3664, 1306, 2460, 776, 1487, 3294, 1187, 3990, 1903, 1021, 549, 1484, 943, 3027, 97, 3853, 1499, 2880, 198, 2575, 3995, 1089, 1587, 2475, 3282, 339, 2657, 1158, 2105, 1493, 3943, 580, 3232, 1287, 846, 48, 2480, 2112, 771, 2534, 459, 3134, 850, 1298, 3790, 325, 3652, 1249, 193, 940, 2202, 3895, 1829, 911, 1366, 2577, 1069, 534, +    2104, 1009, 2667, 392, 1983, 2917, 1645, 324, 3439, 2869, 3705, 1767, 2592, 756, 2916, 3683, 2276, 2850, 2053, 3594, 2403, 3181, 634, 3699, 1933, 906, 519, 2150, 3673, 764, 1770, 2220, 3795, 3336, 502, 3547, 2339, 1110, 301, 2210, 3354, 3643, 569, 1518, 2940, 3973, 1138, 1613, 2773, 2127, 2983, 1671, 769, 2161, 3800, 2730, 3127, 1179, 533, 3259, 2284, 4014, 1651, 2820, +    3566, 653, 1839, 3455, 2399, 789, 3149, 2244, 1863, 1099, 474, 2307, 158, 3541, 1312, 1711, 0, 3902, 360, 1629, 1091, 395, 1781, 1191, 2374, 3353, 1419, 3225, 206, 2931, 3553, 1046, 54, 1646, 2470, 910, 1860, 3137, 3770, 2635, 1562, 2809, 1215, 3788, 222, 2199, 3335, 67, 3606, 524, 1001, 3309, 2410, 3473, 591, 1619, 291, 2502, 3629, 2891, 335, 741, 3378, 168, +    2384, 3129, 4051, 22, 1444, 3613, 543, 3893, 186, 2665, 4062, 933, 3058, 2142, 449, 2711, 3224, 849, 1330, 3349, 2195, 2670, 3484, 2993, 32, 3774, 2722, 1859, 2548, 1268, 583, 2027, 3165, 2807, 4029, 227, 2897, 1434, 721, 1816, 195, 905, 2066, 3258, 1754, 970, 2674, 1880, 2338, 3915, 1485, 2660, 14, 1313, 2914, 2046, 4074, 791, 1917, 1301, 1725, 2687, 2019, 1443, +    418, 1186, 1664, 2859, 1049, 2056, 2741, 1226, 1589, 3186, 2042, 1377, 3449, 1574, 3941, 1063, 1930, 2501, 3751, 2930, 671, 4031, 888, 2081, 1544, 684, 1117, 351, 4052, 1698, 2393, 3881, 1439, 785, 1277, 2013, 3488, 441, 2459, 3980, 3061, 3481, 2543, 419, 3020, 609, 3515, 1350, 799, 2878, 348, 2034, 3966, 1824, 950, 3281, 1394, 2239, 3452, 55, 3922, 3119, 892, 3785, +    3023, 2140, 782, 2492, 3817, 241, 3355, 2424, 856, 3639, 612, 2556, 245, 2858, 705, 2316, 3562, 495, 1748, 128, 1912, 1454, 280, 2552, 3905, 3130, 2274, 3472, 834, 3055, 240, 2692, 471, 2272, 3301, 2632, 1080, 3693, 2136, 1029, 1364, 590, 1611, 4067, 1190, 2360, 3827, 261, 3180, 1768, 3471, 1103, 3003, 520, 3674, 151, 2571, 555, 3033, 982, 2353, 504, 1259, 2555, +    149, 3889, 3380, 493, 3178, 1681, 663, 1924, 2990, 49, 1792, 3861, 1192, 1987, 3273, 297, 1457, 3043, 1177, 2292, 3249, 2829, 3682, 1154, 1758, 428, 2872, 1993, 1500, 3703, 1129, 3421, 1840, 3754, 163, 659, 1733, 3182, 38, 2875, 1957, 3614, 2237, 78, 1873, 2801, 1513, 2121, 1074, 2516, 667, 3710, 1429, 2430, 2088, 2830, 1072, 3557, 1531, 2733, 1955, 3286, 3590, 1826, +    2778, 1068, 1932, 1452, 2279, 1185, 3564, 3952, 1391, 2726, 3313, 2331, 870, 3709, 
1674, 2772, 4085, 808, 2596, 3848, 927, 538, 2335, 3334, 773, 3597, 1347, 109, 2663, 608, 2108, 2994, 936, 1524, 2922, 3968, 2422, 1467, 845, 3870, 321, 2704, 1073, 3308, 3680, 823, 430, 3375, 4030, 112, 2171, 2695, 267, 3374, 731, 1627, 3919, 1871, 352, 3839, 1370, 234, 794, 1532, +    3245, 647, 3575, 74, 3045, 2766, 285, 2174, 498, 1059, 1551, 385, 3125, 2598, 143, 1128, 2095, 3395, 318, 1590, 3524, 1345, 1969, 242, 2759, 2092, 947, 3926, 3244, 2356, 1658, 6, 3593, 2554, 1172, 1995, 371, 2755, 3417, 2294, 1570, 3164, 748, 2517, 1401, 3111, 2420, 1662, 2910, 1276, 3276, 854, 1804, 4000, 1253, 2987, 229, 2344, 3184, 649, 2196, 2921, 4095, 2389, +    1289, 2193, 2579, 4023, 757, 1858, 986, 3199, 2514, 3475, 4021, 2154, 651, 1432, 3468, 2404, 574, 1799, 3105, 2145, 86, 2614, 3218, 1565, 4088, 2481, 3079, 1815, 323, 1212, 3837, 759, 2159, 435, 3223, 784, 3659, 1114, 1888, 550, 1221, 3786, 1803, 499, 2117, 185, 3763, 942, 589, 2001, 3838, 1483, 3154, 2256, 468, 2544, 3403, 898, 1208, 2610, 3622, 967, 1929, 378, +    3781, 220, 1656, 1115, 3347, 2428, 3822, 1577, 712, 1959, 110, 2765, 1762, 3854, 979, 2928, 3714, 1371, 746, 3969, 2884, 975, 3779, 641, 1142, 159, 1460, 702, 3485, 2866, 2495, 3330, 1305, 3937, 1635, 2229, 2962, 146, 4055, 3091, 2417, 100, 3508, 2933, 4006, 1167, 1920, 2760, 3552, 2545, 433, 2845, 142, 1056, 1886, 3616, 1435, 2099, 3803, 1749, 27, 1446, 3350, 2843, +    884, 3310, 2948, 2103, 447, 1351, 187, 2895, 3655, 1256, 3036, 932, 3325, 2257, 451, 1915, 40, 2780, 2438, 1112, 1814, 423, 2290, 1905, 2898, 3419, 2306, 3760, 1938, 486, 1019, 1791, 3010, 2628, 203, 3408, 1269, 2507, 1606, 862, 2779, 2078, 952, 1529, 2638, 708, 3332, 1413, 2, 1726, 1156, 3500, 2392, 3791, 3076, 812, 107, 2861, 501, 3050, 3487, 2455, 594, 1731, +    2685, 1498, 680, 3908, 2621, 3529, 1786, 2236, 342, 2569, 1526, 3722, 230, 1290, 3203, 3947, 1609, 3516, 467, 3267, 3685, 1461, 3140, 3569, 367, 1759, 928, 2754, 1332, 2219, 4034, 260, 655, 1984, 978, 3814, 617, 2086, 3525, 279, 3841, 1373, 3361, 319, 2251, 3066, 407, 2382, 3918, 3133, 2168, 762, 1523, 507, 2641, 1677, 4025, 2413, 1584, 793, 2049, 1109, 3962, 2218, +    1194, 3692, 266, 1687, 981, 3103, 740, 3983, 1005, 3434, 570, 2383, 1942, 2718, 676, 2462, 1007, 2089, 1308, 2222, 233, 2568, 829, 1241, 2669, 3987, 514, 3303, 69, 3142, 1603, 3560, 2295, 3288, 1497, 2696, 1764, 2865, 1058, 3271, 1914, 477, 2529, 3927, 1736, 1273, 3752, 2029, 1012, 565, 2798, 4078, 1949, 3305, 1175, 2179, 380, 3366, 1195, 3849, 2637, 416, 2959, 125, +    3396, 2467, 2036, 3234, 2340, 68, 2819, 1436, 2011, 3139, 1704, 4073, 860, 3582, 1468, 2969, 211, 3157, 4056, 866, 2935, 2000, 3923, 31, 2157, 1477, 2429, 1147, 3792, 2557, 774, 2802, 1153, 3747, 464, 3192, 42, 3904, 539, 1474, 2283, 803, 2876, 1061, 75, 3477, 747, 2893, 1538, 3626, 251, 1322, 2506, 189, 2791, 3667, 939, 2991, 1971, 175, 3195, 1416, 3648, 1857, +    3052, 454, 851, 3789, 1271, 1906, 3694, 2484, 406, 2757, 26, 1189, 2909, 296, 2215, 3784, 1864, 637, 2715, 1673, 3445, 581, 1572, 3059, 3469, 761, 2984, 1737, 2058, 440, 1414, 1921, 121, 2527, 894, 2223, 1302, 2377, 3077, 2666, 3759, 3198, 1811, 3661, 2166, 2731, 1883, 359, 3285, 2458, 1805, 3459, 926, 3834, 675, 1893, 1496, 2612, 657, 3523, 1763, 2354, 564, 961, +    1367, 3977, 1588, 2714, 322, 3446, 1088, 625, 3887, 1354, 3535, 2090, 3316, 1760, 1127, 483, 3491, 1421, 2301, 94, 1202, 3740, 2311, 1014, 1878, 3836, 180, 3412, 991, 2868, 3953, 3450, 3081, 1632, 4071, 1882, 3543, 726, 1719, 179, 1171, 364, 1420, 622, 3090, 1490, 946, 4007, 2212, 1102, 619, 
2739, 2189, 1669, 2937, 3426, 39, 3940, 2191, 1264, 887, 4091, 2792, 2135, +    4, 2883, 2281, 631, 3044, 1641, 2232, 3243, 1773, 2319, 827, 2591, 629, 3938, 2426, 3222, 2629, 1044, 3879, 3293, 1952, 2749, 275, 2590, 472, 1372, 2496, 660, 3669, 2264, 208, 915, 2167, 561, 2828, 307, 3265, 1104, 3964, 2155, 3425, 1951, 4077, 2391, 283, 3387, 2581, 115, 1415, 3069, 3896, 141, 3158, 1214, 442, 2405, 1349, 3085, 425, 2528, 3002, 312, 1602, 3588, +    1137, 3323, 1963, 1002, 3578, 2521, 127, 925, 2970, 273, 3737, 1573, 167, 2863, 1509, 800, 147, 2059, 2942, 409, 921, 3151, 1451, 3909, 3333, 2844, 2096, 1512, 3136, 1210, 1798, 2709, 1331, 3586, 1034, 1521, 2441, 2926, 488, 2585, 775, 3031, 2693, 879, 3602, 1173, 2028, 3654, 2781, 841, 1975, 1507, 3646, 768, 3991, 2012, 996, 3544, 1666, 3810, 1990, 3360, 753, 2597, +    3736, 304, 1473, 3828, 485, 1334, 4008, 2072, 3495, 1136, 2806, 2004, 3236, 1010, 2130, 3819, 1750, 3567, 644, 2515, 1794, 3636, 698, 2137, 1162, 832, 3761, 326, 2613, 513, 3302, 3820, 357, 3163, 2259, 3733, 101, 1922, 1386, 3587, 1640, 28, 1286, 2141, 1761, 2918, 693, 1639, 457, 3250, 2434, 365, 2599, 1729, 3284, 2643, 306, 2793, 689, 1090, 104, 1309, 2305, 1831, +    2776, 859, 2446, 2915, 1778, 3337, 2677, 614, 1508, 2409, 469, 4033, 1321, 3563, 402, 3131, 2720, 1093, 1569, 4042, 1229, 2277, 216, 3046, 1817, 57, 3006, 1684, 4059, 2016, 795, 2440, 1652, 1960, 610, 2763, 920, 3864, 3110, 1026, 2326, 3762, 3233, 521, 3856, 173, 2457, 3939, 2138, 1262, 3572, 989, 3021, 2238, 119, 1445, 3832, 1809, 2297, 3467, 2700, 3684, 3102, 394, +    4036, 2050, 3256, 89, 2198, 1079, 248, 1845, 3805, 3104, 880, 1779, 2688, 717, 2373, 1375, 262, 2249, 3071, 13, 2813, 3429, 1600, 3984, 2416, 3603, 1299, 2298, 998, 3492, 1393, 2951, 10, 4009, 1247, 3462, 1679, 2204, 414, 2736, 316, 1894, 2816, 1050, 3373, 1462, 3107, 817, 3464, 21, 1835, 4070, 568, 1178, 3718, 875, 3168, 466, 2974, 1458, 2084, 616, 1564, 1018, +    1693, 546, 1244, 3899, 716, 3160, 3608, 2877, 1220, 334, 3443, 2270, 44, 3000, 1843, 3928, 3405, 766, 3686, 2040, 587, 993, 2647, 387, 930, 2753, 630, 3274, 150, 2808, 453, 3638, 1092, 2352, 3030, 239, 2562, 700, 3240, 1257, 4016, 730, 1515, 2203, 2551, 417, 1866, 1123, 2348, 2902, 1550, 2678, 2075, 3238, 1630, 2531, 2115, 1255, 4054, 840, 290, 3874, 2477, 3399, +    2250, 3577, 2817, 1626, 2576, 1356, 2315, 792, 2087, 2618, 1612, 3855, 1263, 3637, 1036, 494, 1535, 2553, 1198, 1715, 3867, 3170, 1359, 1954, 3483, 1539, 2069, 3886, 1772, 2487, 1534, 2045, 3242, 806, 1578, 2018, 3948, 1423, 3596, 2076, 2466, 3424, 139, 3688, 871, 4049, 2852, 3342, 547, 3719, 327, 852, 3505, 207, 2794, 542, 3600, 45, 2411, 3324, 1788, 3012, 1235, 61, +    2655, 917, 253, 1986, 3738, 313, 1706, 4072, 120, 3229, 957, 597, 2024, 3262, 2453, 2857, 2002, 3190, 210, 2784, 2206, 300, 2400, 3766, 553, 3152, 218, 1150, 2988, 883, 3753, 627, 2664, 3831, 437, 3385, 1008, 2957, 60, 1636, 891, 2899, 1776, 3062, 1315, 2026, 194, 1643, 2079, 1296, 3201, 2465, 1379, 1927, 3898, 1125, 1847, 2846, 1552, 1028, 2725, 2169, 787, 3202, +    1441, 3982, 3032, 1052, 3251, 605, 2639, 3073, 1431, 3642, 2329, 2949, 341, 1634, 833, 129, 4020, 916, 3571, 669, 1506, 3411, 821, 2856, 1207, 2337, 2683, 3448, 340, 2214, 3128, 235, 1738, 1288, 2833, 2419, 606, 1884, 2668, 552, 3765, 1176, 399, 2302, 596, 3591, 2634, 767, 3845, 2767, 995, 3967, 491, 3057, 814, 2300, 3422, 691, 3797, 254, 3645, 509, 3478, 1836, +    2119, 475, 2445, 1525, 2175, 3539, 914, 1926, 473, 1157, 1800, 3971, 2701, 3739, 2129, 3486, 1333, 1784, 2366, 2982, 1070, 4089, 
1802, 73, 1642, 3958, 835, 1837, 1480, 4043, 1217, 2469, 3416, 2113, 88, 3668, 1240, 3255, 3920, 2355, 3167, 2003, 2645, 3936, 3228, 1592, 1144, 3474, 2394, 79, 1820, 2241, 1594, 3656, 2584, 153, 1448, 3034, 2005, 2511, 1692, 1335, 3913, 217, +    2822, 3391, 745, 3813, 192, 1274, 2941, 3847, 2489, 3440, 744, 161, 1422, 1086, 572, 3004, 2617, 338, 3807, 2031, 236, 2472, 3065, 2098, 3358, 362, 2163, 3574, 497, 2788, 1970, 948, 3885, 685, 3100, 1712, 2228, 292, 1408, 1016, 164, 3537, 1417, 941, 34, 2172, 3001, 358, 1491, 3147, 699, 3356, 258, 1149, 2946, 1787, 3931, 382, 1146, 3291, 818, 2890, 2379, 1096, +    3679, 1328, 1901, 3162, 2747, 1730, 2253, 5, 1556, 2818, 2093, 3166, 2522, 3410, 2287, 1701, 956, 3237, 620, 1596, 3300, 1307, 511, 3701, 1020, 2939, 1362, 2532, 3208, 749, 3641, 160, 1522, 2624, 1095, 4086, 826, 2841, 3583, 2173, 1727, 723, 2925, 1911, 2482, 3726, 863, 1962, 4028, 1111, 2835, 3773, 2449, 2022, 582, 3278, 923, 2619, 2152, 4039, 92, 1934, 3145, 677, +    2530, 53, 2303, 1003, 458, 3989, 739, 3321, 1064, 369, 3556, 877, 1900, 426, 3876, 1, 3617, 2106, 1197, 2805, 3634, 857, 2706, 1504, 2418, 682, 3868, 20, 1139, 1688, 2333, 3311, 2907, 1945, 265, 2385, 3433, 1601, 636, 2620, 3095, 4044, 386, 3382, 1184, 527, 2814, 3414, 2342, 465, 1889, 1343, 874, 3479, 1502, 2233, 3689, 1385, 559, 2745, 1463, 3465, 376, 1718, +    3217, 4045, 1580, 3612, 2525, 1228, 3018, 1958, 3725, 2358, 1361, 3996, 1581, 3063, 1224, 2737, 1475, 2442, 3946, 191, 1796, 2128, 3975, 134, 1916, 3318, 1597, 2071, 3749, 2672, 403, 1278, 602, 3745, 3220, 1374, 445, 2064, 3830, 243, 1252, 2390, 1563, 2724, 3875, 1818, 1346, 165, 1650, 3264, 2680, 117, 2998, 4081, 343, 2799, 9, 3122, 1743, 3724, 1040, 2231, 3842, 1209, +    900, 398, 2851, 697, 1797, 3482, 293, 2679, 1649, 566, 2954, 91, 2697, 714, 2060, 3211, 781, 480, 3040, 1038, 2611, 666, 2989, 3458, 1201, 2796, 548, 2975, 839, 3121, 1850, 4001, 2208, 1631, 790, 2558, 2972, 1148, 3213, 1849, 3624, 971, 2102, 108, 772, 3101, 2589, 3777, 1042, 656, 3907, 2097, 1615, 2540, 805, 1935, 1231, 3494, 2451, 268, 2995, 750, 2682, 2020, +    3024, 1392, 2124, 3279, 106, 2217, 1387, 822, 3214, 3825, 2160, 1000, 2395, 3691, 228, 4038, 1872, 3413, 1608, 2225, 3536, 303, 1653, 886, 2541, 224, 4037, 2252, 1428, 172, 3504, 958, 2848, 113, 3628, 1834, 3979, 19, 2317, 779, 2797, 518, 3174, 3549, 1482, 2266, 444, 2014, 3555, 2439, 1213, 3113, 535, 1135, 3204, 3858, 2309, 931, 623, 2009, 3359, 1566, 140, 3550, +    1808, 3872, 2488, 1152, 3764, 2892, 3960, 2412, 353, 1223, 1825, 3444, 3116, 1717, 1082, 2313, 1280, 2661, 82, 3852, 1389, 3200, 2330, 3812, 2038, 3581, 1728, 1039, 3339, 2427, 586, 2580, 1238, 3328, 2280, 1047, 595, 2662, 1363, 3338, 1620, 3934, 2497, 1881, 1054, 3954, 3215, 864, 2887, 1801, 320, 3519, 2378, 3704, 1753, 424, 2958, 1660, 4005, 2601, 1116, 3912, 2381, 573, +    2740, 200, 828, 1667, 432, 1931, 1035, 1616, 3598, 2640, 728, 264, 1437, 557, 3501, 2966, 372, 3734, 974, 1978, 758, 2719, 1145, 452, 1433, 725, 2681, 408, 3843, 1918, 1547, 3906, 1996, 503, 1456, 3019, 3493, 1700, 3742, 355, 2134, 176, 1311, 615, 2867, 315, 1680, 1314, 8, 3297, 1494, 783, 1950, 83, 2656, 1382, 3561, 138, 2834, 1404, 330, 1904, 3156, 1027, +    1357, 3381, 3041, 3666, 2729, 734, 3415, 177, 3051, 2021, 4079, 2823, 3775, 2186, 2616, 869, 1668, 3148, 2367, 3315, 393, 4075, 1870, 2920, 3343, 2362, 3188, 1303, 2782, 825, 3171, 259, 2905, 3717, 2538, 184, 2074, 838, 2860, 2407, 1024, 3496, 3008, 3706, 1985, 2349, 3623, 2582, 4058, 2184, 2694, 3873, 2964, 990, 3346, 690, 2033, 1066, 
2201, 3490, 2971, 718, 3700, 2188, +    4061, 391, 1989, 2325, 1430, 3150, 2125, 2526, 592, 1403, 976, 2351, 1165, 1851, 114, 3921, 2063, 613, 1358, 2785, 1623, 2254, 25, 3542, 1045, 246, 1852, 3554, 87, 2243, 3615, 1169, 727, 1705, 968, 3957, 3185, 1251, 500, 4063, 1751, 2622, 842, 1519, 90, 3393, 819, 490, 1874, 999, 571, 1275, 2271, 1586, 4040, 2448, 3126, 3731, 436, 885, 1708, 2421, 24, 1599, +    889, 2563, 1199, 645, 70, 4013, 1237, 3723, 1694, 3499, 3, 3266, 484, 2997, 3390, 1233, 2842, 3687, 152, 3480, 1084, 3698, 881, 2490, 1542, 3992, 2209, 692, 1690, 3022, 1470, 2625, 2114, 3512, 2359, 381, 2684, 1897, 3368, 1395, 3080, 289, 2065, 3981, 2758, 1141, 3097, 1472, 2870, 3352, 3707, 225, 3159, 505, 1895, 214, 1222, 1774, 2686, 3978, 3275, 1196, 3518, 2825, +    3270, 1720, 3796, 3466, 2650, 1841, 298, 899, 2862, 2091, 2671, 1744, 3735, 801, 1560, 349, 2262, 903, 1833, 2524, 512, 3117, 1793, 2827, 476, 3038, 1216, 2550, 3826, 980, 431, 4048, 35, 2992, 1265, 1595, 765, 3675, 76, 2247, 696, 3456, 1254, 2452, 664, 1757, 2133, 3750, 145, 2332, 1554, 1981, 3580, 2712, 868, 3640, 2919, 638, 2275, 1427, 309, 2595, 2006, 492, +    2226, 178, 2911, 836, 1528, 3028, 2240, 3327, 404, 3970, 707, 1294, 2464, 2131, 4032, 2600, 3319, 1406, 2913, 3974, 2156, 1425, 221, 3877, 2017, 811, 3662, 272, 3287, 1988, 2408, 3357, 1746, 598, 3239, 3823, 2182, 2934, 1078, 2604, 3840, 1697, 2906, 413, 3210, 3880, 331, 2644, 1260, 848, 3042, 2535, 1077, 1438, 3261, 2365, 1561, 3799, 85, 3082, 1876, 674, 3932, 1101, +    3644, 1344, 1943, 2401, 390, 3835, 1048, 2572, 1541, 1133, 3075, 3584, 308, 2889, 1065, 1869, 601, 3783, 282, 1181, 736, 3312, 2368, 1126, 3383, 1675, 2734, 1426, 628, 2873, 1317, 843, 2717, 2048, 1004, 2536, 333, 1782, 3295, 1517, 219, 2153, 815, 3502, 1579, 2268, 987, 3409, 1780, 4018, 354, 665, 3914, 47, 1956, 456, 1006, 2010, 3406, 1130, 3621, 2894, 1549, 3092, +    2485, 640, 3993, 3179, 1270, 3436, 585, 1925, 3757, 2304, 136, 1976, 1486, 646, 3520, 50, 3155, 1637, 2435, 3522, 1937, 2756, 3748, 661, 2224, 58, 3230, 2357, 1830, 3892, 170, 3607, 1447, 3949, 190, 3392, 1336, 584, 4010, 918, 3016, 3670, 1155, 2406, 52, 1304, 3009, 607, 2085, 2699, 3205, 1848, 2291, 3402, 2764, 3865, 3048, 2508, 735, 2710, 443, 2341, 897, 263, +    1785, 2769, 983, 56, 2197, 1685, 2703, 202, 2944, 810, 3377, 2626, 3787, 3047, 2055, 1236, 2752, 2122, 945, 3093, 96, 1624, 439, 3014, 1388, 4015, 977, 448, 3506, 1098, 2242, 3026, 506, 2361, 2952, 1862, 3619, 2790, 1992, 2483, 525, 1868, 2652, 4093, 1998, 3595, 2478, 3816, 122, 1412, 929, 3716, 1166, 1648, 813, 1300, 199, 1489, 3998, 1771, 1310, 3808, 2052, 3423, +    434, 3712, 1625, 3558, 2955, 853, 4019, 1348, 3511, 1732, 1246, 487, 934, 1672, 2510, 3965, 788, 3711, 396, 1369, 4090, 1055, 2603, 1879, 3528, 2518, 2067, 3005, 1516, 2588, 751, 1740, 3418, 1131, 1576, 686, 2296, 1118, 18, 3263, 1365, 3401, 294, 737, 3177, 410, 867, 1633, 2963, 3579, 2375, 252, 2881, 479, 2471, 3576, 2180, 3306, 332, 2255, 3035, 41, 2648, 1396, +    2929, 2230, 1219, 2512, 446, 2008, 3189, 2388, 626, 2164, 2831, 4047, 2376, 174, 3272, 368, 1469, 3226, 2578, 1991, 2874, 2263, 3681, 876, 188, 1239, 683, 3776, 226, 3183, 4083, 2148, 63, 2649, 3859, 299, 3086, 3933, 1585, 2185, 3767, 988, 1707, 2908, 1407, 1844, 2771, 2245, 1161, 560, 1755, 3376, 2051, 4064, 3135, 1832, 652, 2853, 1051, 3649, 760, 3290, 1105, 3945, +    872, 154, 3207, 713, 3780, 1453, 281, 1087, 3695, 30, 3299, 1919, 1400, 3551, 1119, 1890, 2314, 618, 1703, 3428, 724, 295, 3146, 1557, 3341, 2896, 1683, 2723, 1974, 1017, 541, 
1380, 3720, 804, 3280, 2082, 997, 2567, 777, 2961, 213, 2707, 2328, 3632, 1025, 3891, 3304, 255, 4003, 3108, 2587, 1323, 743, 1479, 105, 1013, 3901, 1618, 2044, 2627, 1465, 1846, 576, 1994, +    2560, 3521, 1742, 2118, 2800, 3404, 1783, 2609, 2968, 1582, 1022, 412, 2713, 687, 2976, 3857, 2761, 3620, 62, 1108, 3844, 1340, 2100, 540, 2345, 3925, 405, 3457, 1319, 2468, 3362, 2815, 1867, 2372, 1281, 1714, 3690, 482, 3498, 1842, 1285, 3994, 558, 2039, 81, 2499, 678, 1481, 1923, 964, 12, 3824, 2980, 2205, 2762, 3432, 2398, 181, 3247, 462, 4094, 2350, 3589, 3089, +    1555, 1094, 4041, 247, 1267, 908, 3959, 2041, 732, 3860, 2343, 3132, 3769, 2144, 1621, 237, 912, 1329, 3025, 2146, 2642, 1775, 3721, 2746, 1121, 1953, 902, 2285, 130, 3671, 1659, 278, 3153, 522, 2721, 123, 2996, 1466, 2380, 377, 3231, 873, 1510, 3476, 3123, 1250, 2147, 3650, 2839, 3451, 2323, 1122, 3545, 379, 1765, 1218, 603, 3768, 1360, 938, 2885, 133, 1245, 363, +    2364, 554, 2743, 3344, 2474, 530, 3112, 169, 1297, 3430, 536, 1741, 98, 1043, 2574, 3253, 2246, 1854, 4022, 510, 3283, 204, 858, 3398, 36, 3118, 1478, 3794, 2986, 706, 2176, 922, 3559, 1097, 3976, 3322, 2149, 1160, 2810, 3883, 2007, 2513, 2953, 328, 1721, 3793, 422, 2566, 807, 329, 1638, 1967, 648, 2520, 3727, 3109, 2116, 2927, 2491, 1939, 3365, 1709, 2728, 3815, +    2037, 3120, 831, 1405, 1896, 3592, 1622, 2369, 2864, 2151, 1107, 2542, 3532, 1410, 3917, 427, 3568, 709, 2509, 1503, 1037, 2973, 2436, 1604, 4035, 2594, 563, 1819, 2659, 1234, 4004, 2565, 1511, 2273, 1823, 336, 882, 3772, 575, 1628, 171, 3570, 1120, 2260, 2716, 935, 3064, 1806, 1342, 3144, 3900, 2744, 3296, 985, 1546, 238, 896, 1663, 305, 3660, 695, 2213, 960, 3407, +    144, 1795, 3894, 2267, 51, 2708, 1023, 3818, 366, 1821, 4087, 2985, 755, 2057, 2912, 949, 1583, 2774, 231, 3447, 2258, 3866, 1982, 672, 1225, 2077, 3320, 1062, 370, 3241, 1968, 7, 3068, 681, 3631, 2573, 1567, 3175, 2321, 1067, 3070, 722, 1856, 3744, 642, 1471, 4084, 131, 3514, 2443, 531, 1227, 155, 2265, 4024, 2658, 3326, 3910, 1168, 3078, 1530, 3956, 489, 1424, +    3647, 1203, 420, 2924, 3755, 719, 3248, 1376, 3067, 890, 196, 1559, 3269, 270, 2432, 1885, 3212, 1164, 3778, 1752, 579, 1338, 344, 3585, 3017, 288, 3658, 2371, 3882, 1691, 611, 2789, 3809, 1339, 389, 2950, 2015, 59, 3548, 2751, 2158, 4011, 1352, 29, 3388, 2370, 2812, 1946, 954, 2110, 1558, 2947, 3573, 1909, 1326, 679, 1853, 2312, 551, 2702, 33, 2414, 3209, 2824, +    2547, 2143, 3379, 966, 1492, 1979, 2479, 463, 2194, 3657, 2738, 2318, 1261, 3713, 604, 4002, 11, 2192, 2967, 919, 2607, 3369, 2837, 1676, 2539, 984, 1568, 93, 2901, 1318, 3538, 1041, 2216, 1756, 3454, 1030, 4050, 1402, 798, 1723, 311, 3277, 2546, 2886, 2043, 461, 1206, 3677, 361, 3260, 3988, 809, 2605, 470, 3007, 3517, 102, 3221, 1398, 2062, 3611, 1134, 1928, 865, +    4060, 621, 1710, 2606, 3510, 317, 4017, 1682, 3329, 1159, 1940, 654, 3461, 1789, 1015, 2691, 1455, 3599, 374, 1947, 4069, 71, 2126, 763, 3961, 2278, 3161, 1997, 824, 2623, 2080, 244, 3257, 780, 2732, 2308, 545, 3351, 2476, 3806, 1204, 588, 1591, 963, 3610, 1699, 754, 3049, 2651, 1106, 65, 2221, 1644, 3821, 1100, 2463, 1614, 3801, 965, 2965, 715, 3394, 1593, 212, +}; + +#endif /* BLUE_NOISE_64X64_H */ diff --git a/libs/pixman-0.40.0/pixman/dither/make-blue-noise.c b/libs/pixman-0.40.0/pixman/dither/make-blue-noise.c new file mode 100644 index 0000000..f9974b4 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/dither/make-blue-noise.c @@ -0,0 +1,679 @@ +/* Blue noise generation using the void-and-cluster method as described in + * + *     The 
void-and-cluster method for dither array generation + *     Ulichney, Robert A (1993) + * + *     http://cv.ulichney.com/papers/1993-void-cluster.pdf + * + * Note that running with openmp (-DUSE_OPENMP) will trigger additional + * randomness due to computing reductions in parallel, and is not recommended + * unless generating very large dither arrays. + */ + +#include <assert.h> +#include <stdlib.h> +#include <stdint.h> +#include <math.h> +#include <stdio.h> + +/* Booleans and utility functions */ + +#ifndef TRUE +#   define TRUE 1 +#endif + +#ifndef FALSE +#   define FALSE 0 +#endif + +typedef int bool_t; + +int +imin (int x, int y) +{ +    return x < y ? x : y; +} + +/* Memory allocation */ +void * +malloc_abc (unsigned int a, unsigned int b, unsigned int c) +{ +    if (a >= INT32_MAX / b) +	return NULL; +    else if (a * b >= INT32_MAX / c) +	return NULL; +    else +	return malloc (a * b * c); +} + +/* Random number generation */ +typedef uint32_t xorwow_state_t[5]; + +uint32_t +xorwow_next (xorwow_state_t *state) +{ +    uint32_t s = (*state)[0], +    t = (*state)[3]; +    (*state)[3] = (*state)[2]; +    (*state)[2] = (*state)[1]; +    (*state)[1] = s; + +    t ^= t >> 2; +    t ^= t << 1; +    t ^= s ^ (s << 4); + +    (*state)[0] = t; +    (*state)[4] += 362437; + +    return t + (*state)[4]; +} + +float +xorwow_float (xorwow_state_t *s) +{ +    return (xorwow_next (s) >> 9) / (float)((1 << 23) - 1); +} + +/* Floating point matrices + * + * Used to cache the cluster sizes. + */ +typedef struct matrix_t { +    int width; +    int height; +    float *buffer; +} matrix_t; + +bool_t +matrix_init (matrix_t *matrix, int width, int height) +{ +    float *buffer; + +    if (!matrix) +	return FALSE; + +    buffer = malloc_abc (width, height, sizeof (float)); + +    if (!buffer) +	return FALSE; + +    matrix->buffer = buffer; +    matrix->width  = width; +    matrix->height = height; + +    return TRUE; +} + +bool_t +matrix_copy (matrix_t *dst, matrix_t const *src) +{ +    float *srcbuf = src->buffer, +	  *srcend = src->buffer + src->width * src->height, +	  *dstbuf = dst->buffer; + +    if (dst->width != src->width || dst->height != src->height) +	return FALSE; + +    while (srcbuf < srcend) +	*dstbuf++ = *srcbuf++; + +    return TRUE; +} + +float * +matrix_get (matrix_t *matrix, int x, int y) +{ +    return &matrix->buffer[y * matrix->width + x]; +} + +void +matrix_destroy (matrix_t *matrix) +{ +    free (matrix->buffer); +} + +/* Binary patterns */ +typedef struct pattern_t { +    int width; +    int height; +    bool_t *buffer; +} pattern_t; + +bool_t +pattern_init (pattern_t *pattern, int width, int height) +{ +    bool_t *buffer; + +    if (!pattern) +	return FALSE; + +    buffer = malloc_abc (width, height, sizeof (bool_t)); + +    if (!buffer) +	return FALSE; + +    pattern->buffer = buffer; +    pattern->width  = width; +    pattern->height = height; + +    return TRUE; +} + +bool_t +pattern_copy (pattern_t *dst, pattern_t const *src) +{ +    bool_t *srcbuf = src->buffer, +	   *srcend = src->buffer + src->width * src->height, +	   *dstbuf = dst->buffer; + +    if (dst->width != src->width || dst->height != src->height) +	return FALSE; + +    while (srcbuf < srcend) +	*dstbuf++ = *srcbuf++; + +    return TRUE; +} + +bool_t * +pattern_get (pattern_t *pattern, int x, int y) +{ +    return &pattern->buffer[y * pattern->width + x]; +} + +void +pattern_fill_white_noise (pattern_t *pattern, float fraction, +			  xorwow_state_t *s) +{ +    bool_t *buffer = pattern->buffer; +    bool_t *end    
= buffer + (pattern->width * pattern->height); + +    while (buffer < end) +	*buffer++ = xorwow_float (s) < fraction; +} + +void +pattern_destroy (pattern_t *pattern) +{ +    free (pattern->buffer); +} + +/* Dither arrays */ +typedef struct array_t { +    int width; +    int height; +    uint32_t *buffer; +} array_t; + +bool_t +array_init (array_t *array, int width, int height) +{ +    uint32_t *buffer; + +    if (!array) +	return FALSE; + +    buffer = malloc_abc (width, height, sizeof (uint32_t)); + +    if (!buffer) +	return FALSE; + +    array->buffer = buffer; +    array->width  = width; +    array->height = height; + +    return TRUE; +} + +uint32_t * +array_get (array_t *array, int x, int y) +{ +    return &array->buffer[y * array->width + x]; +} + +bool_t +array_save_ppm (array_t *array, const char *filename) +{ +    FILE *f = fopen(filename, "wb"); + +    int i   = 0; +    int bpp = 2; +    uint8_t buffer[1024]; + +    if (!f) +	return FALSE; + +    if (array->width * array->height - 1 < 256) +	bpp = 1; + +    fprintf(f, "P5 %d %d %d\n", array->width, array->height, +	    array->width * array->height - 1); +    while (i < array->width * array->height) +    { +	    int j = 0; +	    for (; j < 1024 / bpp && j < array->width * array->height; ++j) +	    { +		    uint32_t v = array->buffer[i + j]; +		    if (bpp == 2) +		    { +			buffer[2 * j] = v & 0xff; +			buffer[2 * j + 1] = (v & 0xff00) >> 8; +		    } else { +			buffer[j] = v; +		    } +	    } + +	    fwrite((void *)buffer, bpp, j, f); +	    i += j; +    } + +    if (fclose(f) != 0) +	return FALSE; + +    return TRUE; +} + +bool_t +array_save (array_t *array, const char *filename) +{ +    int x, y; +    FILE *f = fopen(filename, "wb"); + +    if (!f) +	return FALSE; + +    fprintf (f,  +"/* WARNING: This file is generated by make-blue-noise.c\n" +" * Please edit that file instead of this one.\n" +" */\n" +"\n" +"#ifndef BLUE_NOISE_%dX%d_H\n" +"#define BLUE_NOISE_%dX%d_H\n" +"\n" +"#include <stdint.h>\n" +"\n", array->width, array->height, array->width, array->height); + +    fprintf (f, "static const uint16_t dither_blue_noise_%dx%d[%d] = {\n", +	     array->width, array->height, array->width * array->height); + +    for (y = 0; y < array->height; ++y) +    { +	fprintf (f, "    "); +	for (x = 0; x < array->width; ++x) +	{ +	    if (x != 0) +		fprintf (f, ", "); + +	    fprintf (f, "%d", *array_get (array, x, y)); +	} + +	fprintf (f, ",\n"); +    } +    fprintf (f, "};\n"); + +    fprintf (f, "\n#endif /* BLUE_NOISE_%dX%d_H */\n", +	     array->width, array->height); + +    if (fclose(f) != 0) +	return FALSE; + +    return TRUE; +} + +void +array_destroy (array_t *array) +{ +    free (array->buffer); +} + +/* Dither array generation */ +bool_t +compute_cluster_sizes (pattern_t *pattern, matrix_t *matrix) +{ +    int width  = pattern->width, +	height = pattern->height; + +    if (matrix->width != width || matrix->height != height) +	return FALSE; + +    int px, py, qx, qy, dx, dy; +    float tsqsi = 2.f * 1.5f * 1.5f; + +#ifdef USE_OPENMP +#pragma omp parallel for default (none) \ +    private (py, px, qy, qx, dx, dy) \ +    shared (height, width, pattern, matrix, tsqsi) +#endif +    for (py = 0; py < height; ++py) +    { +	for (px = 0; px < width; ++px) +	{ +	    bool_t pixel = *pattern_get (pattern, px, py); +	    float dist   = 0.f; + +	    for (qx = 0; qx < width; ++qx) +	    { +		dx = imin (abs (qx - px), width - abs (qx - px)); +		dx = dx * dx; + +		for (qy = 0; qy < height; ++qy) +		{ +		    dy = imin (abs (qy - py), height 
- abs (qy - py)); +		    dy = dy * dy; + +		    dist += (pixel == *pattern_get (pattern, qx, qy)) +			* expf (- (dx + dy) / tsqsi); +		} +	    } + +	    *matrix_get (matrix, px, py) = dist; +	} +    } + +    return TRUE; +} + +bool_t +swap_pixel (pattern_t *pattern, matrix_t *matrix, int x, int y) +{ +    int width  = pattern->width, +	height = pattern->height; + +    bool_t new; + +    float f, +          dist  = 0.f, +	  tsqsi = 2.f * 1.5f * 1.5f; + +    int px, py, dx, dy; +    bool_t b; + +    new = !*pattern_get (pattern, x, y); +    *pattern_get (pattern, x, y) = new; + +    if (matrix->width != width || matrix->height != height) +	return FALSE; + + +#ifdef USE_OPENMP +#pragma omp parallel for reduction (+:dist) default (none) \ +    private (px, py, dx, dy, b, f) \ +    shared (x, y, width, height, pattern, matrix, new, tsqsi) +#endif +    for (py = 0; py < height; ++py) +    { +	dy = imin (abs (py - y), height - abs (py - y)); +	dy = dy * dy; + +	for (px = 0; px < width; ++px) +	{ +	    dx = imin (abs (px - x), width - abs (px - x)); +	    dx = dx * dx; + +	    b = (*pattern_get (pattern, px, py) == new); +	    f = expf (- (dx + dy) / tsqsi); +	    *matrix_get (matrix, px, py) += (2 * b - 1) * f; + +	    dist += b * f; +	} +    } + +    *matrix_get (matrix, x, y) = dist; +    return TRUE; +} + +void +largest_cluster (pattern_t *pattern, matrix_t *matrix, +		 bool_t pixel, int *xmax, int *ymax) +{ +    int width       = pattern->width, +	height      = pattern->height; + +    int   x, y; + +    float vmax = -INFINITY; + +#ifdef USE_OPENMP +#pragma omp parallel default (none) \ +    private (x, y) \ +    shared (height, width, pattern, matrix, pixel, xmax, ymax, vmax) +#endif +    { +	int xbest = -1, +	    ybest = -1; + +#ifdef USE_OPENMP +	float vbest = -INFINITY; + +#pragma omp for reduction (max: vmax) collapse (2) +#endif +	for (y = 0; y < height; ++y) +	{ +	    for (x = 0; x < width; ++x) +	    { +		if (*pattern_get (pattern, x, y) != pixel) +		    continue; + +		if (*matrix_get (matrix, x, y) > vmax) +		{ +		    vmax = *matrix_get (matrix, x, y); +#ifdef USE_OPENMP +		    vbest = vmax; +#endif +		    xbest = x; +		    ybest = y; +		} +	    } +	} + +#ifdef USE_OPENMP +#pragma omp barrier +#pragma omp critical +	{ +	    if (vmax == vbest) +	    { +		*xmax = xbest; +		*ymax = ybest; +	    } +	} +#else +	*xmax = xbest; +	*ymax = ybest; +#endif +    } + +    assert (vmax > -INFINITY); +} + +void +generate_initial_binary_pattern (pattern_t *pattern, matrix_t *matrix) +{ +    int xcluster = 0, +	ycluster = 0, +	xvoid    = 0, +	yvoid    = 0; + +    for (;;) +    { +	largest_cluster (pattern, matrix, TRUE, &xcluster, &ycluster); +	assert (*pattern_get (pattern, xcluster, ycluster) == TRUE); +	swap_pixel (pattern, matrix, xcluster, ycluster); + +	largest_cluster (pattern, matrix, FALSE, &xvoid, &yvoid); +	assert (*pattern_get (pattern, xvoid, yvoid) == FALSE); +	swap_pixel (pattern, matrix, xvoid, yvoid); + +	if (xcluster == xvoid && ycluster == yvoid) +	    return; +    } +} + +bool_t +generate_dither_array (array_t *array, +		       pattern_t const *prototype, matrix_t const *matrix, +		       pattern_t *temp_pattern, matrix_t *temp_matrix) +{ +    int width        = prototype->width, +	height       = prototype->height; + +    int x, y, rank; + +    int initial_rank = 0; + +    if (array->width != width || array->height != height) +	return FALSE; + +    // Make copies of the prototype and associated sizes matrix since we will +    // trash them +    if (!pattern_copy (temp_pattern, 
prototype)) +	return FALSE; + +    if (!matrix_copy (temp_matrix, matrix)) +	return FALSE; + +    // Compute initial rank +    for (y = 0; y < height; ++y) +    { +	for (x = 0; x < width; ++x) +	{ +	    if (*pattern_get (temp_pattern, x, y)) +		initial_rank += 1; + +	    *array_get (array, x, y) = 0; +	} +    } + +    // Phase 1 +    for (rank = initial_rank; rank > 0; --rank) +    { +	largest_cluster (temp_pattern, temp_matrix, TRUE, &x, &y); +	swap_pixel (temp_pattern, temp_matrix, x, y); +	*array_get (array, x, y) = rank - 1; +    } + +    // Make copies again for phases 2 & 3 +    if (!pattern_copy (temp_pattern, prototype)) +	return FALSE; + +    if (!matrix_copy (temp_matrix, matrix)) +	return FALSE; + +    // Phase 2 & 3 +    for (rank = initial_rank; rank < width * height; ++rank) +    { +	largest_cluster (temp_pattern, temp_matrix, FALSE, &x, &y); +	swap_pixel (temp_pattern, temp_matrix, x, y); +	*array_get (array, x, y) = rank; +    } + +    return TRUE; +} + +bool_t +generate (int size, xorwow_state_t *s, +	  char const *c_filename, char const *ppm_filename) +{ +    bool_t ok = TRUE; + +    pattern_t prototype, temp_pattern; +    array_t   array; +    matrix_t  matrix, temp_matrix; + +    printf ("Generating %dx%d blue noise...\n", size, size); + +    if (!pattern_init (&prototype, size, size)) +	return FALSE; + +    if (!pattern_init (&temp_pattern, size, size)) +    { +	pattern_destroy (&prototype); +	return FALSE; +    } + +    if (!matrix_init (&matrix, size, size)) +    { +	pattern_destroy (&temp_pattern); +	pattern_destroy (&prototype); +	return FALSE; +    } + +    if (!matrix_init (&temp_matrix, size, size)) +    { +	matrix_destroy (&matrix); +	pattern_destroy (&temp_pattern); +	pattern_destroy (&prototype); +	return FALSE; +    } + +    if (!array_init (&array, size, size)) +    { +	matrix_destroy (&temp_matrix); +	matrix_destroy (&matrix); +	pattern_destroy (&temp_pattern); +	pattern_destroy (&prototype); +	return FALSE; +    } + +    printf("Filling initial binary pattern with white noise...\n"); +    pattern_fill_white_noise (&prototype, .1, s); + +    printf("Initializing cluster sizes...\n"); +    if (!compute_cluster_sizes (&prototype, &matrix)) +    { +	fprintf (stderr, "Error while computing cluster sizes\n"); +	ok = FALSE; +	goto out; +    } + +    printf("Generating initial binary pattern...\n"); +    generate_initial_binary_pattern (&prototype, &matrix); + +    printf("Generating dither array...\n"); +    if (!generate_dither_array (&array, &prototype, &matrix, +			 &temp_pattern, &temp_matrix)) +    { +	fprintf (stderr, "Error while generating dither array\n"); +	ok = FALSE; +	goto out; +    } + +    printf("Saving dither array...\n"); +    if (!array_save (&array, c_filename)) +    { +	fprintf (stderr, "Error saving dither array\n"); +	ok = FALSE; +	goto out; +    } + +#if SAVE_PPM +    if (!array_save_ppm (&array, ppm_filename)) +    { +	fprintf (stderr, "Error saving dither array PPM\n"); +	ok = FALSE; +	goto out; +    } +#else +    (void)ppm_filename; +#endif + +    printf("All done!\n"); + +out: +    array_destroy (&array); +    matrix_destroy (&temp_matrix); +    matrix_destroy (&matrix); +    pattern_destroy (&temp_pattern); +    pattern_destroy (&prototype); +    return ok; +} + +int +main (void) +{ +    xorwow_state_t s = {1185956906, 12385940, 983948, 349208051, 901842}; + +    if (!generate (64, &s, "blue-noise-64x64.h", "blue-noise-64x64.ppm")) +	return -1; + +    return 0; +} diff --git a/libs/pixman-0.40.0/pixman/loongson-mmintrin.h 
b/libs/pixman-0.40.0/pixman/loongson-mmintrin.h new file mode 100644 index 0000000..0e79e86 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/loongson-mmintrin.h @@ -0,0 +1,412 @@ +/* The gcc-provided loongson intrinsic functions are way too fucking broken + * to be of any use, otherwise I'd use them. + * + * - The hardware instructions are very similar to MMX or iwMMXt. Certainly + *   close enough that they could have implemented the _mm_*-style intrinsic + *   interface and had a ton of optimized code available to them. Instead they + *   implemented something much, much worse. + * + * - pshuf takes a dead first argument, causing extra instructions to be + *   generated. + * + * - There are no 64-bit shift or logical intrinsics, which means you have + *   to implement them with inline assembly, but this is a nightmare because + *   gcc doesn't understand that the integer vector datatypes are actually in + *   floating-point registers, so you end up with braindead code like + * + *	punpcklwd	$f9,$f9,$f5 + *	    dmtc1	v0,$f8 + *	punpcklwd	$f19,$f19,$f5 + *	    dmfc1	t9,$f9 + *	    dmtc1	v0,$f9 + *	    dmtc1	t9,$f20 + *	    dmfc1	s0,$f19 + *	punpcklbh	$f20,$f20,$f2 + * + *   where crap just gets copied back and forth between integer and floating- + *   point registers ad nauseum. + * + * Instead of trying to workaround the problems from these crap intrinsics, I + * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline + * assembly. + */ + +#include <stdint.h> + +/* vectors are stored in 64-bit floating-point registers */ +typedef double __m64; +/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */ +typedef float  __m32; + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_si64 (void) +{ +	return 0.0; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi16 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("paddh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi32 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("paddw %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pu16 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("paddush %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pu8 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("paddusb %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_si64 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("and %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("pcmpeqw %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_empty (void) +{ + +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd_pi16 (__m64 __m1, 
__m64 __m2) +{ +	__m64 ret; +	asm("pmaddhw %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pu16 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("pmulhuh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_pi16 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("pmullh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_si64 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("or %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pu16 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("packushb %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pi32 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("packsswh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0) +{ +	if (__builtin_constant_p (__w3) && +	    __builtin_constant_p (__w2) && +	    __builtin_constant_p (__w1) && +	    __builtin_constant_p (__w0)) +	{ +		uint64_t val = ((uint64_t)__w3 << 48) +			     | ((uint64_t)__w2 << 32) +			     | ((uint64_t)__w1 << 16) +			     | ((uint64_t)__w0 <<  0); +		return *(__m64 *)&val; +	} +	else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0) +	{ +		/* TODO: handle other cases */ +		uint64_t val = __w3; +		uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0); +		__m64 ret; +		asm("pshufh %0, %1, %2\n\t" +		    : "=f" (ret) +		    : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm) +		); +		return ret; +	} else { +		uint64_t val = ((uint64_t)__w3 << 48) +			     | ((uint64_t)__w2 << 32) +			     | ((uint64_t)__w1 << 16) +			     | ((uint64_t)__w0 <<  0); +		return *(__m64 *)&val; +	} +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi32 (unsigned __i1, unsigned __i0) +{ +	if (__builtin_constant_p (__i1) && +	    __builtin_constant_p (__i0)) +	{ +		uint64_t val = ((uint64_t)__i1 << 32) +			     | ((uint64_t)__i0 <<  0); +		return *(__m64 *)&val; +	} +	else if (__i1 == __i0) +	{ +		uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0); +		__m64 ret; +		asm("pshufh %0, %1, %2\n\t" +		    : "=f" (ret) +		    : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm) +		); +		return ret; +	} else { +		uint64_t val = ((uint64_t)__i1 << 32) +			     | ((uint64_t)__i0 <<  0); +		return *(__m64 *)&val; +	} +} +#undef _MM_SHUFFLE + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi16 (__m64 __m, int64_t __n) +{ +	__m64 ret; +	asm("pshufh %0, %1, %2\n\t" +	    : "=f" (ret) +	    : "f" (__m), "f" (*(__m64 *)&__n) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_pi16 (__m64 __m, int64_t __count) +{ +	__m64 ret; +	asm("psllh  
%0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m), "f" (*(__m64 *)&__count) +	); +	return ret; +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si64 (__m64 __m, int64_t __count) +{ +	__m64 ret; +	asm("dsll  %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m), "f" (*(__m64 *)&__count) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi16 (__m64 __m, int64_t __count) +{ +	__m64 ret; +	asm("psrlh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m), "f" (*(__m64 *)&__count) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi32 (__m64 __m, int64_t __count) +{ +	__m64 ret; +	asm("psrlw %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m), "f" (*(__m64 *)&__count) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si64 (__m64 __m, int64_t __count) +{ +	__m64 ret; +	asm("dsrl  %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m), "f" (*(__m64 *)&__count) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi16 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("psubh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("punpckhbh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("punpckhhw %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("punpcklbh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which + * allows load8888 to use 32-bit loads */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("punpcklbh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("punpcklhw %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_si64 (__m64 __m1, __m64 __m2) +{ +	__m64 ret; +	asm("xor %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m1), "f" (__m2) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +loongson_extract_pi16 (__m64 __m, int64_t __pos) +{ +	__m64 ret; +	asm("pextrh %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" (__m), "f" (*(__m64 *)&__pos) +	); +	return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos) +{ +	__m64 ret; +	asm("pinsrh_%3 %0, %1, %2\n\t" +	   : "=f" (ret) +	   : "f" 
(__m1), "f" (__m2), "i" (__pos) +	); +	return ret; +} diff --git a/libs/pixman-0.40.0/pixman/meson.build b/libs/pixman-0.40.0/pixman/meson.build new file mode 100644 index 0000000..f48357f --- /dev/null +++ b/libs/pixman-0.40.0/pixman/meson.build @@ -0,0 +1,129 @@ +# Copyright © 2018 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +config_h = configure_file( +  configuration : config, +  output : 'config.h' +) + +version_h = configure_file( +  configuration : version_conf, +  input : 'pixman-version.h.in', +  output : 'pixman-version.h', +  install_dir : join_paths(get_option('prefix'), get_option('includedir'), 'pixman-1') +) + +libpixman_extra_cargs = [] +if cc.has_function_attribute('dllexport') +  libpixman_extra_cargs = ['-DPIXMAN_API=__declspec(dllexport)'] +endif + +pixman_simd_libs = [] +simds = [ +  # the mmx library can be compiled with mmx on x86/x86_64, iwmmxt on +  # some arm cores, or loongson mmi on loongson mips systems. The +  # libraries will all have the same name, "pixman-mmx", but there is +  # no chance of more than one version being built in the same build +  # because no system could have mmx, iwmmxt, and mmi, and it +  # simplifies the build logic to give them the same name. 
+  ['mmx', have_mmx, mmx_flags, []], +  ['mmx', have_loongson_mmi, loongson_mmi_flags, []], +  ['mmx', have_iwmmxt, iwmmxt_flags, []], + +  ['sse2', have_sse2, sse2_flags, []], +  ['ssse3', have_ssse3, ssse3_flags, []], +  ['vmx', have_vmx, vmx_flags, []], +  ['arm-simd', have_armv6_simd, [], +   ['pixman-arm-simd-asm.S', 'pixman-arm-simd-asm-scaled.S']], +  ['arm-neon', have_neon, [], +   ['pixman-arm-neon-asm.S', 'pixman-arm-neon-asm-bilinear.S']], +  ['mips-dspr2', have_mips_dspr2, mips_dspr2_flags, +   ['pixman-mips-dspr2-asm.S', 'pixman-mips-memcpy-asm.S']], +] + +foreach simd : simds +  if simd[1] +    name = 'pixman-' + simd[0] +    pixman_simd_libs += static_library( +      name, +      [name + '.c', config_h, version_h, simd[3]], +      c_args : simd[2] +    ) +  endif +endforeach + +pixman_files = files( +  'pixman.c', +  'pixman-access.c', +  'pixman-access-accessors.c', +  'pixman-bits-image.c', +  'pixman-combine32.c', +  'pixman-combine-float.c', +  'pixman-conical-gradient.c', +  'pixman-filter.c', +  'pixman-x86.c', +  'pixman-mips.c', +  'pixman-arm.c', +  'pixman-ppc.c', +  'pixman-edge.c', +  'pixman-edge-accessors.c', +  'pixman-fast-path.c', +  'pixman-glyph.c', +  'pixman-general.c', +  'pixman-gradient-walker.c', +  'pixman-image.c', +  'pixman-implementation.c', +  'pixman-linear-gradient.c', +  'pixman-matrix.c', +  'pixman-noop.c', +  'pixman-radial-gradient.c', +  'pixman-region16.c', +  'pixman-region32.c', +  'pixman-solid-fill.c', +  'pixman-timer.c', +  'pixman-trap.c', +  'pixman-utils.c', +) + +# We cannot use 'link_with' or 'link_whole' because meson wont do the right +# thing for static archives. +_obs = [] +foreach l : pixman_simd_libs +  _obs += l.extract_all_objects() +endforeach + +libpixman = library( +  'pixman-1', +  [pixman_files, config_h, version_h], +  objects : _obs, +  c_args : libpixman_extra_cargs, +  dependencies : [dep_m, dep_threads], +  version : meson.project_version(), +  install : true, +) + +inc_pixman = include_directories('.') + +idep_pixman = declare_dependency( +  link_with: libpixman, +  include_directories : inc_pixman, +) + +install_headers('pixman.h', subdir : 'pixman-1') diff --git a/libs/pixman-0.40.0/pixman/pixman-access-accessors.c b/libs/pixman-0.40.0/pixman/pixman-access-accessors.c new file mode 100644 index 0000000..3263582 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-access-accessors.c @@ -0,0 +1,3 @@ +#define PIXMAN_FB_ACCESSORS + +#include "pixman-access.c" diff --git a/libs/pixman-0.40.0/pixman/pixman-access.c b/libs/pixman-0.40.0/pixman/pixman-access.c new file mode 100644 index 0000000..7c5ce78 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-access.c @@ -0,0 +1,1559 @@ +/* + * + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + *             2005 Lars Knoll & Zack Rusin, Trolltech + *             2008 Aaron Plattner, NVIDIA Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  
It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <math.h> + +#include "pixman-accessor.h" +#include "pixman-private.h" + +#define CONVERT_RGB24_TO_Y15(s)						\ +    (((((s) >> 16) & 0xff) * 153 +					\ +      (((s) >>  8) & 0xff) * 301 +					\ +      (((s)      ) & 0xff) * 58) >> 2) + +#define CONVERT_RGB24_TO_RGB15(s)                                       \ +    ((((s) >> 3) & 0x001f) |                                            \ +     (((s) >> 6) & 0x03e0) |                                            \ +     (((s) >> 9) & 0x7c00)) + +/* Fetch macros */ + +#ifdef WORDS_BIGENDIAN +#define FETCH_1(img,l,o)						\ +    (((READ ((img), ((uint32_t *)(l)) + ((o) >> 5))) >> (0x1f - ((o) & 0x1f))) & 0x1) +#else +#define FETCH_1(img,l,o)						\ +    ((((READ ((img), ((uint32_t *)(l)) + ((o) >> 5))) >> ((o) & 0x1f))) & 0x1) +#endif + +#define FETCH_8(img,l,o)    (READ (img, (((uint8_t *)(l)) + ((o) >> 3)))) + +#ifdef WORDS_BIGENDIAN +#define FETCH_4(img,l,o)						\ +    (((4 * (o)) & 4) ? (FETCH_8 (img,l, 4 * (o)) & 0xf) : (FETCH_8 (img,l,(4 * (o))) >> 4)) +#else +#define FETCH_4(img,l,o)						\ +    (((4 * (o)) & 4) ? (FETCH_8 (img, l, 4 * (o)) >> 4) : (FETCH_8 (img, l, (4 * (o))) & 0xf)) +#endif + +#ifdef WORDS_BIGENDIAN +#define FETCH_24(img,l,o)                                              \ +    ((uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16)    |       \ +     (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)     |       \ +     (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0)) +#else +#define FETCH_24(img,l,o)						\ +    ((uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0)	|	\ +     (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)	|	\ +     (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16)) +#endif + +/* Store macros */ + +#ifdef WORDS_BIGENDIAN +#define STORE_1(img,l,o,v)						\ +    do									\ +    {									\ +	uint32_t  *__d = ((uint32_t *)(l)) + ((o) >> 5);		\ +	uint32_t __m, __v;						\ +									\ +	__m = 1U << (0x1f - ((o) & 0x1f));				\ +	__v = (v)? __m : 0;						\ +									\ +	WRITE((img), __d, (READ((img), __d) & ~__m) | __v);		\ +    }									\ +    while (0) +#else +#define STORE_1(img,l,o,v)						\ +    do									\ +    {									\ +	uint32_t  *__d = ((uint32_t *)(l)) + ((o) >> 5);		\ +	uint32_t __m, __v;						\ +									\ +	__m = 1U << ((o) & 0x1f);					\ +	__v = (v)? __m : 0;						\ +									\ +	WRITE((img), __d, (READ((img), __d) & ~__m) | __v);		\ +    }									\ +    while (0) +#endif + +#define STORE_8(img,l,o,v)  (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v))) + +#ifdef WORDS_BIGENDIAN +#define STORE_4(img,l,o,v)						\ +    do									\ +    {									\ +	int bo = 4 * (o);						\ +	int v4 = (v) & 0x0f;						\ +									\ +	STORE_8 (img, l, bo, (						\ +		     bo & 4 ?						
\ +		     (FETCH_8 (img, l, bo) & 0xf0) | (v4) :		\ +		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4)));	\ +    } while (0) +#else +#define STORE_4(img,l,o,v)						\ +    do									\ +    {									\ +	int bo = 4 * (o);						\ +	int v4 = (v) & 0x0f;						\ +									\ +	STORE_8 (img, l, bo, (						\ +		     bo & 4 ?						\ +		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4) :	\ +		     (FETCH_8 (img, l, bo) & 0xf0) | (v4)));		\ +    } while (0) +#endif + +#ifdef WORDS_BIGENDIAN +#define STORE_24(img,l,o,v)                                            \ +    do                                                                 \ +    {                                                                  \ +	uint8_t *__tmp = (l) + 3 * (o);				       \ +        							       \ +	WRITE ((img), __tmp++, ((v) & 0x00ff0000) >> 16);	       \ +	WRITE ((img), __tmp++, ((v) & 0x0000ff00) >>  8);	       \ +	WRITE ((img), __tmp++, ((v) & 0x000000ff) >>  0);	       \ +    }                                                                  \ +    while (0) +#else +#define STORE_24(img,l,o,v)                                            \ +    do                                                                 \ +    {                                                                  \ +	uint8_t *__tmp = (l) + 3 * (o);				       \ +        							       \ +	WRITE ((img), __tmp++, ((v) & 0x000000ff) >>  0);	       \ +	WRITE ((img), __tmp++, ((v) & 0x0000ff00) >>  8);	       \ +	WRITE ((img), __tmp++, ((v) & 0x00ff0000) >> 16);	       \ +    }								       \ +    while (0) +#endif + +/* + * YV12 setup and access macros + */ + +#define YV12_SETUP(image)                                               \ +    bits_image_t *__bits_image = (bits_image_t *)image;                 \ +    uint32_t *bits = __bits_image->bits;                                \ +    int stride = __bits_image->rowstride;                               \ +    int offset0 = stride < 0 ?                                          \ +    ((-stride) >> 1) * ((__bits_image->height - 1) >> 1) - stride :	\ +    stride * __bits_image->height;					\ +    int offset1 = stride < 0 ?                                          \ +    offset0 + ((-stride) >> 1) * ((__bits_image->height) >> 1) :	\ +	offset0 + (offset0 >> 2) + +/* Note no trailing semicolon on the above macro; if it's there, then + * the typical usage of YV12_SETUP(image); will have an extra trailing ; + * that some compilers will interpret as a statement -- and then any further + * variable declarations will cause an error. + */ + +#define YV12_Y(line)                                                    \ +    ((uint8_t *) ((bits) + (stride) * (line))) + +#define YV12_U(line)                                                    \ +    ((uint8_t *) ((bits) + offset1 +                                    \ +                  ((stride) >> 1) * ((line) >> 1))) + +#define YV12_V(line)                                                    \ +    ((uint8_t *) ((bits) + offset0 +                                    \ +                  ((stride) >> 1) * ((line) >> 1))) + +/* Misc. 
helpers */ + +static force_inline void +get_shifts (pixman_format_code_t  format, +	    int			 *a, +	    int			 *r, +	    int                  *g, +	    int                  *b) +{ +    switch (PIXMAN_FORMAT_TYPE (format)) +    { +    case PIXMAN_TYPE_A: +	*b = 0; +	*g = 0; +	*r = 0; +	*a = 0; +	break; + +    case PIXMAN_TYPE_ARGB: +    case PIXMAN_TYPE_ARGB_SRGB: +	*b = 0; +	*g = *b + PIXMAN_FORMAT_B (format); +	*r = *g + PIXMAN_FORMAT_G (format); +	*a = *r + PIXMAN_FORMAT_R (format); +	break; + +    case PIXMAN_TYPE_ABGR: +	*r = 0; +	*g = *r + PIXMAN_FORMAT_R (format); +	*b = *g + PIXMAN_FORMAT_G (format); +	*a = *b + PIXMAN_FORMAT_B (format); +	break; + +    case PIXMAN_TYPE_BGRA: +	/* With BGRA formats we start counting at the high end of the pixel */ +	*b = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_B (format); +	*g = *b - PIXMAN_FORMAT_B (format); +	*r = *g - PIXMAN_FORMAT_G (format); +	*a = *r - PIXMAN_FORMAT_R (format); +	break; + +    case PIXMAN_TYPE_RGBA: +	/* With BGRA formats we start counting at the high end of the pixel */ +	*r = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_R (format); +	*g = *r - PIXMAN_FORMAT_R (format); +	*b = *g - PIXMAN_FORMAT_G (format); +	*a = *b - PIXMAN_FORMAT_B (format); +	break; + +    default: +	assert (0); +	break; +    } +} + +static force_inline uint32_t +convert_channel (uint32_t pixel, uint32_t def_value, +		 int n_from_bits, int from_shift, +		 int n_to_bits, int to_shift) +{ +    uint32_t v; + +    if (n_from_bits && n_to_bits) +	v  = unorm_to_unorm (pixel >> from_shift, n_from_bits, n_to_bits); +    else if (n_to_bits) +	v = def_value; +    else +	v = 0; + +    return (v & ((1 << n_to_bits) - 1)) << to_shift; +} + +static force_inline uint32_t +convert_pixel (pixman_format_code_t from, pixman_format_code_t to, uint32_t pixel) +{ +    int a_from_shift, r_from_shift, g_from_shift, b_from_shift; +    int a_to_shift, r_to_shift, g_to_shift, b_to_shift; +    uint32_t a, r, g, b; + +    get_shifts (from, &a_from_shift, &r_from_shift, &g_from_shift, &b_from_shift); +    get_shifts (to, &a_to_shift, &r_to_shift, &g_to_shift, &b_to_shift); + +    a = convert_channel (pixel, ~0, +			 PIXMAN_FORMAT_A (from), a_from_shift, +			 PIXMAN_FORMAT_A (to), a_to_shift); + +    r = convert_channel (pixel, 0, +			 PIXMAN_FORMAT_R (from), r_from_shift, +			 PIXMAN_FORMAT_R (to), r_to_shift); + +    g = convert_channel (pixel, 0, +			 PIXMAN_FORMAT_G (from), g_from_shift, +			 PIXMAN_FORMAT_G (to), g_to_shift); + +    b = convert_channel (pixel, 0, +			 PIXMAN_FORMAT_B (from), b_from_shift, +			 PIXMAN_FORMAT_B (to), b_to_shift); + +    return a | r | g | b; +} + +static force_inline uint32_t +convert_pixel_to_a8r8g8b8 (bits_image_t *image, +			   pixman_format_code_t format, +			   uint32_t pixel) +{ +    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY		|| +	PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR) +    { +	return image->indexed->rgba[pixel]; +    } +    else +    { +	return convert_pixel (format, PIXMAN_a8r8g8b8, pixel); +    } +} + +static force_inline uint32_t +convert_pixel_from_a8r8g8b8 (pixman_image_t *image, +			     pixman_format_code_t format, uint32_t pixel) +{ +    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY) +    { +	pixel = CONVERT_RGB24_TO_Y15 (pixel); + +	return image->bits.indexed->ent[pixel & 0x7fff]; +    } +    else if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR) +    { +	pixel = convert_pixel (PIXMAN_a8r8g8b8, PIXMAN_x1r5g5b5, pixel); + +	return image->bits.indexed->ent[pixel & 0x7fff]; +    } +    else +    { +	
return convert_pixel (PIXMAN_a8r8g8b8, format, pixel); +    } +} + +static force_inline uint32_t +fetch_and_convert_pixel (bits_image_t *		image, +			 const uint8_t *	bits, +			 int			offset, +			 pixman_format_code_t	format) +{ +    uint32_t pixel; + +    switch (PIXMAN_FORMAT_BPP (format)) +    { +    case 1: +	pixel = FETCH_1 (image, bits, offset); +	break; + +    case 4: +	pixel = FETCH_4 (image, bits, offset); +	break; + +    case 8: +	pixel = READ (image, bits + offset); +	break; + +    case 16: +	pixel = READ (image, ((uint16_t *)bits + offset)); +	break; + +    case 24: +	pixel = FETCH_24 (image, bits, offset); +	break; + +    case 32: +	pixel = READ (image, ((uint32_t *)bits + offset)); +	break; + +    default: +	pixel = 0xffff00ff; /* As ugly as possible to detect the bug */ +	break; +    } + +    return convert_pixel_to_a8r8g8b8 (image, format, pixel); +} + +static force_inline void +convert_and_store_pixel (bits_image_t *		image, +			 uint8_t *		dest, +			 int                    offset, +			 pixman_format_code_t	format, +			 uint32_t		pixel) +{ +    uint32_t converted = convert_pixel_from_a8r8g8b8 ( +	(pixman_image_t *)image, format, pixel); + +    switch (PIXMAN_FORMAT_BPP (format)) +    { +    case 1: +	STORE_1 (image, dest, offset, converted & 0x01); +	break; + +    case 4: +	STORE_4 (image, dest, offset, converted & 0xf); +	break; + +    case 8: +	WRITE (image, (dest + offset), converted & 0xff); +	break; + +    case 16: +	WRITE (image, ((uint16_t *)dest + offset), converted & 0xffff); +	break; + +    case 24: +	STORE_24 (image, dest, offset, converted); +	break; + +    case 32: +	WRITE (image, ((uint32_t *)dest + offset), converted); +	break; + +    default: +	*dest = 0x0; +	break; +    } +} + +#define MAKE_ACCESSORS(format)						\ +    static void								\ +    fetch_scanline_ ## format (bits_image_t *image,			\ +			       int	       x,			\ +			       int             y,			\ +			       int             width,			\ +			       uint32_t *      buffer,			\ +			       const uint32_t *mask)			\ +    {									\ +	uint8_t *bits =							\ +	    (uint8_t *)(image->bits + y * image->rowstride);		\ +	int i;								\ +									\ +	for (i = 0; i < width; ++i)					\ +	{								\ +	    *buffer++ =							\ +		fetch_and_convert_pixel (image, bits, x + i, PIXMAN_ ## format); \ +	}								\ +    }									\ +									\ +    static void								\ +    store_scanline_ ## format (bits_image_t *  image,			\ +			       int             x,			\ +			       int             y,			\ +			       int             width,			\ +			       const uint32_t *values)			\ +    {									\ +	uint8_t *dest =							\ +	    (uint8_t *)(image->bits + y * image->rowstride);		\ +	int i;								\ +									\ +	for (i = 0; i < width; ++i)					\ +	{								\ +	    convert_and_store_pixel (					\ +		image, dest, i + x, PIXMAN_ ## format, values[i]);	\ +	}								\ +    }									\ +									\ +    static uint32_t							\ +    fetch_pixel_ ## format (bits_image_t *image,			\ +			    int		offset,				\ +			    int		line)				\ +    {									\ +	uint8_t *bits =							\ +	    (uint8_t *)(image->bits + line * image->rowstride);		\ +									\ +	return fetch_and_convert_pixel (				\ +	    image, bits, offset, PIXMAN_ ## format);			\ +    }									\ +									\ +    static const void *const __dummy__ ## format + +MAKE_ACCESSORS(a8r8g8b8); +MAKE_ACCESSORS(x8r8g8b8); +MAKE_ACCESSORS(a8b8g8r8); +MAKE_ACCESSORS(x8b8g8r8); +MAKE_ACCESSORS(x14r6g6b6); +MAKE_ACCESSORS(b8g8r8a8); +MAKE_ACCESSORS(b8g8r8x8); +MAKE_ACCESSORS(r8g8b8x8); 
+MAKE_ACCESSORS(r8g8b8a8); +MAKE_ACCESSORS(r8g8b8); +MAKE_ACCESSORS(b8g8r8); +MAKE_ACCESSORS(r5g6b5); +MAKE_ACCESSORS(b5g6r5); +MAKE_ACCESSORS(a1r5g5b5); +MAKE_ACCESSORS(x1r5g5b5); +MAKE_ACCESSORS(a1b5g5r5); +MAKE_ACCESSORS(x1b5g5r5); +MAKE_ACCESSORS(a4r4g4b4); +MAKE_ACCESSORS(x4r4g4b4); +MAKE_ACCESSORS(a4b4g4r4); +MAKE_ACCESSORS(x4b4g4r4); +MAKE_ACCESSORS(a8); +MAKE_ACCESSORS(c8); +MAKE_ACCESSORS(g8); +MAKE_ACCESSORS(r3g3b2); +MAKE_ACCESSORS(b2g3r3); +MAKE_ACCESSORS(a2r2g2b2); +MAKE_ACCESSORS(a2b2g2r2); +MAKE_ACCESSORS(x4a4); +MAKE_ACCESSORS(a4); +MAKE_ACCESSORS(g4); +MAKE_ACCESSORS(c4); +MAKE_ACCESSORS(r1g2b1); +MAKE_ACCESSORS(b1g2r1); +MAKE_ACCESSORS(a1r1g1b1); +MAKE_ACCESSORS(a1b1g1r1); +MAKE_ACCESSORS(a1); +MAKE_ACCESSORS(g1); + +/********************************** Fetch ************************************/ +/* Table mapping sRGB-encoded 8 bit numbers to linearly encoded + * floating point numbers. We assume that single precision + * floating point follows the IEEE 754 format. + */ +static const uint32_t to_linear_u[256] = +{ +    0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40e, 0x3a9f22b4, 0x3ac6eb61, +    0x3aeeb40e, 0x3b0b3e5d, 0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518a, +    0x3b70f18a, 0x3b83e1c5, 0x3b8fe614, 0x3b9c87fb, 0x3ba9c9b5, 0x3bb7ad6d, +    0x3bc63547, 0x3bd5635f, 0x3be539bd, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152, +    0x3c15a703, 0x3c1f45bc, 0x3c293e68, 0x3c3391f4, 0x3c3e4149, 0x3c494d43, +    0x3c54b6c7, 0x3c607eb1, 0x3c6ca5df, 0x3c792d22, 0x3c830aa8, 0x3c89af9e, +    0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63432, 0x3cadd37d, 0x3cb5a601, +    0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d2, 0x3cdfd00e, 0x3ce8ddb9, +    0x3cf2212c, 0x3cfb9ac1, 0x3d02a569, 0x3d0798dc, 0x3d0ca7e4, 0x3d11d2ae, +    0x3d171963, 0x3d1c7c2e, 0x3d21fb3a, 0x3d2796af, 0x3d2d4ebb, 0x3d332380, +    0x3d39152b, 0x3d3f23e3, 0x3d454fd0, 0x3d4b991c, 0x3d51ffeb, 0x3d588466, +    0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c210, 0x3d7add25, 0x3d810b65, +    0x3d84b793, 0x3d88732e, 0x3d8c3e48, 0x3d9018f4, 0x3d940343, 0x3d97fd48, +    0x3d9c0714, 0x3da020b9, 0x3da44a48, 0x3da883d6, 0x3daccd70, 0x3db12728, +    0x3db59110, 0x3dba0b38, 0x3dbe95b2, 0x3dc3308f, 0x3dc7dbe0, 0x3dcc97b4, +    0x3dd1641c, 0x3dd6412a, 0x3ddb2eec, 0x3de02d75, 0x3de53cd3, 0x3dea5d16, +    0x3def8e52, 0x3df4d091, 0x3dfa23e5, 0x3dff885e, 0x3e027f06, 0x3e05427f, +    0x3e080ea2, 0x3e0ae376, 0x3e0dc104, 0x3e10a752, 0x3e139669, 0x3e168e50, +    0x3e198f0e, 0x3e1c98ab, 0x3e1fab2e, 0x3e22c6a0, 0x3e25eb08, 0x3e29186a, +    0x3e2c4ed0, 0x3e2f8e42, 0x3e32d6c4, 0x3e362861, 0x3e39831e, 0x3e3ce702, +    0x3e405416, 0x3e43ca5e, 0x3e4749e4, 0x3e4ad2ae, 0x3e4e64c2, 0x3e520027, +    0x3e55a4e6, 0x3e595303, 0x3e5d0a8a, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf, +    0x3e6c4720, 0x3e702e08, 0x3e741e7f, 0x3e78188c, 0x3e7c1c34, 0x3e8014c0, +    0x3e822039, 0x3e84308b, 0x3e8645b8, 0x3e885fc3, 0x3e8a7eb0, 0x3e8ca281, +    0x3e8ecb3a, 0x3e90f8df, 0x3e932b72, 0x3e9562f6, 0x3e979f6f, 0x3e99e0e0, +    0x3e9c274e, 0x3e9e72b8, 0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d28a, +    0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18332, 0x3eb3fc16, 0x3eb67a15, +    0x3eb8fd34, 0x3ebb8576, 0x3ebe12de, 0x3ec0a56e, 0x3ec33d2a, 0x3ec5da14, +    0x3ec87c30, 0x3ecb2380, 0x3ecdd008, 0x3ed081ca, 0x3ed338c9, 0x3ed5f508, +    0x3ed8b68a, 0x3edb7d52, 0x3ede4962, 0x3ee11abe, 0x3ee3f168, 0x3ee6cd64, +    0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba, 0x3ef56976, 0x3ef86594, +    0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8, +    0x3f06f105, 0x3f0884ce, 0x3f0a1b54, 
0x3f0bb499, 0x3f0d509f, 0x3f0eef65, +    0x3f1090ef, 0x3f12353c, 0x3f13dc50, 0x3f15862a, 0x3f1732cc, 0x3f18e237, +    0x3f1a946d, 0x3f1c4970, 0x3f1e013f, 0x3f1fbbde, 0x3f21794c, 0x3f23398c, +    0x3f24fca0, 0x3f26c286, 0x3f288b42, 0x3f2a56d3, 0x3f2c253d, 0x3f2df680, +    0x3f2fca9d, 0x3f31a195, 0x3f337b6a, 0x3f35581e, 0x3f3737b1, 0x3f391a24, +    0x3f3aff7a, 0x3f3ce7b2, 0x3f3ed2d0, 0x3f40c0d2, 0x3f42b1bc, 0x3f44a58e, +    0x3f469c49, 0x3f4895ee, 0x3f4a9280, 0x3f4c91ff, 0x3f4e946c, 0x3f5099c8, +    0x3f52a216, 0x3f54ad55, 0x3f56bb88, 0x3f58ccae, 0x3f5ae0cb, 0x3f5cf7de, +    0x3f5f11ec, 0x3f612ef0, 0x3f634eef, 0x3f6571ea, 0x3f6797e1, 0x3f69c0d6, +    0x3f6beccb, 0x3f6e1bc0, 0x3f704db6, 0x3f7282af, 0x3f74baac, 0x3f76f5ae, +    0x3f7933b6, 0x3f7b74c6, 0x3f7db8de, 0x3f800000 +}; + +static const float * const to_linear = (const float *)to_linear_u; + +static uint8_t +to_srgb (float f) +{ +    uint8_t low = 0; +    uint8_t high = 255; + +    while (high - low > 1) +    { +	uint8_t mid = (low + high) / 2; + +	if (to_linear[mid] > f) +	    high = mid; +	else +	    low = mid; +    } + +    if (to_linear[high] - f < f - to_linear[low]) +	return high; +    else +	return low; +} + +static void +fetch_scanline_a8r8g8b8_sRGB_float (bits_image_t *  image, +				    int             x, +				    int             y, +				    int             width, +				    uint32_t *      b, +				    const uint32_t *mask) +{ +    const uint32_t *bits = image->bits + y * image->rowstride; +    const uint32_t *pixel = bits + x; +    const uint32_t *end = pixel + width; +    argb_t *buffer = (argb_t *)b; + +    while (pixel < end) +    { +	uint32_t p = READ (image, pixel++); +	argb_t *argb = buffer; + +	argb->a = pixman_unorm_to_float ((p >> 24) & 0xff, 8); + +	argb->r = to_linear [(p >> 16) & 0xff]; +	argb->g = to_linear [(p >>  8) & 0xff]; +	argb->b = to_linear [(p >>  0) & 0xff]; + +	buffer++; +    } +} + +/* Expects a float buffer */ +static void +fetch_scanline_a2r10g10b10_float (bits_image_t *  image, +				  int             x, +				  int             y, +				  int             width, +				  uint32_t *      b, +				  const uint32_t *mask) +{ +    const uint32_t *bits = image->bits + y * image->rowstride; +    const uint32_t *pixel = bits + x; +    const uint32_t *end = pixel + width; +    argb_t *buffer = (argb_t *)b; + +    while (pixel < end) +    { +	uint32_t p = READ (image, pixel++); +	uint64_t a = p >> 30; +	uint64_t r = (p >> 20) & 0x3ff; +	uint64_t g = (p >> 10) & 0x3ff; +	uint64_t b = p & 0x3ff; + +	buffer->a = pixman_unorm_to_float (a, 2); +	buffer->r = pixman_unorm_to_float (r, 10); +	buffer->g = pixman_unorm_to_float (g, 10); +	buffer->b = pixman_unorm_to_float (b, 10); + +	buffer++; +    } +} + +/* Expects a float buffer */ +#ifndef PIXMAN_FB_ACCESSORS +static void +fetch_scanline_rgbf_float (bits_image_t   *image, +			   int             x, +			   int             y, +			   int             width, +			   uint32_t *      b, +			   const uint32_t *mask) +{ +    const float *bits = (float *)image->bits + y * image->rowstride; +    const float *pixel = bits + x * 3; +    argb_t *buffer = (argb_t *)b; + +    for (; width--; buffer++) { +	buffer->r = *pixel++; +	buffer->g = *pixel++; +	buffer->b = *pixel++; +	buffer->a = 1.f; +    } +} + +static void +fetch_scanline_rgbaf_float (bits_image_t   *image, +			    int             x, +			    int             y, +			    int             width, +			    uint32_t *      b, +			    const uint32_t *mask) +{ +    const float *bits = (float *)image->bits + y * image->rowstride; +    const 
float *pixel = bits + x * 4; +    argb_t *buffer = (argb_t *)b; + +    for (; width--; buffer++) { +	buffer->r = *pixel++; +	buffer->g = *pixel++; +	buffer->b = *pixel++; +	buffer->a = *pixel++; +    } +} +#endif + +static void +fetch_scanline_x2r10g10b10_float (bits_image_t   *image, +				  int             x, +				  int             y, +				  int             width, +				  uint32_t *      b, +				  const uint32_t *mask) +{ +    const uint32_t *bits = image->bits + y * image->rowstride; +    const uint32_t *pixel = (uint32_t *)bits + x; +    const uint32_t *end = pixel + width; +    argb_t *buffer = (argb_t *)b; + +    while (pixel < end) +    { +	uint32_t p = READ (image, pixel++); +	uint64_t r = (p >> 20) & 0x3ff; +	uint64_t g = (p >> 10) & 0x3ff; +	uint64_t b = p & 0x3ff; + +	buffer->a = 1.0; +	buffer->r = pixman_unorm_to_float (r, 10); +	buffer->g = pixman_unorm_to_float (g, 10); +	buffer->b = pixman_unorm_to_float (b, 10); + +	buffer++; +    } +} + +/* Expects a float buffer */ +static void +fetch_scanline_a2b10g10r10_float (bits_image_t   *image, +				  int             x, +				  int             y, +				  int             width, +				  uint32_t *      b, +				  const uint32_t *mask) +{ +    const uint32_t *bits = image->bits + y * image->rowstride; +    const uint32_t *pixel = bits + x; +    const uint32_t *end = pixel + width; +    argb_t *buffer = (argb_t *)b; + +    while (pixel < end) +    { +	uint32_t p = READ (image, pixel++); +	uint64_t a = p >> 30; +	uint64_t b = (p >> 20) & 0x3ff; +	uint64_t g = (p >> 10) & 0x3ff; +	uint64_t r = p & 0x3ff; + +	buffer->a = pixman_unorm_to_float (a, 2); +	buffer->r = pixman_unorm_to_float (r, 10); +	buffer->g = pixman_unorm_to_float (g, 10); +	buffer->b = pixman_unorm_to_float (b, 10); + +	buffer++; +    } +} + +/* Expects a float buffer */ +static void +fetch_scanline_x2b10g10r10_float (bits_image_t   *image, +				  int             x, +				  int             y, +				  int             width, +				  uint32_t *      b, +				  const uint32_t *mask) +{ +    const uint32_t *bits = image->bits + y * image->rowstride; +    const uint32_t *pixel = (uint32_t *)bits + x; +    const uint32_t *end = pixel + width; +    argb_t *buffer = (argb_t *)b; + +    while (pixel < end) +    { +	uint32_t p = READ (image, pixel++); +	uint64_t b = (p >> 20) & 0x3ff; +	uint64_t g = (p >> 10) & 0x3ff; +	uint64_t r = p & 0x3ff; + +	buffer->a = 1.0; +	buffer->r = pixman_unorm_to_float (r, 10); +	buffer->g = pixman_unorm_to_float (g, 10); +	buffer->b = pixman_unorm_to_float (b, 10); + +	buffer++; +    } +} + +static void +fetch_scanline_yuy2 (bits_image_t   *image, +                     int             x, +                     int             line, +                     int             width, +                     uint32_t *      buffer, +                     const uint32_t *mask) +{ +    const uint32_t *bits = image->bits + image->rowstride * line; +    int i; +     +    for (i = 0; i < width; i++) +    { +	int16_t y, u, v; +	int32_t r, g, b; +	 +	y = ((uint8_t *) bits)[(x + i) << 1] - 16; +	u = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 1] - 128; +	v = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 3] - 128; +	 +	/* R = 1.164(Y - 16) + 1.596(V - 128) */ +	r = 0x012b27 * y + 0x019a2e * v; +	/* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */ +	g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u; +	/* B = 1.164(Y - 16) + 2.018(U - 128) */ +	b = 0x012b27 * y + 0x0206a2 * u; +	 +	*buffer++ = 0xff000000 | +	    (r >= 0 ? r < 0x1000000 ? 
r         & 0xff0000 : 0xff0000 : 0) | +	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) | +	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0); +    } +} + +static void +fetch_scanline_yv12 (bits_image_t   *image, +                     int             x, +                     int             line, +                     int             width, +                     uint32_t *      buffer, +                     const uint32_t *mask) +{ +    YV12_SETUP (image); +    uint8_t *y_line = YV12_Y (line); +    uint8_t *u_line = YV12_U (line); +    uint8_t *v_line = YV12_V (line); +    int i; +     +    for (i = 0; i < width; i++) +    { +	int16_t y, u, v; +	int32_t r, g, b; + +	y = y_line[x + i] - 16; +	u = u_line[(x + i) >> 1] - 128; +	v = v_line[(x + i) >> 1] - 128; + +	/* R = 1.164(Y - 16) + 1.596(V - 128) */ +	r = 0x012b27 * y + 0x019a2e * v; +	/* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */ +	g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u; +	/* B = 1.164(Y - 16) + 2.018(U - 128) */ +	b = 0x012b27 * y + 0x0206a2 * u; + +	*buffer++ = 0xff000000 | +	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) | +	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) | +	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0); +    } +} + +/**************************** Pixel wise fetching *****************************/ + +#ifndef PIXMAN_FB_ACCESSORS +static argb_t +fetch_pixel_rgbf_float (bits_image_t *image, +			int	    offset, +			int	    line) +{ +    float *bits = (float *)image->bits + line * image->rowstride; +    argb_t argb; + +    argb.r = bits[offset * 3]; +    argb.g = bits[offset * 3 + 1]; +    argb.b = bits[offset * 3 + 2]; +    argb.a = 1.f; + +    return argb; +} + +static argb_t +fetch_pixel_rgbaf_float (bits_image_t *image, +			 int	    offset, +			 int	    line) +{ +    float *bits = (float *)image->bits + line * image->rowstride; +    argb_t argb; + +    argb.r = bits[offset * 4]; +    argb.g = bits[offset * 4 + 1]; +    argb.b = bits[offset * 4 + 2]; +    argb.a = bits[offset * 4 + 3]; + +    return argb; +} +#endif + +static argb_t +fetch_pixel_x2r10g10b10_float (bits_image_t *image, +			       int	   offset, +			       int           line) +{ +    uint32_t *bits = image->bits + line * image->rowstride; +    uint32_t p = READ (image, bits + offset); +    uint64_t r = (p >> 20) & 0x3ff; +    uint64_t g = (p >> 10) & 0x3ff; +    uint64_t b = p & 0x3ff; +    argb_t argb; + +    argb.a = 1.0; +    argb.r = pixman_unorm_to_float (r, 10); +    argb.g = pixman_unorm_to_float (g, 10); +    argb.b = pixman_unorm_to_float (b, 10); + +    return argb; +} + +static argb_t +fetch_pixel_a2r10g10b10_float (bits_image_t *image, +			       int	     offset, +			       int           line) +{ +    uint32_t *bits = image->bits + line * image->rowstride; +    uint32_t p = READ (image, bits + offset); +    uint64_t a = p >> 30; +    uint64_t r = (p >> 20) & 0x3ff; +    uint64_t g = (p >> 10) & 0x3ff; +    uint64_t b = p & 0x3ff; +    argb_t argb; + +    argb.a = pixman_unorm_to_float (a, 2); +    argb.r = pixman_unorm_to_float (r, 10); +    argb.g = pixman_unorm_to_float (g, 10); +    argb.b = pixman_unorm_to_float (b, 10); + +    return argb; +} + +static argb_t +fetch_pixel_a2b10g10r10_float (bits_image_t *image, +			       int           offset, +			       int           line) +{ +    uint32_t *bits = image->bits + line * image->rowstride; +    uint32_t p = READ (image, bits + offset); +    uint64_t a = p >> 30; +    uint64_t 
b = (p >> 20) & 0x3ff; +    uint64_t g = (p >> 10) & 0x3ff; +    uint64_t r = p & 0x3ff; +    argb_t argb; + +    argb.a = pixman_unorm_to_float (a, 2); +    argb.r = pixman_unorm_to_float (r, 10); +    argb.g = pixman_unorm_to_float (g, 10); +    argb.b = pixman_unorm_to_float (b, 10); + +    return argb; +} + +static argb_t +fetch_pixel_x2b10g10r10_float (bits_image_t *image, +			       int           offset, +			       int           line) +{ +    uint32_t *bits = image->bits + line * image->rowstride; +    uint32_t p = READ (image, bits + offset); +    uint64_t b = (p >> 20) & 0x3ff; +    uint64_t g = (p >> 10) & 0x3ff; +    uint64_t r = p & 0x3ff; +    argb_t argb; + +    argb.a = 1.0; +    argb.r = pixman_unorm_to_float (r, 10); +    argb.g = pixman_unorm_to_float (g, 10); +    argb.b = pixman_unorm_to_float (b, 10); + +    return argb; +} + +static argb_t +fetch_pixel_a8r8g8b8_sRGB_float (bits_image_t *image, +				 int	       offset, +				 int           line) +{ +    uint32_t *bits = image->bits + line * image->rowstride; +    uint32_t p = READ (image, bits + offset); +    argb_t argb; + +    argb.a = pixman_unorm_to_float ((p >> 24) & 0xff, 8); + +    argb.r = to_linear [(p >> 16) & 0xff]; +    argb.g = to_linear [(p >>  8) & 0xff]; +    argb.b = to_linear [(p >>  0) & 0xff]; + +    return argb; +} + +static uint32_t +fetch_pixel_yuy2 (bits_image_t *image, +		  int           offset, +		  int           line) +{ +    const uint32_t *bits = image->bits + image->rowstride * line; +     +    int16_t y, u, v; +    int32_t r, g, b; +     +    y = ((uint8_t *) bits)[offset << 1] - 16; +    u = ((uint8_t *) bits)[((offset << 1) & - 4) + 1] - 128; +    v = ((uint8_t *) bits)[((offset << 1) & - 4) + 3] - 128; +     +    /* R = 1.164(Y - 16) + 1.596(V - 128) */ +    r = 0x012b27 * y + 0x019a2e * v; +     +    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */ +    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u; +     +    /* B = 1.164(Y - 16) + 2.018(U - 128) */ +    b = 0x012b27 * y + 0x0206a2 * u; +     +    return 0xff000000 | +	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) | +	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) | +	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0); +} + +static uint32_t +fetch_pixel_yv12 (bits_image_t *image, +		  int           offset, +		  int           line) +{ +    YV12_SETUP (image); +    int16_t y = YV12_Y (line)[offset] - 16; +    int16_t u = YV12_U (line)[offset >> 1] - 128; +    int16_t v = YV12_V (line)[offset >> 1] - 128; +    int32_t r, g, b; +     +    /* R = 1.164(Y - 16) + 1.596(V - 128) */ +    r = 0x012b27 * y + 0x019a2e * v; +     +    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */ +    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u; +     +    /* B = 1.164(Y - 16) + 2.018(U - 128) */ +    b = 0x012b27 * y + 0x0206a2 * u; +     +    return 0xff000000 | +	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) | +	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) | +	(b >= 0 ? b < 0x1000000 ? 
(b >> 16) & 0x0000ff : 0x0000ff : 0); +} + +/*********************************** Store ************************************/ + +#ifndef PIXMAN_FB_ACCESSORS +static void +store_scanline_rgbaf_float (bits_image_t *  image, +			    int             x, +			    int             y, +			    int             width, +			    const uint32_t *v) +{ +    float *bits = (float *)image->bits + image->rowstride * y + 4 * x; +    const argb_t *values = (argb_t *)v; + +    for (; width; width--, values++) +    { +	*bits++ = values->r; +	*bits++ = values->g; +	*bits++ = values->b; +	*bits++ = values->a; +    } +} + +static void +store_scanline_rgbf_float (bits_image_t *  image, +			   int             x, +			   int             y, +			   int             width, +			   const uint32_t *v) +{ +    float *bits = (float *)image->bits + image->rowstride * y + 3 * x; +    const argb_t *values = (argb_t *)v; + +    for (; width; width--, values++) +    { +	*bits++ = values->r; +	*bits++ = values->g; +	*bits++ = values->b; +    } +} +#endif + +static void +store_scanline_a2r10g10b10_float (bits_image_t *  image, +				  int             x, +				  int             y, +				  int             width, +				  const uint32_t *v) +{ +    uint32_t *bits = image->bits + image->rowstride * y; +    uint32_t *pixel = bits + x; +    argb_t *values = (argb_t *)v; +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t a, r, g, b; + +	a = pixman_float_to_unorm (values[i].a, 2); +	r = pixman_float_to_unorm (values[i].r, 10); +	g = pixman_float_to_unorm (values[i].g, 10); +	b = pixman_float_to_unorm (values[i].b, 10); + +	WRITE (image, pixel++, +	       (a << 30) | (r << 20) | (g << 10) | b); +    } +} + +static void +store_scanline_x2r10g10b10_float (bits_image_t *  image, +				  int             x, +				  int             y, +				  int             width, +				  const uint32_t *v) +{ +    uint32_t *bits = image->bits + image->rowstride * y; +    uint32_t *pixel = bits + x; +    argb_t *values = (argb_t *)v; +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t r, g, b; + +	r = pixman_float_to_unorm (values[i].r, 10); +	g = pixman_float_to_unorm (values[i].g, 10); +	b = pixman_float_to_unorm (values[i].b, 10); + +	WRITE (image, pixel++, +	       (r << 20) | (g << 10) | b); +    } +} + +static void +store_scanline_a2b10g10r10_float (bits_image_t *  image, +				  int             x, +				  int             y, +				  int             width, +				  const uint32_t *v) +{ +    uint32_t *bits = image->bits + image->rowstride * y; +    uint32_t *pixel = bits + x; +    argb_t *values = (argb_t *)v; +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t a, r, g, b; + +	a = pixman_float_to_unorm (values[i].a, 2); +	r = pixman_float_to_unorm (values[i].r, 10); +	g = pixman_float_to_unorm (values[i].g, 10); +	b = pixman_float_to_unorm (values[i].b, 10); + +	WRITE (image, pixel++, +	       (a << 30) | (b << 20) | (g << 10) | r); +    } +} + +static void +store_scanline_x2b10g10r10_float (bits_image_t *  image, +				  int             x, +				  int             y, +				  int             width, +				  const uint32_t *v) +{ +    uint32_t *bits = image->bits + image->rowstride * y; +    uint32_t *pixel = bits + x; +    argb_t *values = (argb_t *)v; +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t r, g, b; + +	r = pixman_float_to_unorm (values[i].r, 10); +	g = pixman_float_to_unorm (values[i].g, 10); +	b = pixman_float_to_unorm (values[i].b, 10); + +	WRITE (image, pixel++, +	       (b << 20) | (g << 10) | r); +    } +} + 
+static void +store_scanline_a8r8g8b8_sRGB_float (bits_image_t *  image, +				    int             x, +				    int             y, +				    int             width, +				    const uint32_t *v) +{ +    uint32_t *bits = image->bits + image->rowstride * y; +    uint32_t *pixel = bits + x; +    argb_t *values = (argb_t *)v; +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t a, r, g, b; + +	a = pixman_float_to_unorm (values[i].a, 8); +	r = to_srgb (values[i].r); +	g = to_srgb (values[i].g); +	b = to_srgb (values[i].b); + +	WRITE (image, pixel++, +	       (a << 24) | (r << 16) | (g << 8) | b); +    } +} + +/* + * Contracts a floating point image to 32bpp and then stores it using a + * regular 32-bit store proc. Despite the type, this function expects an + * argb_t buffer. + */ +static void +store_scanline_generic_float (bits_image_t *  image, +			      int             x, +			      int             y, +			      int             width, +			      const uint32_t *values) +{ +    uint32_t *argb8_pixels; + +    assert (image->common.type == BITS); + +    argb8_pixels = pixman_malloc_ab (width, sizeof(uint32_t)); +    if (!argb8_pixels) +	return; + +    /* Contract the scanline.  We could do this in place if values weren't +     * const. +     */ +    pixman_contract_from_float (argb8_pixels, (argb_t *)values, width); + +    image->store_scanline_32 (image, x, y, width, argb8_pixels); + +    free (argb8_pixels); +} + +static void +fetch_scanline_generic_float (bits_image_t *  image, +			      int	      x, +			      int	      y, +			      int	      width, +			      uint32_t *      buffer, +			      const uint32_t *mask) +{ +    image->fetch_scanline_32 (image, x, y, width, buffer, NULL); + +    pixman_expand_to_float ((argb_t *)buffer, buffer, image->format, width); +} + +/* The 32_sRGB paths should be deleted after narrow processing + * is no longer invoked for formats that are considered wide. 
+ * (Also see fetch_pixel_generic_lossy_32) */ +static void +fetch_scanline_a8r8g8b8_32_sRGB (bits_image_t   *image, +                                 int             x, +                                 int             y, +                                 int             width, +                                 uint32_t       *buffer, +                                 const uint32_t *mask) +{ +    const uint32_t *bits = image->bits + y * image->rowstride; +    const uint32_t *pixel = (uint32_t *)bits + x; +    const uint32_t *end = pixel + width; +    uint32_t tmp; +     +    while (pixel < end) +    { +	uint32_t a, r, g, b; + +	tmp = READ (image, pixel++); + +	a = (tmp >> 24) & 0xff; +	r = (tmp >> 16) & 0xff; +	g = (tmp >> 8) & 0xff; +	b = (tmp >> 0) & 0xff; + +	r = to_linear[r] * 255.0f + 0.5f; +	g = to_linear[g] * 255.0f + 0.5f; +	b = to_linear[b] * 255.0f + 0.5f; + +	*buffer++ = (a << 24) | (r << 16) | (g << 8) | (b << 0); +    } +} + +static uint32_t +fetch_pixel_a8r8g8b8_32_sRGB (bits_image_t *image, +			      int           offset, +			      int           line) +{ +    uint32_t *bits = image->bits + line * image->rowstride; +    uint32_t tmp = READ (image, bits + offset); +    uint32_t a, r, g, b; + +    a = (tmp >> 24) & 0xff; +    r = (tmp >> 16) & 0xff; +    g = (tmp >> 8) & 0xff; +    b = (tmp >> 0) & 0xff; + +    r = to_linear[r] * 255.0f + 0.5f; +    g = to_linear[g] * 255.0f + 0.5f; +    b = to_linear[b] * 255.0f + 0.5f; + +    return (a << 24) | (r << 16) | (g << 8) | (b << 0); +} + +static void +store_scanline_a8r8g8b8_32_sRGB (bits_image_t   *image, +                                 int             x, +                                 int             y, +                                 int             width, +                                 const uint32_t *v) +{ +    uint32_t *bits = image->bits + image->rowstride * y; +    uint64_t *values = (uint64_t *)v; +    uint32_t *pixel = bits + x; +    uint64_t tmp; +    int i; +     +    for (i = 0; i < width; ++i) +    { +	uint32_t a, r, g, b; + +	tmp = values[i]; + +	a = (tmp >> 24) & 0xff; +	r = (tmp >> 16) & 0xff; +	g = (tmp >> 8) & 0xff; +	b = (tmp >> 0) & 0xff; + +	r = to_srgb (r * (1/255.0f)); +	g = to_srgb (g * (1/255.0f)); +	b = to_srgb (b * (1/255.0f)); +	 +	WRITE (image, pixel++, a | (r << 16) | (g << 8) | (b << 0)); +    } +} + +static argb_t +fetch_pixel_generic_float (bits_image_t *image, +			   int		 offset, +			   int           line) +{ +    uint32_t pixel32 = image->fetch_pixel_32 (image, offset, line); +    argb_t f; + +    pixman_expand_to_float (&f, &pixel32, image->format, 1); + +    return f; +} + +/* + * XXX: The transformed fetch path only works at 32-bpp so far.  When all + * paths have wide versions, this can be removed. + * + * WARNING: This function loses precision! 
+ */ +static uint32_t +fetch_pixel_generic_lossy_32 (bits_image_t *image, +			      int           offset, +			      int           line) +{ +    argb_t pixel64 = image->fetch_pixel_float (image, offset, line); +    uint32_t result; + +    pixman_contract_from_float (&result, &pixel64, 1); + +    return result; +} + +typedef struct +{ +    pixman_format_code_t	format; +    fetch_scanline_t		fetch_scanline_32; +    fetch_scanline_t		fetch_scanline_float; +    fetch_pixel_32_t		fetch_pixel_32; +    fetch_pixel_float_t		fetch_pixel_float; +    store_scanline_t		store_scanline_32; +    store_scanline_t		store_scanline_float; +} format_info_t; + +#define FORMAT_INFO(format) 						\ +    {									\ +	PIXMAN_ ## format,						\ +	    fetch_scanline_ ## format,					\ +	    fetch_scanline_generic_float,				\ +	    fetch_pixel_ ## format,					\ +	    fetch_pixel_generic_float,					\ +	    store_scanline_ ## format,					\ +	    store_scanline_generic_float				\ +    } + +static const format_info_t accessors[] = +{ +/* 32 bpp formats */ +    FORMAT_INFO (a8r8g8b8), +    FORMAT_INFO (x8r8g8b8), +    FORMAT_INFO (a8b8g8r8), +    FORMAT_INFO (x8b8g8r8), +    FORMAT_INFO (b8g8r8a8), +    FORMAT_INFO (b8g8r8x8), +    FORMAT_INFO (r8g8b8a8), +    FORMAT_INFO (r8g8b8x8), +    FORMAT_INFO (x14r6g6b6), + +/* sRGB formats */ +  { PIXMAN_a8r8g8b8_sRGB, +    fetch_scanline_a8r8g8b8_32_sRGB, fetch_scanline_a8r8g8b8_sRGB_float, +    fetch_pixel_a8r8g8b8_32_sRGB, fetch_pixel_a8r8g8b8_sRGB_float, +    store_scanline_a8r8g8b8_32_sRGB, store_scanline_a8r8g8b8_sRGB_float, +  }, + +/* 24bpp formats */ +    FORMAT_INFO (r8g8b8), +    FORMAT_INFO (b8g8r8), +     +/* 16bpp formats */ +    FORMAT_INFO (r5g6b5), +    FORMAT_INFO (b5g6r5), +     +    FORMAT_INFO (a1r5g5b5), +    FORMAT_INFO (x1r5g5b5), +    FORMAT_INFO (a1b5g5r5), +    FORMAT_INFO (x1b5g5r5), +    FORMAT_INFO (a4r4g4b4), +    FORMAT_INFO (x4r4g4b4), +    FORMAT_INFO (a4b4g4r4), +    FORMAT_INFO (x4b4g4r4), +     +/* 8bpp formats */ +    FORMAT_INFO (a8), +    FORMAT_INFO (r3g3b2), +    FORMAT_INFO (b2g3r3), +    FORMAT_INFO (a2r2g2b2), +    FORMAT_INFO (a2b2g2r2), +     +    FORMAT_INFO (c8), +     +    FORMAT_INFO (g8), +     +#define fetch_scanline_x4c4 fetch_scanline_c8 +#define fetch_pixel_x4c4 fetch_pixel_c8 +#define store_scanline_x4c4 store_scanline_c8 +    FORMAT_INFO (x4c4), +     +#define fetch_scanline_x4g4 fetch_scanline_g8 +#define fetch_pixel_x4g4 fetch_pixel_g8 +#define store_scanline_x4g4 store_scanline_g8 +    FORMAT_INFO (x4g4), +     +    FORMAT_INFO (x4a4), +     +/* 4bpp formats */ +    FORMAT_INFO (a4), +    FORMAT_INFO (r1g2b1), +    FORMAT_INFO (b1g2r1), +    FORMAT_INFO (a1r1g1b1), +    FORMAT_INFO (a1b1g1r1), +     +    FORMAT_INFO (c4), +     +    FORMAT_INFO (g4), +     +/* 1bpp formats */ +    FORMAT_INFO (a1), +    FORMAT_INFO (g1), +     +/* Wide formats */ +#ifndef PIXMAN_FB_ACCESSORS +    { PIXMAN_rgba_float, +      NULL, fetch_scanline_rgbaf_float, +      fetch_pixel_generic_lossy_32, fetch_pixel_rgbaf_float, +      NULL, store_scanline_rgbaf_float }, + +    { PIXMAN_rgb_float, +      NULL, fetch_scanline_rgbf_float, +      fetch_pixel_generic_lossy_32, fetch_pixel_rgbf_float, +      NULL, store_scanline_rgbf_float }, +#endif + +    { PIXMAN_a2r10g10b10, +      NULL, fetch_scanline_a2r10g10b10_float, +      fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10_float, +      NULL, store_scanline_a2r10g10b10_float }, + +    { PIXMAN_x2r10g10b10, +      NULL, fetch_scanline_x2r10g10b10_float, +      fetch_pixel_generic_lossy_32, 
fetch_pixel_x2r10g10b10_float, +      NULL, store_scanline_x2r10g10b10_float }, + +    { PIXMAN_a2b10g10r10, +      NULL, fetch_scanline_a2b10g10r10_float, +      fetch_pixel_generic_lossy_32, fetch_pixel_a2b10g10r10_float, +      NULL, store_scanline_a2b10g10r10_float }, + +    { PIXMAN_x2b10g10r10, +      NULL, fetch_scanline_x2b10g10r10_float, +      fetch_pixel_generic_lossy_32, fetch_pixel_x2b10g10r10_float, +      NULL, store_scanline_x2b10g10r10_float }, + +/* YUV formats */ +    { PIXMAN_yuy2, +      fetch_scanline_yuy2, fetch_scanline_generic_float, +      fetch_pixel_yuy2, fetch_pixel_generic_float, +      NULL, NULL }, + +    { PIXMAN_yv12, +      fetch_scanline_yv12, fetch_scanline_generic_float, +      fetch_pixel_yv12, fetch_pixel_generic_float, +      NULL, NULL }, +     +    { PIXMAN_null }, +}; + +static void +setup_accessors (bits_image_t *image) +{ +    const format_info_t *info = accessors; +     +    while (info->format != PIXMAN_null) +    { +	if (info->format == image->format) +	{ +	    image->fetch_scanline_32 = info->fetch_scanline_32; +	    image->fetch_scanline_float = info->fetch_scanline_float; +	    image->fetch_pixel_32 = info->fetch_pixel_32; +	    image->fetch_pixel_float = info->fetch_pixel_float; +	    image->store_scanline_32 = info->store_scanline_32; +	    image->store_scanline_float = info->store_scanline_float; +	     +	    return; +	} +	 +	info++; +    } +} + +#ifndef PIXMAN_FB_ACCESSORS +void +_pixman_bits_image_setup_accessors_accessors (bits_image_t *image); + +void +_pixman_bits_image_setup_accessors (bits_image_t *image) +{ +    if (image->read_func || image->write_func) +	_pixman_bits_image_setup_accessors_accessors (image); +    else +	setup_accessors (image); +} + +#else + +void +_pixman_bits_image_setup_accessors_accessors (bits_image_t *image) +{ +    setup_accessors (image); +} + +#endif diff --git a/libs/pixman-0.40.0/pixman/pixman-accessor.h b/libs/pixman-0.40.0/pixman/pixman-accessor.h new file mode 100644 index 0000000..8e0b036 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-accessor.h @@ -0,0 +1,25 @@ +#ifdef PIXMAN_FB_ACCESSORS + +#define READ(img, ptr)							\ +    (((bits_image_t *)(img))->read_func ((ptr), sizeof(*(ptr)))) +#define WRITE(img, ptr,val)						\ +    (((bits_image_t *)(img))->write_func ((ptr), (val), sizeof (*(ptr)))) + +#define MEMSET_WRAPPED(img, dst, val, size)				\ +    do {								\ +	size_t _i;							\ +	uint8_t *_dst = (uint8_t*)(dst);				\ +	for(_i = 0; _i < (size_t) size; _i++) {				\ +	    WRITE((img), _dst +_i, (val));				\ +	}								\ +    } while (0) + +#else + +#define READ(img, ptr)		(*(ptr)) +#define WRITE(img, ptr, val)	(*(ptr) = (val)) +#define MEMSET_WRAPPED(img, dst, val, size)				\ +    memset(dst, val, size) + +#endif + diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-asm.h b/libs/pixman-0.40.0/pixman/pixman-arm-asm.h new file mode 100644 index 0000000..ee78541 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm-asm.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2008 Mozilla Corporation + * Copyright © 2010 Nokia Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Mozilla Corporation not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, 
written prior permission.  Mozilla Corporation makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author:  Jeff Muizelaar (jeff@infidigm.net) + * + */ + +/* Supplementary macro for setting function attributes */ +.macro pixman_asm_function fname +	.func fname +	.global fname +#ifdef __ELF__ +	.hidden fname +	.type fname, %function +#endif +fname: +.endm diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-common.h b/libs/pixman-0.40.0/pixman/pixman-arm-common.h new file mode 100644 index 0000000..9537688 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm-common.h @@ -0,0 +1,419 @@ +/* + * Copyright © 2010 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +#ifndef PIXMAN_ARM_COMMON_H +#define PIXMAN_ARM_COMMON_H + +#include "pixman-inlines.h" + +/* Define some macros which can expand into proxy functions between + * ARM assembly optimized functions and the rest of pixman fast path API. + * + * All the low level ARM assembly functions have to use ARM EABI + * calling convention and take up to 8 arguments: + *    width, height, dst, dst_stride, src, src_stride, mask, mask_stride + * + * The arguments are ordered with the most important coming first (the + * first 4 arguments are passed to function in registers, the rest are + * on stack). The last arguments are optional, for example if the + * function is not using mask, then 'mask' and 'mask_stride' can be + * omitted when doing a function call. + * + * Arguments 'src' and 'mask' contain either a pointer to the top left + * pixel of the composited rectangle or a pixel color value depending + * on the function type. In the case of just a color value (solid source + * or mask), the corresponding stride argument is unused. 
+ */ + +#define SKIP_ZERO_SRC  1 +#define SKIP_ZERO_MASK 2 + +#define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name,                \ +                                          src_type, src_cnt,            \ +                                          dst_type, dst_cnt)            \ +void                                                                    \ +pixman_composite_##name##_asm_##cputype (int32_t   w,                   \ +                                         int32_t   h,                   \ +                                         dst_type *dst,                 \ +                                         int32_t   dst_stride,          \ +                                         src_type *src,                 \ +                                         int32_t   src_stride);         \ +                                                                        \ +static void                                                             \ +cputype##_composite_##name (pixman_implementation_t *imp,               \ +                            pixman_composite_info_t *info)              \ +{                                                                       \ +    PIXMAN_COMPOSITE_ARGS (info);                                       \ +    dst_type *dst_line;							\ +    src_type *src_line;                                                 \ +    int32_t dst_stride, src_stride;                                     \ +                                                                        \ +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \ +                           src_stride, src_line, src_cnt);              \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \ +                           dst_stride, dst_line, dst_cnt);              \ +                                                                        \ +    pixman_composite_##name##_asm_##cputype (width, height,             \ +                                             dst_line, dst_stride,      \ +                                             src_line, src_stride);     \ +} + +#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(flags, cputype, name,           \ +                                        dst_type, dst_cnt)              \ +void                                                                    \ +pixman_composite_##name##_asm_##cputype (int32_t    w,                  \ +                                         int32_t    h,                  \ +                                         dst_type  *dst,                \ +                                         int32_t    dst_stride,         \ +                                         uint32_t   src);               \ +                                                                        \ +static void                                                             \ +cputype##_composite_##name (pixman_implementation_t *imp,               \ +			    pixman_composite_info_t *info)              \ +{                                                                       \ +    PIXMAN_COMPOSITE_ARGS (info);					\ +    dst_type  *dst_line;                                                \ +    int32_t    dst_stride;                                              \ +    uint32_t   src;                                                     \ +                                                                        \ +    src = _pixman_image_get_solid (					\ +	imp, src_image, dest_image->bits.format);			\ +                                            
                            \ +    if ((flags & SKIP_ZERO_SRC) && src == 0)                            \ +	return;                                                         \ +                                                                        \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \ +                           dst_stride, dst_line, dst_cnt);              \ +                                                                        \ +    pixman_composite_##name##_asm_##cputype (width, height,             \ +                                             dst_line, dst_stride,      \ +                                             src);                      \ +} + +#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(flags, cputype, name,      \ +                                             mask_type, mask_cnt,       \ +                                             dst_type, dst_cnt)         \ +void                                                                    \ +pixman_composite_##name##_asm_##cputype (int32_t    w,                  \ +                                         int32_t    h,                  \ +                                         dst_type  *dst,                \ +                                         int32_t    dst_stride,         \ +                                         uint32_t   src,                \ +                                         int32_t    unused,             \ +                                         mask_type *mask,               \ +                                         int32_t    mask_stride);       \ +                                                                        \ +static void                                                             \ +cputype##_composite_##name (pixman_implementation_t *imp,               \ +                            pixman_composite_info_t *info)              \ +{                                                                       \ +    PIXMAN_COMPOSITE_ARGS (info);                                       \ +    dst_type  *dst_line;						\ +    mask_type *mask_line;                                               \ +    int32_t    dst_stride, mask_stride;                                 \ +    uint32_t   src;                                                     \ +                                                                        \ +    src = _pixman_image_get_solid (					\ +	imp, src_image, dest_image->bits.format);			\ +                                                                        \ +    if ((flags & SKIP_ZERO_SRC) && src == 0)                            \ +	return;                                                         \ +                                                                        \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \ +                           dst_stride, dst_line, dst_cnt);              \ +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \ +                           mask_stride, mask_line, mask_cnt);           \ +                                                                        \ +    pixman_composite_##name##_asm_##cputype (width, height,             \ +                                             dst_line, dst_stride,      \ +                                             src, 0,                    \ +                                             mask_line, mask_stride);   \ +} + +#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(flags, cputype, name,       \ +                             
               src_type, src_cnt,          \ +                                            dst_type, dst_cnt)          \ +void                                                                    \ +pixman_composite_##name##_asm_##cputype (int32_t    w,                  \ +                                         int32_t    h,                  \ +                                         dst_type  *dst,                \ +                                         int32_t    dst_stride,         \ +                                         src_type  *src,                \ +                                         int32_t    src_stride,         \ +                                         uint32_t   mask);              \ +                                                                        \ +static void                                                             \ +cputype##_composite_##name (pixman_implementation_t *imp,               \ +                            pixman_composite_info_t *info)              \ +{                                                                       \ +    PIXMAN_COMPOSITE_ARGS (info);                                       \ +    dst_type  *dst_line;						\ +    src_type  *src_line;                                                \ +    int32_t    dst_stride, src_stride;                                  \ +    uint32_t   mask;                                                    \ +                                                                        \ +    mask = _pixman_image_get_solid (					\ +	imp, mask_image, dest_image->bits.format);			\ +                                                                        \ +    if ((flags & SKIP_ZERO_MASK) && mask == 0)                          \ +	return;                                                         \ +                                                                        \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \ +                           dst_stride, dst_line, dst_cnt);              \ +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \ +                           src_stride, src_line, src_cnt);              \ +                                                                        \ +    pixman_composite_##name##_asm_##cputype (width, height,             \ +                                             dst_line, dst_stride,      \ +                                             src_line, src_stride,      \ +                                             mask);                     \ +} + +#define PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST(cputype, name,           \ +                                               src_type, src_cnt,       \ +                                               mask_type, mask_cnt,     \ +                                               dst_type, dst_cnt)       \ +void                                                                    \ +pixman_composite_##name##_asm_##cputype (int32_t    w,                  \ +                                         int32_t    h,                  \ +                                         dst_type  *dst,                \ +                                         int32_t    dst_stride,         \ +                                         src_type  *src,                \ +                                         int32_t    src_stride,         \ +                                         mask_type *mask,               \ +                                         int32_t    mask_stride);       \ +          
                                                              \ +static void                                                             \ +cputype##_composite_##name (pixman_implementation_t *imp,               \ +                            pixman_composite_info_t *info)              \ +{                                                                       \ +    PIXMAN_COMPOSITE_ARGS (info);                                       \ +    dst_type  *dst_line;						\ +    src_type  *src_line;                                                \ +    mask_type *mask_line;                                               \ +    int32_t    dst_stride, src_stride, mask_stride;                     \ +                                                                        \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \ +                           dst_stride, dst_line, dst_cnt);              \ +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \ +                           src_stride, src_line, src_cnt);              \ +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \ +                           mask_stride, mask_line, mask_cnt);           \ +                                                                        \ +    pixman_composite_##name##_asm_##cputype (width, height,             \ +                                             dst_line, dst_stride,      \ +                                             src_line, src_stride,      \ +                                             mask_line, mask_stride);   \ +} + +#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op,             \ +                                               src_type, dst_type)            \ +void                                                                          \ +pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \ +                                                   int32_t          w,        \ +                                                   dst_type *       dst,      \ +                                                   const src_type * src,      \ +                                                   pixman_fixed_t   vx,       \ +                                                   pixman_fixed_t   unit_x,   \ +                                                   pixman_fixed_t   max_vx);  \ +                                                                              \ +static force_inline void                                                      \ +scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \ +                                                   const src_type * ps,       \ +                                                   int32_t          w,        \ +                                                   pixman_fixed_t   vx,       \ +                                                   pixman_fixed_t   unit_x,   \ +                                                   pixman_fixed_t   max_vx,   \ +                                                   pixman_bool_t    zero_src) \ +{                                                                             \ +    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \ +                                                                  vx, unit_x, \ +                                                                  max_vx);    \ +}                                                                             \ +        
                                                                      \ +FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \ +                       scaled_nearest_scanline_##cputype##_##name##_##op,     \ +                       src_type, dst_type, COVER)                             \ +FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \ +                       scaled_nearest_scanline_##cputype##_##name##_##op,     \ +                       src_type, dst_type, NONE)                              \ +FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \ +                       scaled_nearest_scanline_##cputype##_##name##_##op,     \ +                       src_type, dst_type, PAD)                               \ +FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op,                        \ +                       scaled_nearest_scanline_##cputype##_##name##_##op,     \ +                       src_type, dst_type, NORMAL) + +#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \ +                                                  src_type, dst_type)         \ +void                                                                          \ +pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \ +                                                   int32_t          w,        \ +                                                   dst_type *       dst,      \ +                                                   const src_type * src,      \ +                                                   pixman_fixed_t   vx,       \ +                                                   pixman_fixed_t   unit_x,   \ +                                                   pixman_fixed_t   max_vx,   \ +                                                   const uint8_t *  mask);    \ +                                                                              \ +static force_inline void                                                      \ +scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t *  mask,     \ +                                                   dst_type *       pd,       \ +                                                   const src_type * ps,       \ +                                                   int32_t          w,        \ +                                                   pixman_fixed_t   vx,       \ +                                                   pixman_fixed_t   unit_x,   \ +                                                   pixman_fixed_t   max_vx,   \ +                                                   pixman_bool_t    zero_src) \ +{                                                                             \ +    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \ +	return;                                                               \ +    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \ +                                                                  vx, unit_x, \ +                                                                  max_vx,     \ +                                                                  mask);      \ +}                                                                             \ +                                                                              \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                  \ +                              
scaled_nearest_scanline_##cputype##_##name##_##op,\ +                              src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op,                   \ +                              scaled_nearest_scanline_##cputype##_##name##_##op,\ +                              src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \ +                              scaled_nearest_scanline_##cputype##_##name##_##op,\ +                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)  \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                 \ +                              scaled_nearest_scanline_##cputype##_##name##_##op,\ +                              src_type, uint8_t, dst_type, NORMAL, TRUE, FALSE) + +/* Provide entries for the fast path table */ +#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \ +    SIMPLE_NEAREST_A8_MASK_FAST_PATH (op,s,d,func),                           \ +    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func) + +/*****************************************************************************/ + +#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op,     \ +                                                src_type, dst_type)           \ +void                                                                          \ +pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \ +                                                dst_type *       dst,         \ +                                                const src_type * top,         \ +                                                const src_type * bottom,      \ +                                                int              wt,          \ +                                                int              wb,          \ +                                                pixman_fixed_t   x,           \ +                                                pixman_fixed_t   ux,          \ +                                                int              width);      \ +                                                                              \ +static force_inline void                                                      \ +scaled_bilinear_scanline_##cputype##_##name##_##op (                          \ +                                                dst_type *       dst,         \ +                                                const uint32_t * mask,        \ +                                                const src_type * src_top,     \ +                                                const src_type * src_bottom,  \ +                                                int32_t          w,           \ +                                                int              wt,          \ +                                                int              wb,          \ +                                                pixman_fixed_t   vx,          \ +                                                pixman_fixed_t   unit_x,      \ +                                                pixman_fixed_t   max_vx,      \ +                                                pixman_bool_t    zero_src)    \ +{                                                                             \ +    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \ +	return;                                                       
        \ +    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \ +                            dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \ +}                                                                             \ +                                                                              \ +FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \ +                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \ +                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)        \ +FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \ +                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \ +                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)         \ +FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \ +                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \ +                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)          \ +FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \ +                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \ +                       src_type, uint32_t, dst_type, NORMAL,                  \ +                       FLAG_NONE) + + +#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op,  \ +                                                src_type, dst_type)           \ +void                                                                          \ +pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \ +                                                dst_type *       dst,         \ +                                                const uint8_t *  mask,        \ +                                                const src_type * top,         \ +                                                const src_type * bottom,      \ +                                                int              wt,          \ +                                                int              wb,          \ +                                                pixman_fixed_t   x,           \ +                                                pixman_fixed_t   ux,          \ +                                                int              width);      \ +                                                                              \ +static force_inline void                                                      \ +scaled_bilinear_scanline_##cputype##_##name##_##op (                          \ +                                                dst_type *       dst,         \ +                                                const uint8_t *  mask,        \ +                                                const src_type * src_top,     \ +                                                const src_type * src_bottom,  \ +                                                int32_t          w,           \ +                                                int              wt,          \ +                                                int              wb,          \ +                                                pixman_fixed_t   vx,          \ +                                                pixman_fixed_t   unit_x,      \ +                                                pixman_fixed_t   max_vx,      \ +                                                pixman_bool_t    zero_src)    \ +{                     
                                                        \ +    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \ +	return;                                                                   \ +    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \ +                      dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \ +}                                                                             \ +                                                                              \ +FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \ +                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \ +                       src_type, uint8_t, dst_type, COVER,                    \ +                       FLAG_HAVE_NON_SOLID_MASK)                              \ +FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \ +                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \ +                       src_type, uint8_t, dst_type, NONE,                     \ +                       FLAG_HAVE_NON_SOLID_MASK)                              \ +FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \ +                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \ +                       src_type, uint8_t, dst_type, PAD,                      \ +                       FLAG_HAVE_NON_SOLID_MASK)                              \ +FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \ +                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \ +                       src_type, uint8_t, dst_type, NORMAL,                   \ +                       FLAG_HAVE_NON_SOLID_MASK) + + +#endif diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-neon-asm-bilinear.S b/libs/pixman-0.40.0/pixman/pixman-arm-neon-asm-bilinear.S new file mode 100644 index 0000000..0fd92d6 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm-neon-asm-bilinear.S @@ -0,0 +1,1358 @@ +/* + * Copyright © 2011 SCore Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com) + * Author:  Taekyun Kim (tkq.kim@samsung.com) + */ + +/* + * This file contains scaled bilinear scanline functions implemented + * using older siarhei's bilinear macro template. 
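+ * (Roughly, each output pixel is the textbook bilinear blend
+ *  p = tl*(1-fy)*(1-fx) + tr*(1-fy)*fx + bl*fy*(1-fx) + br*fy*fx,
+ *  with the vertical weights wt/wb standing in for (1-fy)/fy and the
+ *  horizontal weight fx taken from the fractional part of the 16.16
+ *  fixed-point X coordinate, reduced to BILINEAR_INTERPOLATION_BITS.)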
+ * + * << General scanline function procedures >> + *  1. bilinear interpolate source pixels + *  2. load mask pixels + *  3. load destination pixels + *  4. duplicate mask to fill whole register + *  5. interleave source & destination pixels + *  6. apply mask to source pixels + *  7. combine source & destination pixels + *  8, Deinterleave final result + *  9. store destination pixels + * + * All registers with single number (i.e. src0, tmp0) are 64-bits registers. + * Registers with double numbers(src01, dst01) are 128-bits registers. + * All temp registers can be used freely outside the code block. + * Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks. + * + * Remarks + *  There can be lots of pipeline stalls inside code block and between code blocks. + *  Further optimizations will be done by new macro templates using head/tail_head/tail scheme. + */ + +/* Prevent the stack from becoming executable for no reason... */ +#if defined(__linux__) && defined (__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +.text +.fpu neon +.arch armv7a +.object_arch armv4 +.eabi_attribute 10, 0 +.eabi_attribute 12, 0 +.arm +.altmacro +.p2align 2 + +#include "pixman-private.h" +#include "pixman-arm-asm.h" +#include "pixman-arm-neon-asm.h" + +/* + * Bilinear macros from pixman-arm-neon-asm.S + */ + +/* + * Bilinear scaling support code which tries to provide pixel fetching, color + * format conversion, and interpolation as separate macros which can be used + * as the basic building blocks for constructing bilinear scanline functions. + */ + +.macro bilinear_load_8888 reg1, reg2, tmp +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #2 +    vld1.32   {reg1}, [TMP1], STRIDE +    vld1.32   {reg2}, [TMP1] +.endm + +.macro bilinear_load_0565 reg1, reg2, tmp +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #1 +    vld1.32   {reg2[0]}, [TMP1], STRIDE +    vld1.32   {reg2[1]}, [TMP1] +    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp +.endm + +.macro bilinear_load_and_vertical_interpolate_two_8888 \ +                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 + +    bilinear_load_8888 reg1, reg2, tmp1 +    vmull.u8  acc1, reg1, d28 +    vmlal.u8  acc1, reg2, d29 +    bilinear_load_8888 reg3, reg4, tmp2 +    vmull.u8  acc2, reg3, d28 +    vmlal.u8  acc2, reg4, d29 +.endm + +.macro bilinear_load_and_vertical_interpolate_four_8888 \ +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + +    bilinear_load_and_vertical_interpolate_two_8888 \ +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi +    bilinear_load_and_vertical_interpolate_two_8888 \ +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi +.endm + +.macro bilinear_load_and_vertical_interpolate_two_0565 \ +                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi + +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #1 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #1 +    vld1.32   {acc2lo[0]}, [TMP1], STRIDE +    vld1.32   {acc2hi[0]}, [TMP2], STRIDE +    vld1.32   {acc2lo[1]}, [TMP1] +    vld1.32   {acc2hi[1]}, [TMP2] +    convert_0565_to_x888 acc2, reg3, reg2, reg1 +    vzip.u8   reg1, reg3 +    vzip.u8   reg2, reg4 +    vzip.u8   reg3, reg4 +    vzip.u8   reg1, reg2 + 
   vmull.u8  acc1, reg1, d28 +    vmlal.u8  acc1, reg2, d29 +    vmull.u8  acc2, reg3, d28 +    vmlal.u8  acc2, reg4, d29 +.endm + +.macro bilinear_load_and_vertical_interpolate_four_0565 \ +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #1 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #1 +    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE +    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE +    vld1.32   {xacc2lo[1]}, [TMP1] +    vld1.32   {xacc2hi[1]}, [TMP2] +    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #1 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #1 +    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE +    vzip.u8   xreg1, xreg3 +    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE +    vzip.u8   xreg2, xreg4 +    vld1.32   {yacc2lo[1]}, [TMP1] +    vzip.u8   xreg3, xreg4 +    vld1.32   {yacc2hi[1]}, [TMP2] +    vzip.u8   xreg1, xreg2 +    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 +    vmull.u8  xacc1, xreg1, d28 +    vzip.u8   yreg1, yreg3 +    vmlal.u8  xacc1, xreg2, d29 +    vzip.u8   yreg2, yreg4 +    vmull.u8  xacc2, xreg3, d28 +    vzip.u8   yreg3, yreg4 +    vmlal.u8  xacc2, xreg4, d29 +    vzip.u8   yreg1, yreg2 +    vmull.u8  yacc1, yreg1, d28 +    vmlal.u8  yacc1, yreg2, d29 +    vmull.u8  yacc2, yreg3, d28 +    vmlal.u8  yacc2, yreg4, d29 +.endm + +.macro bilinear_store_8888 numpix, tmp1, tmp2 +.if numpix == 4 +    vst1.32   {d0, d1}, [OUT]! +.elseif numpix == 2 +    vst1.32   {d0}, [OUT]! +.elseif numpix == 1 +    vst1.32   {d0[0]}, [OUT, :32]! +.else +    .error bilinear_store_8888 numpix is unsupported +.endif +.endm + +.macro bilinear_store_0565 numpix, tmp1, tmp2 +    vuzp.u8 d0, d1 +    vuzp.u8 d2, d3 +    vuzp.u8 d1, d3 +    vuzp.u8 d0, d2 +    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 +.if numpix == 4 +    vst1.16   {d2}, [OUT]! +.elseif numpix == 2 +    vst1.32   {d2[0]}, [OUT]! +.elseif numpix == 1 +    vst1.16   {d2[0]}, [OUT]! +.else +    .error bilinear_store_0565 numpix is unsupported +.endif +.endm + + +/* + * Macros for loading mask pixels into register 'mask'. + * vdup must be done in somewhere else. + */ +.macro bilinear_load_mask_x numpix, mask +.endm + +.macro bilinear_load_mask_8 numpix, mask +.if numpix == 4 +    vld1.32     {mask[0]}, [MASK]! +.elseif numpix == 2 +    vld1.16     {mask[0]}, [MASK]! +.elseif numpix == 1 +    vld1.8      {mask[0]}, [MASK]! +.else +    .error bilinear_load_mask_8 numpix is unsupported +.endif +    pld         [MASK, #prefetch_offset] +.endm + +.macro bilinear_load_mask mask_fmt, numpix, mask +    bilinear_load_mask_&mask_fmt numpix, mask +.endm + + +/* + * Macros for loading destination pixels into register 'dst0' and 'dst1'. + * Interleave should be done somewhere else. 
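+ * For the 'src' operator these loads are left empty on purpose: the
+ * destination is completely overwritten, so it never has to be read.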
+ */ +.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01 +.endm + +.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 +.endm + +.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 +.if numpix == 4 +    vld1.32     {dst0, dst1}, [OUT] +.elseif numpix == 2 +    vld1.32     {dst0}, [OUT] +.elseif numpix == 1 +    vld1.32     {dst0[0]}, [OUT] +.else +    .error bilinear_load_dst_8888 numpix is unsupported +.endif +    pld         [OUT, #(prefetch_offset * 4)] +.endm + +.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 +    bilinear_load_dst_8888 numpix, dst0, dst1, dst01 +.endm + +.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01 +    bilinear_load_dst_8888 numpix, dst0, dst1, dst01 +.endm + +.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 +    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01 +.endm + +/* + * Macros for duplicating partially loaded mask to fill entire register. + * We will apply mask to interleaved source pixels, that is + *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3) + *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3) + * So, we need to duplicate loaded mask into whole register. + * + * For two pixel case + *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) + *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) + * We can do some optimizations for this including last pixel cases. + */ +.macro bilinear_duplicate_mask_x numpix, mask +.endm + +.macro bilinear_duplicate_mask_8 numpix, mask +.if numpix == 4 +    vdup.32     mask, mask[0] +.elseif numpix == 2 +    vdup.16     mask, mask[0] +.elseif numpix == 1 +    vdup.8      mask, mask[0] +.else +    .error bilinear_duplicate_mask_8 is unsupported +.endif +.endm + +.macro bilinear_duplicate_mask mask_fmt, numpix, mask +    bilinear_duplicate_mask_&mask_fmt numpix, mask +.endm + +/* + * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. + * Interleave should be done when maks is enabled or operator is 'over'. + */ +.macro bilinear_interleave src0, src1, dst0, dst1 +    vuzp.8      src0, src1 +    vuzp.8      dst0, dst1 +    vuzp.8      src0, src1 +    vuzp.8      dst0, dst1 +.endm + +.macro bilinear_interleave_src_dst_x_src \ +                numpix, src0, src1, src01, dst0, dst1, dst01 +.endm + +.macro bilinear_interleave_src_dst_x_over \ +                numpix, src0, src1, src01, dst0, dst1, dst01 + +    bilinear_interleave src0, src1, dst0, dst1 +.endm + +.macro bilinear_interleave_src_dst_x_add \ +                numpix, src0, src1, src01, dst0, dst1, dst01 +.endm + +.macro bilinear_interleave_src_dst_8_src \ +                numpix, src0, src1, src01, dst0, dst1, dst01 + +    bilinear_interleave src0, src1, dst0, dst1 +.endm + +.macro bilinear_interleave_src_dst_8_over \ +                numpix, src0, src1, src01, dst0, dst1, dst01 + +    bilinear_interleave src0, src1, dst0, dst1 +.endm + +.macro bilinear_interleave_src_dst_8_add \ +                numpix, src0, src1, src01, dst0, dst1, dst01 + +    bilinear_interleave src0, src1, dst0, dst1 +.endm + +.macro bilinear_interleave_src_dst \ +                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 + +    bilinear_interleave_src_dst_&mask_fmt&_&op \ +                numpix, src0, src1, src01, dst0, dst1, dst01 +.endm + + +/* + * Macros for applying masks to src pixels. (see combine_mask_u() function) + * src, dst should be in interleaved form. + * mask register should be in form (m0, m1, m2, m3). 
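+ * Each channel group is multiplied by the mask and renormalized with the
+ * usual divide-by-255 rounding:
+ *   t = c * m;  c' = (t + ((t + 128) >> 8) + 128) >> 8
+ * which is what the vmull/vrshr/vraddhn sequence below computes.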
+ */ +.macro bilinear_apply_mask_to_src_x \ +                numpix, src0, src1, src01, mask, \ +                tmp01, tmp23, tmp45, tmp67 +.endm + +.macro bilinear_apply_mask_to_src_8 \ +                numpix, src0, src1, src01, mask, \ +                tmp01, tmp23, tmp45, tmp67 + +    vmull.u8        tmp01, src0, mask +    vmull.u8        tmp23, src1, mask +    /* bubbles */ +    vrshr.u16       tmp45, tmp01, #8 +    vrshr.u16       tmp67, tmp23, #8 +    /* bubbles */ +    vraddhn.u16     src0, tmp45, tmp01 +    vraddhn.u16     src1, tmp67, tmp23 +.endm + +.macro bilinear_apply_mask_to_src \ +                mask_fmt, numpix, src0, src1, src01, mask, \ +                tmp01, tmp23, tmp45, tmp67 + +    bilinear_apply_mask_to_src_&mask_fmt \ +                numpix, src0, src1, src01, mask, \ +                tmp01, tmp23, tmp45, tmp67 +.endm + + +/* + * Macros for combining src and destination pixels. + * Interleave or not is depending on operator 'op'. + */ +.macro bilinear_combine_src \ +                numpix, src0, src1, src01, dst0, dst1, dst01, \ +                tmp01, tmp23, tmp45, tmp67, tmp8 +.endm + +.macro bilinear_combine_over \ +                numpix, src0, src1, src01, dst0, dst1, dst01, \ +                tmp01, tmp23, tmp45, tmp67, tmp8 + +    vdup.32     tmp8, src1[1] +    /* bubbles */ +    vmvn.8      tmp8, tmp8 +    /* bubbles */ +    vmull.u8    tmp01, dst0, tmp8 +    /* bubbles */ +    vmull.u8    tmp23, dst1, tmp8 +    /* bubbles */ +    vrshr.u16   tmp45, tmp01, #8 +    vrshr.u16   tmp67, tmp23, #8 +    /* bubbles */ +    vraddhn.u16 dst0, tmp45, tmp01 +    vraddhn.u16 dst1, tmp67, tmp23 +    /* bubbles */ +    vqadd.u8    src01, dst01, src01 +.endm + +.macro bilinear_combine_add \ +                numpix, src0, src1, src01, dst0, dst1, dst01, \ +                tmp01, tmp23, tmp45, tmp67, tmp8 + +    vqadd.u8    src01, dst01, src01 +.endm + +.macro bilinear_combine \ +                op, numpix, src0, src1, src01, dst0, dst1, dst01, \ +                tmp01, tmp23, tmp45, tmp67, tmp8 + +    bilinear_combine_&op \ +                numpix, src0, src1, src01, dst0, dst1, dst01, \ +                tmp01, tmp23, tmp45, tmp67, tmp8 +.endm + +/* + * Macros for final deinterleaving of destination pixels if needed. 
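+ * This undoes the earlier vuzp-based interleave so the pixels end up in
+ * normal memory byte order again before being stored; variants that never
+ * interleaved in the first place are left empty.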
+ */ +.macro bilinear_deinterleave numpix, dst0, dst1, dst01 +    vuzp.8      dst0, dst1 +    /* bubbles */ +    vuzp.8      dst0, dst1 +.endm + +.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 +.endm + +.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 +    bilinear_deinterleave numpix, dst0, dst1, dst01 +.endm + +.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 +.endm + +.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 +    bilinear_deinterleave numpix, dst0, dst1, dst01 +.endm + +.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01 +    bilinear_deinterleave numpix, dst0, dst1, dst01 +.endm + +.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01 +    bilinear_deinterleave numpix, dst0, dst1, dst01 +.endm + +.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01 +    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01 +.endm + + +.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op +    bilinear_load_&src_fmt d0, d1, d2 +    bilinear_load_mask mask_fmt, 1, d4 +    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9 +    vmull.u8  q1, d0, d28 +    vmlal.u8  q1, d1, d29 +    /* 5 cycles bubble */ +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q0, d2, d30 +    vmlal.u16 q0, d3, d30 +    /* 5 cycles bubble */ +    bilinear_duplicate_mask mask_fmt, 1, d4 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    /* 3 cycles bubble */ +    vmovn.u16 d0, q0 +    /* 1 cycle bubble */ +    bilinear_interleave_src_dst \ +                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9 +    bilinear_apply_mask_to_src \ +                mask_fmt, 1, d0, d1, q0, d4, \ +                q3, q8, q10, q11 +    bilinear_combine \ +                op, 1, d0, d1, q0, d18, d19, q9, \ +                q3, q8, q10, q11, d5 +    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0 +    bilinear_store_&dst_fmt 1, q2, q3 +.endm + +.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op +    bilinear_load_and_vertical_interpolate_two_&src_fmt \ +                q1, q11, d0, d1, d20, d21, d22, d23 +    bilinear_load_mask mask_fmt, 2, d4 +    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q0, d2, d30 +    vmlal.u16 q0, d3, d30 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q10, d22, d31 +    vmlal.u16 q10, d23, d31 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) +    bilinear_duplicate_mask mask_fmt, 2, d4 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vadd.u16  q12, q12, q13 +    vmovn.u16 d0, q0 +    bilinear_interleave_src_dst \ +                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9 +    bilinear_apply_mask_to_src \ +                mask_fmt, 2, d0, d1, q0, d4, \ +                q3, q8, q10, q11 +    bilinear_combine \ +                op, 2, d0, d1, q0, d18, d19, q9, \ +                q3, q8, q10, q11, d5 +    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0 +    bilinear_store_&dst_fmt 2, q2, q3 +.endm + +.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op +    bilinear_load_and_vertical_interpolate_four_&src_fmt \ +                q1, q11, d0, d1, d20, d21, d22, d23 \ +                q3, q9,  d4, d5, d16, d17, d18, d19 +    pld       [TMP1, PF_OFFS] +    sub       TMP1, TMP1, STRIDE +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS +    
vmlsl.u16 q0, d2, d30 +    vmlal.u16 q0, d3, d30 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q10, d22, d31 +    vmlal.u16 q10, d23, d31 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q2, d6, d30 +    vmlal.u16 q2, d7, d30 +    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS +    bilinear_load_mask mask_fmt, 4, d22 +    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1 +    pld       [TMP1, PF_OFFS] +    vmlsl.u16 q8, d18, d31 +    vmlal.u16 q8, d19, d31 +    vadd.u16  q12, q12, q13 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) +    bilinear_duplicate_mask mask_fmt, 4, d22 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vmovn.u16 d0, q0 +    vmovn.u16 d1, q2 +    vadd.u16  q12, q12, q13 +    bilinear_interleave_src_dst \ +                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1 +    bilinear_apply_mask_to_src \ +                mask_fmt, 4, d0, d1, q0, d22, \ +                q3, q8, q9, q10 +    bilinear_combine \ +                op, 4, d0, d1, q0, d2, d3, q1, \ +                q3, q8, q9, q10, d23 +    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0 +    bilinear_store_&dst_fmt 4, q2, q3 +.endm + +.set BILINEAR_FLAG_USE_MASK,		1 +.set BILINEAR_FLAG_USE_ALL_NEON_REGS,	2 + +/* + * Main template macro for generating NEON optimized bilinear scanline functions. + * + * Bilinear scanline generator macro take folling arguments: + *  fname			- name of the function to generate + *  src_fmt			- source color format (8888 or 0565) + *  dst_fmt			- destination color format (8888 or 0565) + *  src/dst_bpp_shift		- (1 << bpp_shift) is the size of src/dst pixel in bytes + *  process_last_pixel		- code block that interpolate one pixel and does not + *				  update horizontal weight + *  process_two_pixels		- code block that interpolate two pixels and update + *				  horizontal weight + *  process_four_pixels		- code block that interpolate four pixels and update + *				  horizontal weight + *  process_pixblock_head	- head part of middle loop + *  process_pixblock_tail	- tail part of middle loop + *  process_pixblock_tail_head	- tail_head of middle loop + *  pixblock_size		- number of pixels processed in a single middle loop + *  prefetch_distance		- prefetch in the source image by that many pixels ahead + */ + +.macro generate_bilinear_scanline_func \ +	fname, \ +	src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \ +	bilinear_process_last_pixel, \ +	bilinear_process_two_pixels, \ +	bilinear_process_four_pixels, \ +	bilinear_process_pixblock_head, \ +	bilinear_process_pixblock_tail, \ +	bilinear_process_pixblock_tail_head, \ +	pixblock_size, \ +	prefetch_distance, \ +	flags + +pixman_asm_function fname +.if pixblock_size == 8 +.elseif pixblock_size == 4 +.else +    .error unsupported pixblock size +.endif + +.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 +    OUT       .req    r0 +    TOP       .req    r1 +    BOTTOM    .req    r2 +    WT        .req    r3 +    WB        .req    r4 +    X         .req    r5 +    UX        .req    r6 +    WIDTH     .req    ip +    TMP1      .req    r3 +    TMP2      .req    r4 +    PF_OFFS   .req    r7 +    TMP3      .req    r8 +    TMP4      .req    r9 +    STRIDE    .req    r2 + +    mov		ip, sp +    push	{r4, r5, r6, r7, r8, r9} +    mov		PF_OFFS, #prefetch_distance +    
ldmia	ip, {WB, X, UX, WIDTH} +.else +    OUT       .req      r0 +    MASK      .req      r1 +    TOP       .req      r2 +    BOTTOM    .req      r3 +    WT        .req      r4 +    WB        .req      r5 +    X         .req      r6 +    UX        .req      r7 +    WIDTH     .req      ip +    TMP1      .req      r4 +    TMP2      .req      r5 +    PF_OFFS   .req      r8 +    TMP3      .req      r9 +    TMP4      .req      r10 +    STRIDE    .req      r3 + +    .set prefetch_offset, prefetch_distance + +    mov       ip, sp +    push      {r4, r5, r6, r7, r8, r9, r10, ip} +    mov       PF_OFFS, #prefetch_distance +    ldmia     ip, {WT, WB, X, UX, WIDTH} +.endif + +    mul       PF_OFFS, PF_OFFS, UX + +.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 +    vpush     {d8-d15} +.endif + +    sub	      STRIDE, BOTTOM, TOP +    .unreq    BOTTOM + +    cmp       WIDTH, #0 +    ble       3f + +    vdup.u16  q12, X +    vdup.u16  q13, UX +    vdup.u8   d28, WT +    vdup.u8   d29, WB +    vadd.u16  d25, d25, d26 + +    /* ensure good destination alignment  */ +    cmp       WIDTH, #1 +    blt       0f +    tst       OUT, #(1 << dst_bpp_shift) +    beq       0f +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vadd.u16  q12, q12, q13 +    bilinear_process_last_pixel +    sub       WIDTH, WIDTH, #1 +0: +    vadd.u16  q13, q13, q13 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vadd.u16  q12, q12, q13 + +    cmp       WIDTH, #2 +    blt       0f +    tst       OUT, #(1 << (dst_bpp_shift + 1)) +    beq       0f +    bilinear_process_two_pixels +    sub       WIDTH, WIDTH, #2 +0: +.if pixblock_size == 8 +    cmp       WIDTH, #4 +    blt       0f +    tst       OUT, #(1 << (dst_bpp_shift + 2)) +    beq       0f +    bilinear_process_four_pixels +    sub       WIDTH, WIDTH, #4 +0: +.endif +    subs      WIDTH, WIDTH, #pixblock_size +    blt       1f +    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) +    bilinear_process_pixblock_head +    subs      WIDTH, WIDTH, #pixblock_size +    blt       5f +0: +    bilinear_process_pixblock_tail_head +    subs      WIDTH, WIDTH, #pixblock_size +    bge       0b +5: +    bilinear_process_pixblock_tail +1: +.if pixblock_size == 8 +    tst       WIDTH, #4 +    beq       2f +    bilinear_process_four_pixels +2: +.endif +    /* handle the remaining trailing pixels */ +    tst       WIDTH, #2 +    beq       2f +    bilinear_process_two_pixels +2: +    tst       WIDTH, #1 +    beq       3f +    bilinear_process_last_pixel +3: +.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 +    vpop      {d8-d15} +.endif + +.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 +    pop       {r4, r5, r6, r7, r8, r9} +.else +    pop       {r4, r5, r6, r7, r8, r9, r10, ip} +.endif +    bx        lr + +    .unreq    OUT +    .unreq    TOP +    .unreq    WT +    .unreq    WB +    .unreq    X +    .unreq    UX +    .unreq    WIDTH +    .unreq    TMP1 +    .unreq    TMP2 +    .unreq    PF_OFFS +    .unreq    TMP3 +    .unreq    TMP4 +    .unreq    STRIDE +.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0 +    .unreq    MASK +.endif + +.endfunc + +.endm + +/* src_8888_8_8888 */ +.macro bilinear_src_8888_8_8888_process_last_pixel +    bilinear_interpolate_last_pixel 8888, 8, 8888, src +.endm + +.macro bilinear_src_8888_8_8888_process_two_pixels +    bilinear_interpolate_two_pixels 8888, 8, 8888, src +.endm + +.macro bilinear_src_8888_8_8888_process_four_pixels +    bilinear_interpolate_four_pixels 8888, 8, 8888, src +.endm + +.macro 
bilinear_src_8888_8_8888_process_pixblock_head +    bilinear_src_8888_8_8888_process_four_pixels +.endm + +.macro bilinear_src_8888_8_8888_process_pixblock_tail +.endm + +.macro bilinear_src_8888_8_8888_process_pixblock_tail_head +    bilinear_src_8888_8_8888_process_pixblock_tail +    bilinear_src_8888_8_8888_process_pixblock_head +.endm + +/* src_8888_8_0565 */ +.macro bilinear_src_8888_8_0565_process_last_pixel +    bilinear_interpolate_last_pixel 8888, 8, 0565, src +.endm + +.macro bilinear_src_8888_8_0565_process_two_pixels +    bilinear_interpolate_two_pixels 8888, 8, 0565, src +.endm + +.macro bilinear_src_8888_8_0565_process_four_pixels +    bilinear_interpolate_four_pixels 8888, 8, 0565, src +.endm + +.macro bilinear_src_8888_8_0565_process_pixblock_head +    bilinear_src_8888_8_0565_process_four_pixels +.endm + +.macro bilinear_src_8888_8_0565_process_pixblock_tail +.endm + +.macro bilinear_src_8888_8_0565_process_pixblock_tail_head +    bilinear_src_8888_8_0565_process_pixblock_tail +    bilinear_src_8888_8_0565_process_pixblock_head +.endm + +/* src_0565_8_x888 */ +.macro bilinear_src_0565_8_x888_process_last_pixel +    bilinear_interpolate_last_pixel 0565, 8, 8888, src +.endm + +.macro bilinear_src_0565_8_x888_process_two_pixels +    bilinear_interpolate_two_pixels 0565, 8, 8888, src +.endm + +.macro bilinear_src_0565_8_x888_process_four_pixels +    bilinear_interpolate_four_pixels 0565, 8, 8888, src +.endm + +.macro bilinear_src_0565_8_x888_process_pixblock_head +    bilinear_src_0565_8_x888_process_four_pixels +.endm + +.macro bilinear_src_0565_8_x888_process_pixblock_tail +.endm + +.macro bilinear_src_0565_8_x888_process_pixblock_tail_head +    bilinear_src_0565_8_x888_process_pixblock_tail +    bilinear_src_0565_8_x888_process_pixblock_head +.endm + +/* src_0565_8_0565 */ +.macro bilinear_src_0565_8_0565_process_last_pixel +    bilinear_interpolate_last_pixel 0565, 8, 0565, src +.endm + +.macro bilinear_src_0565_8_0565_process_two_pixels +    bilinear_interpolate_two_pixels 0565, 8, 0565, src +.endm + +.macro bilinear_src_0565_8_0565_process_four_pixels +    bilinear_interpolate_four_pixels 0565, 8, 0565, src +.endm + +.macro bilinear_src_0565_8_0565_process_pixblock_head +    bilinear_src_0565_8_0565_process_four_pixels +.endm + +.macro bilinear_src_0565_8_0565_process_pixblock_tail +.endm + +.macro bilinear_src_0565_8_0565_process_pixblock_tail_head +    bilinear_src_0565_8_0565_process_pixblock_tail +    bilinear_src_0565_8_0565_process_pixblock_head +.endm + +/* over_8888_8888 */ +.macro bilinear_over_8888_8888_process_last_pixel +    bilinear_interpolate_last_pixel 8888, x, 8888, over +.endm + +.macro bilinear_over_8888_8888_process_two_pixels +    bilinear_interpolate_two_pixels 8888, x, 8888, over +.endm + +.macro bilinear_over_8888_8888_process_four_pixels +    bilinear_interpolate_four_pixels 8888, x, 8888, over +.endm + +.macro bilinear_over_8888_8888_process_pixblock_head +    mov         TMP1, X, asr #16 +    add         X, X, UX +    add         TMP1, TOP, TMP1, asl #2 +    mov         TMP2, X, asr #16 +    add         X, X, UX +    add         TMP2, TOP, TMP2, asl #2 + +    vld1.32     {d22}, [TMP1], STRIDE +    vld1.32     {d23}, [TMP1] +    mov         TMP3, X, asr #16 +    add         X, X, UX +    add         TMP3, TOP, TMP3, asl #2 +    vmull.u8    q8, d22, d28 +    vmlal.u8    q8, d23, d29 + +    vld1.32     {d22}, [TMP2], STRIDE +    vld1.32     {d23}, [TMP2] +    mov         TMP4, X, asr #16 +    add         X, X, UX +    add         TMP4, TOP, TMP4, 
asl #2 +    vmull.u8    q9, d22, d28 +    vmlal.u8    q9, d23, d29 + +    vld1.32     {d22}, [TMP3], STRIDE +    vld1.32     {d23}, [TMP3] +    vmull.u8    q10, d22, d28 +    vmlal.u8    q10, d23, d29 + +    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16   q0, d16, d30 +    vmlal.u16   q0, d17, d30 + +    pld         [TMP4, PF_OFFS] +    vld1.32     {d16}, [TMP4], STRIDE +    vld1.32     {d17}, [TMP4] +    pld         [TMP4, PF_OFFS] +    vmull.u8    q11, d16, d28 +    vmlal.u8    q11, d17, d29 + +    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16   q1, d18, d31 +    vmlal.u16   q1, d19, d31 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vadd.u16    q12, q12, q13 +.endm + +.macro bilinear_over_8888_8888_process_pixblock_tail +    vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16   q2, d20, d30 +    vmlal.u16   q2, d21, d30 +    vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16   q3, d22, d31 +    vmlal.u16   q3, d23, d31 +    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +    vld1.32     {d2, d3}, [OUT, :128] +    pld         [OUT, #(prefetch_offset * 4)] +    vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) +    vmovn.u16   d6, q0 +    vmovn.u16   d7, q2 +    vuzp.8      d6, d7 +    vuzp.8      d2, d3 +    vuzp.8      d6, d7 +    vuzp.8      d2, d3 +    vdup.32     d4, d7[1] +    vmvn.8      d4, d4 +    vmull.u8    q11, d2, d4 +    vmull.u8    q2, d3, d4 +    vrshr.u16   q1, q11, #8 +    vrshr.u16   q10, q2, #8 +    vraddhn.u16 d2, q1, q11 +    vraddhn.u16 d3, q10, q2 +    vqadd.u8    q3, q1, q3 +    vuzp.8      d6, d7 +    vuzp.8      d6, d7 +    vadd.u16    q12, q12, q13 +    vst1.32     {d6, d7}, [OUT, :128]! 
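+    /* Net effect of this tail block, per channel and with saturation:
+     *   dst = src + dst * (255 - alpha(src)) / 255
+     * i.e. the OVER operator applied to the bilinearly interpolated
+     * source pixels.
+     */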
+.endm + +.macro bilinear_over_8888_8888_process_pixblock_tail_head +                                            vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS +    mov         TMP1, X, asr #16 +    add         X, X, UX +    add         TMP1, TOP, TMP1, asl #2 +                                            vmlsl.u16   q2, d20, d30 +    mov         TMP2, X, asr #16 +    add         X, X, UX +    add         TMP2, TOP, TMP2, asl #2 +                                            vmlal.u16   q2, d21, d30 +                                            vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS +    vld1.32     {d20}, [TMP1], STRIDE +                                            vmlsl.u16   q3, d22, d31 +                                            vmlal.u16   q3, d23, d31 +    vld1.32     {d21}, [TMP1] +    vmull.u8    q8, d20, d28 +    vmlal.u8    q8, d21, d29 +                                            vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +                                            vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +                                            vld1.32     {d2, d3}, [OUT, :128] +                                            pld         [OUT, PF_OFFS] +                                            vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vld1.32     {d22}, [TMP2], STRIDE +                                            vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) +                                            vmovn.u16   d6, q0 +    vld1.32     {d23}, [TMP2] +    vmull.u8    q9, d22, d28 +    mov         TMP3, X, asr #16 +    add         X, X, UX +    add         TMP3, TOP, TMP3, asl #2 +    mov         TMP4, X, asr #16 +    add         X, X, UX +    add         TMP4, TOP, TMP4, asl #2 +    vmlal.u8    q9, d23, d29 +                                            vmovn.u16   d7, q2 +    vld1.32     {d22}, [TMP3], STRIDE +                                            vuzp.8      d6, d7 +                                            vuzp.8      d2, d3 +                                            vuzp.8      d6, d7 +                                            vuzp.8      d2, d3 +                                            vdup.32     d4, d7[1] +    vld1.32     {d23}, [TMP3] +                                            vmvn.8      d4, d4 +    vmull.u8    q10, d22, d28 +    vmlal.u8    q10, d23, d29 +                                            vmull.u8    q11, d2, d4 +                                            vmull.u8    q2, d3, d4 +    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16   q0, d16, d30 +                                            vrshr.u16   q1, q11, #8 +    vmlal.u16   q0, d17, d30 +                                            vrshr.u16   q8, q2, #8 +                                            vraddhn.u16 d2, q1, q11 +                                            vraddhn.u16 d3, q8, q2 +    pld         [TMP4, PF_OFFS] +    vld1.32     {d16}, [TMP4], STRIDE +                                            vqadd.u8    q3, q1, q3 +    vld1.32     {d17}, [TMP4] +    pld         [TMP4, PF_OFFS] +    vmull.u8    q11, d16, d28 +    vmlal.u8    q11, d17, d29 +                                            vuzp.8      d6, d7 +    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS +                                            vuzp.8      d6, d7 +    vmlsl.u16   q1, d18, d31 +                                            
vadd.u16    q12, q12, q13 +    vmlal.u16   q1, d19, d31 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vadd.u16    q12, q12, q13 +                                            vst1.32     {d6, d7}, [OUT, :128]! +.endm + +/* over_8888_8_8888 */ +.macro bilinear_over_8888_8_8888_process_last_pixel +    bilinear_interpolate_last_pixel 8888, 8, 8888, over +.endm + +.macro bilinear_over_8888_8_8888_process_two_pixels +    bilinear_interpolate_two_pixels 8888, 8, 8888, over +.endm + +.macro bilinear_over_8888_8_8888_process_four_pixels +    bilinear_interpolate_four_pixels 8888, 8, 8888, over +.endm + +.macro bilinear_over_8888_8_8888_process_pixblock_head +    mov         TMP1, X, asr #16 +    add         X, X, UX +    add         TMP1, TOP, TMP1, asl #2 +    vld1.32     {d0}, [TMP1], STRIDE +    mov         TMP2, X, asr #16 +    add         X, X, UX +    add         TMP2, TOP, TMP2, asl #2 +    vld1.32     {d1}, [TMP1] +    mov         TMP3, X, asr #16 +    add         X, X, UX +    add         TMP3, TOP, TMP3, asl #2 +    vld1.32     {d2}, [TMP2], STRIDE +    mov         TMP4, X, asr #16 +    add         X, X, UX +    add         TMP4, TOP, TMP4, asl #2 +    vld1.32     {d3}, [TMP2] +    vmull.u8    q2, d0, d28 +    vmull.u8    q3, d2, d28 +    vmlal.u8    q2, d1, d29 +    vmlal.u8    q3, d3, d29 +    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS +    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16   q0, d4, d30 +    vmlsl.u16   q1, d6, d31 +    vmlal.u16   q0, d5, d30 +    vmlal.u16   q1, d7, d31 +    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +    vld1.32     {d2}, [TMP3], STRIDE +    vld1.32     {d3}, [TMP3] +    pld         [TMP4, PF_OFFS] +    vld1.32     {d4}, [TMP4], STRIDE +    vld1.32     {d5}, [TMP4] +    pld         [TMP4, PF_OFFS] +    vmull.u8    q3, d2, d28 +    vmlal.u8    q3, d3, d29 +    vmull.u8    q1, d4, d28 +    vmlal.u8    q1, d5, d29 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vld1.32     {d22[0]}, [MASK]! +    pld         [MASK, #prefetch_offset] +    vadd.u16    q12, q12, q13 +    vmovn.u16   d16, q0 +.endm + +.macro bilinear_over_8888_8_8888_process_pixblock_tail +    vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS +    vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16   q9, d6, d30 +    vmlsl.u16   q10, d2, d31 +    vmlal.u16   q9, d7, d30 +    vmlal.u16   q10, d3, d31 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vadd.u16    q12, q12, q13 +    vdup.32     d22, d22[0] +    vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS) +    vmovn.u16   d17, q9 +    vld1.32     {d18, d19}, [OUT, :128] +    pld         [OUT, PF_OFFS] +    vuzp.8      d16, d17 +    vuzp.8      d18, d19 +    vuzp.8      d16, d17 +    vuzp.8      d18, d19 +    vmull.u8    q10, d16, d22 +    vmull.u8    q11, d17, d22 +    vrsra.u16   q10, q10, #8 +    vrsra.u16   q11, q11, #8 +    vrshrn.u16  d16, q10, #8 +    vrshrn.u16  d17, q11, #8 +    vdup.32     d22, d17[1] +    vmvn.8      d22, d22 +    vmull.u8    q10, d18, d22 +    vmull.u8    q11, d19, d22 +    vrshr.u16   q9, q10, #8 +    vrshr.u16   q0, q11, #8 +    vraddhn.u16 d18, q9, q10 +    vraddhn.u16 d19, q0, q11 +    vqadd.u8    q9, q8, q9 +    vuzp.8      d18, d19 +    vuzp.8      d18, d19 +    vst1.32     {d18, d19}, [OUT, :128]! 
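All the bilinear macros above share one fixed-point scheme: X is a 16.16 source coordinate stepped by UX, "X asr #16" selects the column, and the top BILINEAR_INTERPOLATION_BITS of the fraction become the per-pixel interpolation weight (the "vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)" / "vadd.u16 q12, q12, q13" pair). A rough scalar C sketch of that arithmetic follows; BILINEAR_BITS and the helper name are illustrative only, not taken from this file, and the real code fuses the two weighting passes with the loads and the compositing step.

    #include <stdint.h>

    #define BILINEAR_BITS  7                 /* stand-in for BILINEAR_INTERPOLATION_BITS */
    #define BILINEAR_RANGE (1 << BILINEAR_BITS)

    /* One bilinearly filtered a8r8g8b8 pixel, channel by channel.
     * 'x' is a 16.16 fixed-point column coordinate, 'wy' a vertical
     * weight in [0, BILINEAR_RANGE]; top/bottom are the two source rows. */
    static uint32_t
    bilinear_pixel (const uint32_t *top, const uint32_t *bottom,
                    int32_t x, int32_t wy)
    {
        int32_t  xi = x >> 16;                       /* integer column, like X asr #16      */
        int32_t  wx = (x >> (16 - BILINEAR_BITS))
                      & (BILINEAR_RANGE - 1);        /* horizontal weight from the fraction */
        uint32_t result = 0;

        for (int shift = 0; shift < 32; shift += 8)
        {
            uint32_t tl = (top[xi]        >> shift) & 0xff;
            uint32_t tr = (top[xi + 1]    >> shift) & 0xff;
            uint32_t bl = (bottom[xi]     >> shift) & 0xff;
            uint32_t br = (bottom[xi + 1] >> shift) & 0xff;

            /* weight vertically, then horizontally, then narrow back to 8 bits */
            uint32_t left  = tl * (BILINEAR_RANGE - wy) + bl * wy;
            uint32_t right = tr * (BILINEAR_RANGE - wy) + br * wy;
            uint32_t v     = (left * (BILINEAR_RANGE - wx) + right * wx)
                             >> (2 * BILINEAR_BITS);

            result |= v << shift;
        }
        return result;
    }

The NEON version does the first weighting pass with vmull.u8/vmlal.u8 while the data is still 8 bits wide and the second pass on the widened 16-bit values, which is why the final vshrn.u32 narrows by 2 * BILINEAR_INTERPOLATION_BITS.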
+.endm + +.macro bilinear_over_8888_8_8888_process_pixblock_tail_head +                                            vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS +    mov         TMP1, X, asr #16 +    add         X, X, UX +    add         TMP1, TOP, TMP1, asl #2 +                                            vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS +    vld1.32     {d0}, [TMP1], STRIDE +    mov         TMP2, X, asr #16 +    add         X, X, UX +    add         TMP2, TOP, TMP2, asl #2 +                                            vmlsl.u16   q9, d6, d30 +                                            vmlsl.u16   q10, d2, d31 +    vld1.32     {d1}, [TMP1] +    mov         TMP3, X, asr #16 +    add         X, X, UX +    add         TMP3, TOP, TMP3, asl #2 +                                            vmlal.u16   q9, d7, d30 +                                            vmlal.u16   q10, d3, d31 +    vld1.32     {d2}, [TMP2], STRIDE +    mov         TMP4, X, asr #16 +    add         X, X, UX +    add         TMP4, TOP, TMP4, asl #2 +                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +                                            vadd.u16    q12, q12, q13 +    vld1.32     {d3}, [TMP2] +                                            vdup.32     d22, d22[0] +                                            vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) +                                            vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS) +    vmull.u8    q2, d0, d28 +    vmull.u8    q3, d2, d28 +                                            vmovn.u16   d17, q9 +                                            vld1.32     {d18, d19}, [OUT, :128] +                                            pld         [OUT, #(prefetch_offset * 4)] +    vmlal.u8    q2, d1, d29 +    vmlal.u8    q3, d3, d29 +                                            vuzp.8      d16, d17 +                                            vuzp.8      d18, d19 +    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS +    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS +                                            vuzp.8      d16, d17 +                                            vuzp.8      d18, d19 +    vmlsl.u16   q0, d4, d30 +    vmlsl.u16   q1, d6, d31 +                                            vmull.u8    q10, d16, d22 +                                            vmull.u8    q11, d17, d22 +    vmlal.u16   q0, d5, d30 +    vmlal.u16   q1, d7, d31 +                                            vrsra.u16   q10, q10, #8 +                                            vrsra.u16   q11, q11, #8 +    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +                                            vrshrn.u16  d16, q10, #8 +                                            vrshrn.u16  d17, q11, #8 +    vld1.32     {d2}, [TMP3], STRIDE +                                            vdup.32     d22, d17[1] +    vld1.32     {d3}, [TMP3] +                                            vmvn.8      d22, d22 +    pld         [TMP4, PF_OFFS] +    vld1.32     {d4}, [TMP4], STRIDE +                                            vmull.u8    q10, d18, d22 +                                            vmull.u8    q11, d19, d22 +    vld1.32     {d5}, [TMP4] +    pld         [TMP4, PF_OFFS] +    vmull.u8    q3, d2, d28 +                                            vrshr.u16   q9, q10, #8 +                                            vrshr.u16   q15, q11, #8 +    
vmlal.u8    q3, d3, d29 +    vmull.u8    q1, d4, d28 +                                            vraddhn.u16 d18, q9, q10 +                                            vraddhn.u16 d19, q15, q11 +    vmlal.u8    q1, d5, d29 +    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +                                            vqadd.u8    q9, q8, q9 +    vld1.32     {d22[0]}, [MASK]! +                                            vuzp.8      d18, d19 +    vadd.u16    q12, q12, q13 +                                            vuzp.8      d18, d19 +    vmovn.u16   d16, q0 +                                            vst1.32     {d18, d19}, [OUT, :128]! +.endm + +/* add_8888_8888 */ +.macro bilinear_add_8888_8888_process_last_pixel +    bilinear_interpolate_last_pixel 8888, x, 8888, add +.endm + +.macro bilinear_add_8888_8888_process_two_pixels +    bilinear_interpolate_two_pixels 8888, x, 8888, add +.endm + +.macro bilinear_add_8888_8888_process_four_pixels +    bilinear_interpolate_four_pixels 8888, x, 8888, add +.endm + +.macro bilinear_add_8888_8888_process_pixblock_head +    bilinear_add_8888_8888_process_four_pixels +.endm + +.macro bilinear_add_8888_8888_process_pixblock_tail +.endm + +.macro bilinear_add_8888_8888_process_pixblock_tail_head +    bilinear_add_8888_8888_process_pixblock_tail +    bilinear_add_8888_8888_process_pixblock_head +.endm + +/* add_8888_8_8888 */ +.macro bilinear_add_8888_8_8888_process_last_pixel +    bilinear_interpolate_last_pixel 8888, 8, 8888, add +.endm + +.macro bilinear_add_8888_8_8888_process_two_pixels +    bilinear_interpolate_two_pixels 8888, 8, 8888, add +.endm + +.macro bilinear_add_8888_8_8888_process_four_pixels +    bilinear_interpolate_four_pixels 8888, 8, 8888, add +.endm + +.macro bilinear_add_8888_8_8888_process_pixblock_head +    bilinear_add_8888_8_8888_process_four_pixels +.endm + +.macro bilinear_add_8888_8_8888_process_pixblock_tail +.endm + +.macro bilinear_add_8888_8_8888_process_pixblock_tail_head +    bilinear_add_8888_8_8888_process_pixblock_tail +    bilinear_add_8888_8_8888_process_pixblock_head +.endm + + +/* Bilinear scanline functions */ +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \ +    8888, 8888, 2, 2, \ +    bilinear_src_8888_8_8888_process_last_pixel, \ +    bilinear_src_8888_8_8888_process_two_pixels, \ +    bilinear_src_8888_8_8888_process_four_pixels, \ +    bilinear_src_8888_8_8888_process_pixblock_head, \ +    bilinear_src_8888_8_8888_process_pixblock_tail, \ +    bilinear_src_8888_8_8888_process_pixblock_tail_head, \ +    4, 28, BILINEAR_FLAG_USE_MASK + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \ +    8888, 0565, 2, 1, \ +    bilinear_src_8888_8_0565_process_last_pixel, \ +    bilinear_src_8888_8_0565_process_two_pixels, \ +    bilinear_src_8888_8_0565_process_four_pixels, \ +    bilinear_src_8888_8_0565_process_pixblock_head, \ +    bilinear_src_8888_8_0565_process_pixblock_tail, \ +    bilinear_src_8888_8_0565_process_pixblock_tail_head, \ +    4, 28, BILINEAR_FLAG_USE_MASK + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \ +    0565, 8888, 1, 2, \ +    bilinear_src_0565_8_x888_process_last_pixel, \ +    bilinear_src_0565_8_x888_process_two_pixels, \ +    bilinear_src_0565_8_x888_process_four_pixels, \ +    bilinear_src_0565_8_x888_process_pixblock_head, \ +    bilinear_src_0565_8_x888_process_pixblock_tail, \ +    
bilinear_src_0565_8_x888_process_pixblock_tail_head, \ +    4, 28, BILINEAR_FLAG_USE_MASK + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \ +    0565, 0565, 1, 1, \ +    bilinear_src_0565_8_0565_process_last_pixel, \ +    bilinear_src_0565_8_0565_process_two_pixels, \ +    bilinear_src_0565_8_0565_process_four_pixels, \ +    bilinear_src_0565_8_0565_process_pixblock_head, \ +    bilinear_src_0565_8_0565_process_pixblock_tail, \ +    bilinear_src_0565_8_0565_process_pixblock_tail_head, \ +    4, 28, BILINEAR_FLAG_USE_MASK + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \ +    8888, 8888, 2, 2, \ +    bilinear_over_8888_8888_process_last_pixel, \ +    bilinear_over_8888_8888_process_two_pixels, \ +    bilinear_over_8888_8888_process_four_pixels, \ +    bilinear_over_8888_8888_process_pixblock_head, \ +    bilinear_over_8888_8888_process_pixblock_tail, \ +    bilinear_over_8888_8888_process_pixblock_tail_head, \ +    4, 28, 0 + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \ +    8888, 8888, 2, 2, \ +    bilinear_over_8888_8_8888_process_last_pixel, \ +    bilinear_over_8888_8_8888_process_two_pixels, \ +    bilinear_over_8888_8_8888_process_four_pixels, \ +    bilinear_over_8888_8_8888_process_pixblock_head, \ +    bilinear_over_8888_8_8888_process_pixblock_tail, \ +    bilinear_over_8888_8_8888_process_pixblock_tail_head, \ +    4, 28, BILINEAR_FLAG_USE_MASK + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \ +    8888, 8888, 2, 2, \ +    bilinear_add_8888_8888_process_last_pixel, \ +    bilinear_add_8888_8888_process_two_pixels, \ +    bilinear_add_8888_8888_process_four_pixels, \ +    bilinear_add_8888_8888_process_pixblock_head, \ +    bilinear_add_8888_8888_process_pixblock_tail, \ +    bilinear_add_8888_8888_process_pixblock_tail_head, \ +    4, 28, 0 + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \ +    8888, 8888, 2, 2, \ +    bilinear_add_8888_8_8888_process_last_pixel, \ +    bilinear_add_8888_8_8888_process_two_pixels, \ +    bilinear_add_8888_8_8888_process_four_pixels, \ +    bilinear_add_8888_8_8888_process_pixblock_head, \ +    bilinear_add_8888_8_8888_process_pixblock_tail, \ +    bilinear_add_8888_8_8888_process_pixblock_tail_head, \ +    4, 28, BILINEAR_FLAG_USE_MASK diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-neon-asm.S b/libs/pixman-0.40.0/pixman/pixman-arm-neon-asm.S new file mode 100644 index 0000000..7e949a3 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm-neon-asm.S @@ -0,0 +1,3627 @@ +/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +/* + * This file contains implementations of NEON optimized pixel processing + * functions. There is no full and detailed tutorial, but some functions + * (those which expose some new or interesting features) are + * extensively commented and can be used as examples. + * + * You may want to have a look at the comments for the following functions: + *  - pixman_composite_over_8888_0565_asm_neon + *  - pixman_composite_over_n_8_0565_asm_neon + */ + +/* Prevent the stack from becoming executable for no reason... */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +    .text +    .fpu neon +    .arch armv7a +    .object_arch armv4 +    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ +    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ +    .arm +    .altmacro +    .p2align 2 + +#include "pixman-private.h" +#include "pixman-arm-asm.h" +#include "pixman-arm-neon-asm.h" + +/* Global configuration options and preferences */ + +/* + * The code can optionally make use of unaligned memory accesses to improve + * performance of handling leading/trailing pixels for each scanline. + * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for + * example on Linux if unaligned memory accesses are not configured to + * generate exceptions. + */ +.set RESPECT_STRICT_ALIGNMENT, 1 + +/* + * Set default prefetch type. There is a choice between the following options: + * + * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work + * as NOP to work around some HW bugs or for whatever other reason) + * + * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where + * advanced prefetch introduces heavy overhead) + * + * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 + * which can run ARM and NEON instructions simultaneously so that extra ARM + * instructions do not add (many) extra cycles, but improve prefetch efficiency) + * + * Note: some types of function can't support advanced prefetch and fall back + *       to the simple one (those which handle 24bpp pixels) + */ +.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED + +/* Prefetch distance in pixels for simple prefetch */ +.set PREFETCH_DISTANCE_SIMPLE, 64 + +/* + * Implementation of pixman_composite_over_8888_0565_asm_neon + * + * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination + * buffer and performs the OVER compositing operation. The function + * fast_composite_over_8888_0565 from pixman-fast-path.c does the same in C + * and can be used as a reference. + * + * First we need to have some NEON assembly code which can do the actual + * operation on the pixels and provide it to the template macro. + * + * The template macro quite conveniently takes care of emitting all the + * necessary code for memory reading and writing (including quite tricky + * cases of handling unaligned leading/trailing pixels), so we only need to + * deal with the data in NEON registers.
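As a reading aid, here is a rough scalar C sketch of the per-pixel operation described above (the authoritative C reference is fast_composite_over_8888_0565 in pixman-fast-path.c; the helper names below are made up for illustration). The vrshr.u16 #8 / vraddhn.u16 pairs that appear throughout this file correspond to the rounding division by 255 in div_255:

    #include <stdint.h>

    /* Exact rounding x/255 for x in [0, 255*255], matching the NEON
     * vrshr.u16 #8 followed by vraddhn.u16. */
    static uint32_t div_255 (uint32_t x)
    {
        return (x + ((x + 128) >> 8) + 128) >> 8;
    }

    /* OVER of one (premultiplied) a8r8g8b8 source pixel onto one r5g6b5
     * destination pixel. */
    static uint16_t over_8888_0565 (uint32_t src, uint16_t dst)
    {
        uint32_t sa = src >> 24;
        uint32_t sr = (src >> 16) & 0xff;
        uint32_t sg = (src >> 8)  & 0xff;
        uint32_t sb = src & 0xff;

        /* expand r5g6b5 to 8 bits per channel by bit replication,
         * like the vshrn/vsli/vsri sequence in the head macro below */
        uint32_t dr = (dst >> 11) & 0x1f;
        uint32_t dg = (dst >> 5)  & 0x3f;
        uint32_t db = dst & 0x1f;
        dr = (dr << 3) | (dr >> 2);
        dg = (dg << 2) | (dg >> 4);
        db = (db << 3) | (db >> 2);

        /* OVER: result = src + dst * (255 - src_alpha) / 255; with
         * premultiplied alpha this cannot exceed 255 (the NEON code
         * uses saturating vqadd.u8 anyway) */
        uint32_t ia = 255 - sa;
        uint32_t r = sr + div_255 (dr * ia);
        uint32_t g = sg + div_255 (dg * ia);
        uint32_t b = sb + div_255 (db * ia);

        /* pack back to r5g6b5 */
        return (uint16_t) (((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
    }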
+ * + * NEON register allocation in general is recommended to be the following: + * d0,  d1,  d2,  d3  - contain loaded source pixel data + * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed) + * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used) + * d28, d29, d30, d31 - place for storing the result (destination pixels) + * + * As can be seen above, four 64-bit NEON registers are used for keeping + * intermediate pixel data and up to 8 pixels can be processed in one step + * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). + * + * This particular function uses the following register allocation: + * d0,  d1,  d2,  d3  - contain loaded source pixel data + * d4,  d5            - contain loaded destination pixels (they are needed) + * d28, d29           - place for storing the result (destination pixels) + */ + +/* + * Step one. We need to have some code to do some arithmetic on pixel data. + * This is implemented as a pair of macros: '*_head' and '*_tail'. When used + * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, + * perform all the needed calculations and write the result to {d28, d29}. + * The rationale for having two macros and not just one will be explained + * later. In practice, any single monolithic function which does the work can + * be split into two parts in any arbitrary way without affecting correctness. + * + * There is one special trick here too. The common template macro can + * optionally make our life a bit easier by doing R, G, B, A color component + * deinterleaving for 32bpp pixel formats (and this feature is used in the + * 'pixman_composite_over_8888_0565_asm_neon' function). This means that + * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we + * actually use the d0 register for the blue channel (a vector of eight 8-bit + * values), d1 for green, d2 for red and d3 for alpha. This + * simple conversion can also be done with a few NEON instructions: + * + * Packed to planar conversion: + *  vuzp.8 d0, d1 + *  vuzp.8 d2, d3 + *  vuzp.8 d1, d3 + *  vuzp.8 d0, d2 + * + * Planar to packed conversion: + *  vzip.8 d0, d2 + *  vzip.8 d1, d3 + *  vzip.8 d2, d3 + *  vzip.8 d0, d1 + * + * But pixels can be loaded directly in planar format using the VLD4.8 NEON + * instruction. It is 1 cycle slower than VLD1.32, so this is not always + * desirable, that's why deinterleaving is optional. + * + * But anyway, here is the code: + */ +.macro pixman_composite_over_8888_0565_process_pixblock_head +    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format +       and put data into d6 - red, d7 - green, d30 - blue */ +    vshrn.u16   d6, q2, #8 +    vshrn.u16   d7, q2, #3 +    vsli.u16    q2, q2, #5 +    vsri.u8     d6, d6, #5 +    vmvn.8      d3, d3      /* invert source alpha */ +    vsri.u8     d7, d7, #6 +    vshrn.u16   d30, q2, #2 +    /* now do alpha blending, storing results in 8-bit planar format +       into d16 - red, d19 - green, d18 - blue */ +    vmull.u8    q10, d3, d6 +    vmull.u8    q11, d3, d7 +    vmull.u8    q12, d3, d30 +    vrshr.u16   q13, q10, #8 +    vrshr.u16   q3, q11, #8 +    vrshr.u16   q15, q12, #8 +    vraddhn.u16 d20, q10, q13 +    vraddhn.u16 d23, q11, q3 +    vraddhn.u16 d22, q12, q15 +.endm + +.macro pixman_composite_over_8888_0565_process_pixblock_tail +    /* ...
continue alpha blending */ +    vqadd.u8    d16, d2, d20 +    vqadd.u8    q9, q0, q11 +    /* convert the result to r5g6b5 and store it into {d28, d29} */ +    vshll.u8    q14, d16, #8 +    vshll.u8    q8, d19, #8 +    vshll.u8    q9, d18, #8 +    vsri.u16    q14, q8, #5 +    vsri.u16    q14, q9, #11 +.endm + +/* + * OK, now we have almost everything that we need. Using the above two + * macros, the work can be done right. But now we want to optimize + * it a bit. ARM Cortex-A8 is an in-order core, and benefits a lot + * from good code scheduling and software pipelining. + * + * Let's construct some code which will run in the core main loop. + * Some pseudo-code of the main loop will look like this: + *   head + *   while (...) { + *     tail + *     head + *   } + *   tail + * + * It may look a bit weird, but this setup makes it possible to hide + * instruction latencies better and also to utilize the dual-issue capability + * more efficiently (make pairs of load-store and ALU instructions). + * + * So what we need now is a '*_tail_head' macro, which will be used + * in the core main loop. A trivial straightforward implementation + * of this macro would look like this: + * + *   pixman_composite_over_8888_0565_process_pixblock_tail + *   vst1.16     {d28, d29}, [DST_W, :128]! + *   vld1.16     {d4, d5}, [DST_R, :128]! + *   vld4.32     {d0, d1, d2, d3}, [SRC]! + *   pixman_composite_over_8888_0565_process_pixblock_head + *   cache_preload 8, 8 + * + * Now it also has some VLD/VST instructions. We simply can't move from + * processing one block of pixels to the next one with just arithmetic. + * The previously processed data needs to be written to memory and new + * data needs to be fetched. Fortunately, this main loop does not deal + * with partial leading/trailing pixels and can load/store a full block + * of pixels in bulk. Additionally, the destination buffer is already + * 16-byte aligned here (which is good for performance). + * + * New things here are the DST_R, DST_W, SRC and MASK identifiers. These + * are aliases for the ARM registers which are used as pointers for + * accessing data. We maintain separate pointers for reading and writing + * the destination buffer (DST_R and DST_W). + * + * Another new thing is the 'cache_preload' macro. It is used for prefetching + * data into the CPU L2 cache to improve performance when dealing with large + * images which are far larger than the cache size. It takes one argument + * (actually two, but they need to be the same here) - the number of pixels + * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some + * details about this macro. Moreover, if good performance is needed, + * the code from this macro needs to be copied into the '*_tail_head' macro + * and mixed with the rest of the code for optimal instruction scheduling. + * We are actually doing it below. + * + * Now, after all the explanations, here is the optimized code. + * Different instruction streams (originating from the '*_head', '*_tail' + * and 'cache_preload' macros) use different indentation levels for + * better readability. Actually, taking the code from one of these + * indentation levels and ignoring a few VLD/VST instructions would + * result in exactly the code from the '*_head', '*_tail' or 'cache_preload' + * macro! + */ + +#if 1 + +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head +        vqadd.u8    d16, d2, d20 +    vld1.16     {d4, d5}, [DST_R, :128]!
+        vqadd.u8    q9, q0, q11 +    vshrn.u16   d6, q2, #8 +    fetch_src_pixblock +    vshrn.u16   d7, q2, #3 +    vsli.u16    q2, q2, #5 +        vshll.u8    q14, d16, #8 +                                    PF add PF_X, PF_X, #8 +        vshll.u8    q8, d19, #8 +                                    PF tst PF_CTL, #0xF +    vsri.u8     d6, d6, #5 +                                    PF addne PF_X, PF_X, #8 +    vmvn.8      d3, d3 +                                    PF subne PF_CTL, PF_CTL, #1 +    vsri.u8     d7, d7, #6 +    vshrn.u16   d30, q2, #2 +    vmull.u8    q10, d3, d6 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +    vmull.u8    q11, d3, d7 +    vmull.u8    q12, d3, d30 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +        vsri.u16    q14, q8, #5 +                                    PF cmp PF_X, ORIG_W +        vshll.u8    q9, d18, #8 +    vrshr.u16   q13, q10, #8 +                                    PF subge PF_X, PF_X, ORIG_W +    vrshr.u16   q3, q11, #8 +    vrshr.u16   q15, q12, #8 +                                    PF subges PF_CTL, PF_CTL, #0x10 +        vsri.u16    q14, q9, #11 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +    vraddhn.u16 d20, q10, q13 +    vraddhn.u16 d23, q11, q3 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! +    vraddhn.u16 d22, q12, q15 +        vst1.16     {d28, d29}, [DST_W, :128]! +.endm + +#else + +/* If we did not care much about the performance, we would just use this... */ +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head +    pixman_composite_over_8888_0565_process_pixblock_tail +    vst1.16     {d28, d29}, [DST_W, :128]! +    vld1.16     {d4, d5}, [DST_R, :128]! +    fetch_src_pixblock +    pixman_composite_over_8888_0565_process_pixblock_head +    cache_preload 8, 8 +.endm + +#endif + +/* + * And now the final part. We are using 'generate_composite_function' macro + * to put all the stuff together. We are specifying the name of the function + * which we want to get, number of bits per pixel for the source, mask and + * destination (0 if unused, like mask in this case). Next come some bit + * flags: + *   FLAG_DST_READWRITE      - tells that the destination buffer is both read + *                             and written, for write-only buffer we would use + *                             FLAG_DST_WRITEONLY flag instead + *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data + *                             and separate color channels for 32bpp format. + * The next things are: + *  - the number of pixels processed per iteration (8 in this case, because + *    that's the maximum what can fit into four 64-bit NEON registers). + *  - prefetch distance, measured in pixel blocks. In this case it is 5 times + *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal + *    prefetch distance can be selected by running some benchmarks. + * + * After that we specify some macros, these are 'default_init', + * 'default_cleanup' here which are empty (but it is possible to have custom + * init/cleanup macros to be able to save/restore some extra NEON registers + * like d8-d15 or do anything else) followed by + * 'pixman_composite_over_8888_0565_process_pixblock_head', + * 'pixman_composite_over_8888_0565_process_pixblock_tail' and + * 'pixman_composite_over_8888_0565_process_pixblock_tail_head' + * which we got implemented above. 
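In C-like terms, the head/tail/tail_head arrangement described above gives the loop shape sketched below; this is only an illustration (the function names are invented), since the real outer loop, leading/trailing pixel handling and prefetch control live in the template macros of pixman-arm-neon-asm.h:

    /* Illustrative stand-ins for what the template and pixblock macros
     * actually do; none of these names exist in pixman. */
    static void load_block (void)    { /* vld* of the next 8 pixels      */ }
    static void store_block (void)   { /* vst* of the finished 8 pixels  */ }
    static void pixblock_head (void) { /* first half of the arithmetic   */ }
    static void pixblock_tail (void) { /* second half of the arithmetic  */ }

    static void
    process_scanline (int nblocks)
    {
        load_block ();
        pixblock_head ();              /* prologue: start the first block */

        while (--nblocks > 0)
        {
            /* this is the job of a '*_tail_head' macro: finish and store
             * block N, then load and start block N + 1; in the optimized
             * version these steps are interleaved instruction by
             * instruction to hide latencies on the in-order Cortex-A8 */
            pixblock_tail ();
            store_block ();
            load_block ();
            pixblock_head ();
        }

        pixblock_tail ();              /* epilogue: drain the last block  */
        store_block ();
    }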
+ * + * The last part is the NEON registers allocation scheme. + */ +generate_composite_function \ +    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_over_8888_0565_process_pixblock_head, \ +    pixman_composite_over_8888_0565_process_pixblock_tail, \ +    pixman_composite_over_8888_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    24  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_over_n_0565_process_pixblock_head +    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format +       and put data into d6 - red, d7 - green, d30 - blue */ +    vshrn.u16   d6, q2, #8 +    vshrn.u16   d7, q2, #3 +    vsli.u16    q2, q2, #5 +    vsri.u8     d6, d6, #5 +    vsri.u8     d7, d7, #6 +    vshrn.u16   d30, q2, #2 +    /* now do alpha blending, storing results in 8-bit planar format +       into d16 - red, d19 - green, d18 - blue */ +    vmull.u8    q10, d3, d6 +    vmull.u8    q11, d3, d7 +    vmull.u8    q12, d3, d30 +    vrshr.u16   q13, q10, #8 +    vrshr.u16   q3, q11, #8 +    vrshr.u16   q15, q12, #8 +    vraddhn.u16 d20, q10, q13 +    vraddhn.u16 d23, q11, q3 +    vraddhn.u16 d22, q12, q15 +.endm + +.macro pixman_composite_over_n_0565_process_pixblock_tail +    /* ... continue alpha blending */ +    vqadd.u8    d16, d2, d20 +    vqadd.u8    q9, q0, q11 +    /* convert the result to r5g6b5 and store it into {d28, d29} */ +    vshll.u8    q14, d16, #8 +    vshll.u8    q8, d19, #8 +    vshll.u8    q9, d18, #8 +    vsri.u16    q14, q8, #5 +    vsri.u16    q14, q9, #11 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_0565_process_pixblock_tail_head +    pixman_composite_over_n_0565_process_pixblock_tail +    vld1.16     {d4, d5}, [DST_R, :128]! +    vst1.16     {d28, d29}, [DST_W, :128]! 
+    pixman_composite_over_n_0565_process_pixblock_head +    cache_preload 8, 8 +.endm + +.macro pixman_composite_over_n_0565_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vld1.32     {d3[0]}, [DUMMY] +    vdup.8      d0, d3[0] +    vdup.8      d1, d3[1] +    vdup.8      d2, d3[2] +    vdup.8      d3, d3[3] +    vmvn.8      d3, d3      /* invert source alpha */ +.endm + +generate_composite_function \ +    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ +    FLAG_DST_READWRITE, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_n_0565_init, \ +    default_cleanup, \ +    pixman_composite_over_n_0565_process_pixblock_head, \ +    pixman_composite_over_n_0565_process_pixblock_tail, \ +    pixman_composite_over_n_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    24  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_8888_0565_process_pixblock_head +    vshll.u8    q8, d1, #8 +    vshll.u8    q14, d2, #8 +    vshll.u8    q9, d0, #8 +.endm + +.macro pixman_composite_src_8888_0565_process_pixblock_tail +    vsri.u16    q14, q8, #5 +    vsri.u16    q14, q9, #11 +.endm + +.macro pixman_composite_src_8888_0565_process_pixblock_tail_head +        vsri.u16    q14, q8, #5 +                                    PF add PF_X, PF_X, #8 +                                    PF tst PF_CTL, #0xF +    fetch_src_pixblock +                                    PF addne PF_X, PF_X, #8 +                                    PF subne PF_CTL, PF_CTL, #1 +        vsri.u16    q14, q9, #11 +                                    PF cmp PF_X, ORIG_W +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +    vshll.u8    q8, d1, #8 +        vst1.16     {d28, d29}, [DST_W, :128]! +                                    PF subge PF_X, PF_X, ORIG_W +                                    PF subges PF_CTL, PF_CTL, #0x10 +    vshll.u8    q14, d2, #8 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +    vshll.u8    q9, d0, #8 +.endm + +generate_composite_function \ +    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_src_8888_0565_process_pixblock_head, \ +    pixman_composite_src_8888_0565_process_pixblock_tail, \ +    pixman_composite_src_8888_0565_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_src_0565_8888_process_pixblock_head +    vshrn.u16   d30, q0, #8 +    vshrn.u16   d29, q0, #3 +    vsli.u16    q0, q0, #5 +    vmov.u8     d31, #255 +    vsri.u8     d30, d30, #5 +    vsri.u8     d29, d29, #6 +    vshrn.u16   d28, q0, #2 +.endm + +.macro pixman_composite_src_0565_8888_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_src_0565_8888_process_pixblock_tail_head +    pixman_composite_src_0565_8888_process_pixblock_tail +    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! 
+    fetch_src_pixblock +    pixman_composite_src_0565_8888_process_pixblock_head +    cache_preload 8, 8 +.endm + +generate_composite_function \ +    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_src_0565_8888_process_pixblock_head, \ +    pixman_composite_src_0565_8888_process_pixblock_tail, \ +    pixman_composite_src_0565_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8_8_process_pixblock_head +    vqadd.u8    q14, q0, q2 +    vqadd.u8    q15, q1, q3 +.endm + +.macro pixman_composite_add_8_8_process_pixblock_tail +.endm + +.macro pixman_composite_add_8_8_process_pixblock_tail_head +    fetch_src_pixblock +                                    PF add PF_X, PF_X, #32 +                                    PF tst PF_CTL, #0xF +    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]! +                                    PF addne PF_X, PF_X, #32 +                                    PF subne PF_CTL, PF_CTL, #1 +        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! +                                    PF cmp PF_X, ORIG_W +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +                                    PF subge PF_X, PF_X, ORIG_W +                                    PF subges PF_CTL, PF_CTL, #0x10 +    vqadd.u8    q14, q0, q2 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! +    vqadd.u8    q15, q1, q3 +.endm + +generate_composite_function \ +    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ +    FLAG_DST_READWRITE, \ +    32, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_add_8_8_process_pixblock_head, \ +    pixman_composite_add_8_8_process_pixblock_tail, \ +    pixman_composite_add_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8888_8888_process_pixblock_tail_head +    fetch_src_pixblock +                                    PF add PF_X, PF_X, #8 +                                    PF tst PF_CTL, #0xF +    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]! +                                    PF addne PF_X, PF_X, #8 +                                    PF subne PF_CTL, PF_CTL, #1 +        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]! +                                    PF cmp PF_X, ORIG_W +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +                                    PF subge PF_X, PF_X, ORIG_W +                                    PF subges PF_CTL, PF_CTL, #0x10 +    vqadd.u8    q14, q0, q2 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
+    vqadd.u8    q15, q1, q3 +.endm + +generate_composite_function \ +    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ +    FLAG_DST_READWRITE, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_add_8_8_process_pixblock_head, \ +    pixman_composite_add_8_8_process_pixblock_tail, \ +    pixman_composite_add_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ +    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ +    FLAG_DST_READWRITE, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_add_8_8_process_pixblock_head, \ +    pixman_composite_add_8_8_process_pixblock_tail, \ +    pixman_composite_add_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head +    vmvn.8      d24, d3  /* get inverted alpha */ +    /* do alpha blending */ +    vmull.u8    q8, d24, d4 +    vmull.u8    q9, d24, d5 +    vmull.u8    q10, d24, d6 +    vmull.u8    q11, d24, d7 +.endm + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail +    vrshr.u16   q14, q8, #8 +    vrshr.u16   q15, q9, #8 +    vrshr.u16   q12, q10, #8 +    vrshr.u16   q13, q11, #8 +    vraddhn.u16 d28, q14, q8 +    vraddhn.u16 d29, q15, q9 +    vraddhn.u16 d30, q12, q10 +    vraddhn.u16 d31, q13, q11 +.endm + +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! +        vrshr.u16   q14, q8, #8 +                                    PF add PF_X, PF_X, #8 +                                    PF tst PF_CTL, #0xF +        vrshr.u16   q15, q9, #8 +        vrshr.u16   q12, q10, #8 +        vrshr.u16   q13, q11, #8 +                                    PF addne PF_X, PF_X, #8 +                                    PF subne PF_CTL, PF_CTL, #1 +        vraddhn.u16 d28, q14, q8 +        vraddhn.u16 d29, q15, q9 +                                    PF cmp PF_X, ORIG_W +        vraddhn.u16 d30, q12, q10 +        vraddhn.u16 d31, q13, q11 +    fetch_src_pixblock +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +    vmvn.8      d22, d3 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! +                                    PF subge PF_X, PF_X, ORIG_W +    vmull.u8    q8, d22, d4 +                                    PF subges PF_CTL, PF_CTL, #0x10 +    vmull.u8    q9, d22, d5 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +    vmull.u8    q10, d22, d6 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
+    vmull.u8    q11, d22, d7 +.endm + +generate_composite_function_single_scanline \ +    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \ +    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \ +    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_8888_8888_process_pixblock_head +    pixman_composite_out_reverse_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail +    pixman_composite_out_reverse_8888_8888_process_pixblock_tail +    vqadd.u8    q14, q0, q14 +    vqadd.u8    q15, q1, q15 +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail_head +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! +        vrshr.u16   q14, q8, #8 +                                    PF add PF_X, PF_X, #8 +                                    PF tst PF_CTL, #0xF +        vrshr.u16   q15, q9, #8 +        vrshr.u16   q12, q10, #8 +        vrshr.u16   q13, q11, #8 +                                    PF addne PF_X, PF_X, #8 +                                    PF subne PF_CTL, PF_CTL, #1 +        vraddhn.u16 d28, q14, q8 +        vraddhn.u16 d29, q15, q9 +                                    PF cmp PF_X, ORIG_W +        vraddhn.u16 d30, q12, q10 +        vraddhn.u16 d31, q13, q11 +        vqadd.u8    q14, q0, q14 +        vqadd.u8    q15, q1, q15 +    fetch_src_pixblock +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +    vmvn.8      d22, d3 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! +                                    PF subge PF_X, PF_X, ORIG_W +    vmull.u8    q8, d22, d4 +                                    PF subges PF_CTL, PF_CTL, #0x10 +    vmull.u8    q9, d22, d5 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +    vmull.u8    q10, d22, d6 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
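The over_8888_8888 macros above are simply the out_reverse macros plus a saturating add of the source (vqadd.u8), which is the standard decomposition of OVER for premultiplied pixels. A scalar sketch, with illustrative helper names and the same rounding div_255 as in the earlier sketch:

    #include <stdint.h>

    /* Exact rounding x/255 for x in [0, 255*255] (vrshr.u16 + vraddhn.u16). */
    static uint32_t div_255 (uint32_t x) { return (x + ((x + 128) >> 8) + 128) >> 8; }

    /* Scalar equivalent of vqadd.u8: per-channel saturating add. */
    static uint32_t add_sat (uint32_t a, uint32_t b)
    {
        uint32_t t = a + b;
        return t > 255 ? 255 : t;
    }

    /* OUT_REVERSE keeps only the part of dst not covered by src's alpha. */
    static uint32_t out_reverse_channel (uint32_t d, uint32_t sa)
    {
        return div_255 (d * (255 - sa));   /* vmvn.8 + vmull.u8 + vrshr/vraddhn */
    }

    /* OVER on one a8r8g8b8 pixel = OUT_REVERSE of dst, then ADD the source. */
    static uint32_t over_8888_8888 (uint32_t src, uint32_t dst)
    {
        uint32_t sa = src >> 24;
        uint32_t result = 0;

        for (int shift = 0; shift < 32; shift += 8)
        {
            uint32_t s = (src >> shift) & 0xff;
            uint32_t d = (dst >> shift) & 0xff;
            result |= add_sat (s, out_reverse_channel (d, sa)) << shift;
        }
        return result;
    }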
+    vmull.u8    q11, d22, d7 +.endm + +generate_composite_function \ +    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_over_8888_8888_process_pixblock_head, \ +    pixman_composite_over_8888_8888_process_pixblock_tail, \ +    pixman_composite_over_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ +    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_over_8888_8888_process_pixblock_head, \ +    pixman_composite_over_8888_8888_process_pixblock_tail, \ +    pixman_composite_over_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8888_process_pixblock_head +    /* deinterleaved source pixels in {d0, d1, d2, d3} */ +    /* inverted alpha in {d24} */ +    /* destination pixels in {d4, d5, d6, d7} */ +    vmull.u8    q8, d24, d4 +    vmull.u8    q9, d24, d5 +    vmull.u8    q10, d24, d6 +    vmull.u8    q11, d24, d7 +.endm + +.macro pixman_composite_over_n_8888_process_pixblock_tail +    vrshr.u16   q14, q8, #8 +    vrshr.u16   q15, q9, #8 +    vrshr.u16   q2, q10, #8 +    vrshr.u16   q3, q11, #8 +    vraddhn.u16 d28, q14, q8 +    vraddhn.u16 d29, q15, q9 +    vraddhn.u16 d30, q2, q10 +    vraddhn.u16 d31, q3, q11 +    vqadd.u8    q14, q0, q14 +    vqadd.u8    q15, q1, q15 +.endm + +.macro pixman_composite_over_n_8888_process_pixblock_tail_head +        vrshr.u16   q14, q8, #8 +        vrshr.u16   q15, q9, #8 +        vrshr.u16   q2, q10, #8 +        vrshr.u16   q3, q11, #8 +        vraddhn.u16 d28, q14, q8 +        vraddhn.u16 d29, q15, q9 +        vraddhn.u16 d30, q2, q10 +        vraddhn.u16 d31, q3, q11 +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! +        vqadd.u8    q14, q0, q14 +                                    PF add PF_X, PF_X, #8 +                                    PF tst PF_CTL, #0x0F +                                    PF addne PF_X, PF_X, #8 +                                    PF subne PF_CTL, PF_CTL, #1 +        vqadd.u8    q15, q1, q15 +                                    PF cmp PF_X, ORIG_W +    vmull.u8    q8, d24, d4 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +    vmull.u8    q9, d24, d5 +                                    PF subge PF_X, PF_X, ORIG_W +    vmull.u8    q10, d24, d6 +                                    PF subges PF_CTL, PF_CTL, #0x10 +    vmull.u8    q11, d24, d7 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! 
+.endm + +.macro pixman_composite_over_n_8888_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vld1.32     {d3[0]}, [DUMMY] +    vdup.8      d0, d3[0] +    vdup.8      d1, d3[1] +    vdup.8      d2, d3[2] +    vdup.8      d3, d3[3] +    vmvn.8      d24, d3  /* get inverted alpha */ +.endm + +generate_composite_function \ +    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_n_8888_init, \ +    default_cleanup, \ +    pixman_composite_over_8888_8888_process_pixblock_head, \ +    pixman_composite_over_8888_8888_process_pixblock_tail, \ +    pixman_composite_over_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head +        vrshr.u16   q14, q8, #8 +                                    PF add PF_X, PF_X, #8 +                                    PF tst PF_CTL, #0xF +        vrshr.u16   q15, q9, #8 +        vrshr.u16   q12, q10, #8 +        vrshr.u16   q13, q11, #8 +                                    PF addne PF_X, PF_X, #8 +                                    PF subne PF_CTL, PF_CTL, #1 +        vraddhn.u16 d28, q14, q8 +        vraddhn.u16 d29, q15, q9 +                                    PF cmp PF_X, ORIG_W +        vraddhn.u16 d30, q12, q10 +        vraddhn.u16 d31, q13, q11 +        vqadd.u8    q14, q0, q14 +        vqadd.u8    q15, q1, q15 +    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]! +    vmvn.8      d22, d3 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! +                                    PF subge PF_X, PF_X, ORIG_W +    vmull.u8    q8, d22, d4 +                                    PF subges PF_CTL, PF_CTL, #0x10 +    vmull.u8    q9, d22, d5 +    vmull.u8    q10, d22, d6 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
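For the solid-source variant, pixman_composite_over_n_8888_init above loads the constant color from the stack once, splats its components with vdup.8 and pre-inverts the alpha, so the per-pixel loop only has to multiply and add. A scalar sketch of that idea (names illustrative, div_255 as before):

    #include <stdint.h>

    static uint32_t div_255 (uint32_t x) { return (x + ((x + 128) >> 8) + 128) >> 8; }

    /* Solid a8r8g8b8 color OVER a scanline of a8r8g8b8 pixels.  The color
     * and its inverted alpha are hoisted out of the loop, just as the init
     * macro keeps them in d0-d3 and d24 for the whole composite. */
    static void over_n_8888_scanline (uint32_t color, uint32_t *dst, int w)
    {
        uint32_t ia = 255 - (color >> 24);           /* vmvn.8 d24, d3 */

        for (int i = 0; i < w; i++)
        {
            uint32_t d = dst[i], out = 0;

            for (int shift = 0; shift < 32; shift += 8)
            {
                uint32_t s  = (color >> shift) & 0xff;
                uint32_t dc = (d >> shift) & 0xff;

                /* premultiplied color, so s + dc*(255-sa)/255 stays <= 255 */
                out |= (s + div_255 (dc * ia)) << shift;
            }
            dst[i] = out;
        }
    }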
+    vmull.u8    q11, d22, d7 +.endm + +.macro pixman_composite_over_reverse_n_8888_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vld1.32     {d7[0]}, [DUMMY] +    vdup.8      d4, d7[0] +    vdup.8      d5, d7[1] +    vdup.8      d6, d7[2] +    vdup.8      d7, d7[3] +.endm + +generate_composite_function \ +    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_reverse_n_8888_init, \ +    default_cleanup, \ +    pixman_composite_over_8888_8888_process_pixblock_head, \ +    pixman_composite_over_8888_8888_process_pixblock_tail, \ +    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    0,  /* dst_r_basereg */ \ +    4,  /* src_basereg   */ \ +    24  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_over_8888_8_0565_process_pixblock_head +    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */ +    vmull.u8    q1,  d24, d9 +    vmull.u8    q6,  d24, d10 +    vmull.u8    q7,  d24, d11 +        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */ +        vshrn.u16   d7,  q2, #3 +        vsli.u16    q2,  q2, #5 +    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */ +    vrshr.u16   q9,  q1,  #8 +    vrshr.u16   q10, q6,  #8 +    vrshr.u16   q11, q7,  #8 +    vraddhn.u16 d0,  q0,  q8 +    vraddhn.u16 d1,  q1,  q9 +    vraddhn.u16 d2,  q6,  q10 +    vraddhn.u16 d3,  q7,  q11 +        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */ +        vsri.u8     d7,  d7, #6 +    vmvn.8      d3,  d3 +        vshrn.u16   d30, q2, #2 +    vmull.u8    q8,  d3, d6     /* now do alpha blending */ +    vmull.u8    q9,  d3, d7 +    vmull.u8    q10, d3, d30 +.endm + +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail +    /* 3 cycle bubble (after vmull.u8) */ +    vrshr.u16   q13, q8,  #8 +    vrshr.u16   q11, q9,  #8 +    vrshr.u16   q15, q10, #8 +    vraddhn.u16 d16, q8,  q13 +    vraddhn.u16 d27, q9,  q11 +    vraddhn.u16 d26, q10, q15 +    vqadd.u8    d16, d2,  d16 +    /* 1 cycle bubble */ +    vqadd.u8    q9,  q0,  q13 +    vshll.u8    q14, d16, #8    /* convert to 16bpp */ +    vshll.u8    q8,  d19, #8 +    vshll.u8    q9,  d18, #8 +    vsri.u16    q14, q8,  #5 +    /* 1 cycle bubble */ +    vsri.u16    q14, q9,  #11 +.endm + +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head +    vld1.16     {d4, d5}, [DST_R, :128]! 
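Relative to the unmasked 0565 path, the only new ingredient in the head macro above is the IN step: every source channel, alpha included, is first scaled by the 8-bit mask coverage (the vmull.u8 q0/q1/q6/q7 block commented "IN for SRC pixels"), after which the same r5g6b5 OVER as before is applied. A minimal sketch of just that step (illustrative names):

    #include <stdint.h>

    static uint32_t div_255 (uint32_t x) { return (x + ((x + 128) >> 8) + 128) >> 8; }

    /* The IN step added by the a8 mask: scale all four source channels,
     * including alpha, by the mask coverage before the usual OVER. */
    static uint32_t in_8888_8 (uint32_t src, uint8_t mask)
    {
        uint32_t result = 0;

        for (int shift = 0; shift < 32; shift += 8)
            result |= div_255 (((src >> shift) & 0xff) * mask) << shift;

        return result;
    }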
+    vshrn.u16   d6,  q2,  #8 +    fetch_mask_pixblock +    vshrn.u16   d7,  q2,  #3 +    fetch_src_pixblock +    vmull.u8    q6,  d24, d10 +        vrshr.u16   q13, q8,  #8 +        vrshr.u16   q11, q9,  #8 +        vrshr.u16   q15, q10, #8 +        vraddhn.u16 d16, q8,  q13 +        vraddhn.u16 d27, q9,  q11 +        vraddhn.u16 d26, q10, q15 +        vqadd.u8    d16, d2,  d16 +    vmull.u8    q1,  d24, d9 +        vqadd.u8    q9,  q0,  q13 +        vshll.u8    q14, d16, #8 +    vmull.u8    q0,  d24, d8 +        vshll.u8    q8,  d19, #8 +        vshll.u8    q9,  d18, #8 +        vsri.u16    q14, q8,  #5 +    vmull.u8    q7,  d24, d11 +        vsri.u16    q14, q9,  #11 + +    cache_preload 8, 8 + +    vsli.u16    q2,  q2,  #5 +    vrshr.u16   q8,  q0,  #8 +    vrshr.u16   q9,  q1,  #8 +    vrshr.u16   q10, q6,  #8 +    vrshr.u16   q11, q7,  #8 +    vraddhn.u16 d0,  q0,  q8 +    vraddhn.u16 d1,  q1,  q9 +    vraddhn.u16 d2,  q6,  q10 +    vraddhn.u16 d3,  q7,  q11 +    vsri.u8     d6,  d6,  #5 +    vsri.u8     d7,  d7,  #6 +    vmvn.8      d3,  d3 +    vshrn.u16   d30, q2,  #2 +    vst1.16     {d28, d29}, [DST_W, :128]! +    vmull.u8    q8,  d3,  d6 +    vmull.u8    q9,  d3,  d7 +    vmull.u8    q10, d3,  d30 +.endm + +generate_composite_function \ +    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    default_init_need_all_regs, \ +    default_cleanup_need_all_regs, \ +    pixman_composite_over_8888_8_0565_process_pixblock_head, \ +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \ +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    8,  /* src_basereg   */ \ +    24  /* mask_basereg  */ + +/******************************************************************************/ + +/* + * This function needs a special initialization of solid mask. + * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET + * offset, split into color components and replicated in d8-d11 + * registers. Additionally, this function needs all the NEON registers, + * so it has to save d8-d15 registers which are callee saved according + * to ABI. These registers are restored from 'cleanup' macro. All the + * other NEON registers are caller saved, so can be clobbered freely + * without introducing any problems. 
+ */ +.macro pixman_composite_over_n_8_0565_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vpush       {d8-d15} +    vld1.32     {d11[0]}, [DUMMY] +    vdup.8      d8, d11[0] +    vdup.8      d9, d11[1] +    vdup.8      d10, d11[2] +    vdup.8      d11, d11[3] +.endm + +.macro pixman_composite_over_n_8_0565_cleanup +    vpop        {d8-d15} +.endm + +generate_composite_function \ +    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ +    FLAG_DST_READWRITE, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_n_8_0565_init, \ +    pixman_composite_over_n_8_0565_cleanup, \ +    pixman_composite_over_8888_8_0565_process_pixblock_head, \ +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \ +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_8888_n_0565_init +    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8) +    vpush       {d8-d15} +    vld1.32     {d24[0]}, [DUMMY] +    vdup.8      d24, d24[3] +.endm + +.macro pixman_composite_over_8888_n_0565_cleanup +    vpop        {d8-d15} +.endm + +generate_composite_function \ +    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_8888_n_0565_init, \ +    pixman_composite_over_8888_n_0565_cleanup, \ +    pixman_composite_over_8888_8_0565_process_pixblock_head, \ +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \ +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    8,  /* src_basereg   */ \ +    24  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_0565_0565_process_pixblock_head +.endm + +.macro pixman_composite_src_0565_0565_process_pixblock_tail +.endm + +.macro pixman_composite_src_0565_0565_process_pixblock_tail_head +    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! +    fetch_src_pixblock +    cache_preload 16, 16 +.endm + +generate_composite_function \ +    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \ +    FLAG_DST_WRITEONLY, \ +    16, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_src_0565_0565_process_pixblock_head, \ +    pixman_composite_src_0565_0565_process_pixblock_tail, \ +    pixman_composite_src_0565_0565_process_pixblock_tail_head, \ +    0, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8_process_pixblock_tail_head +    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]! 
+.endm + +.macro pixman_composite_src_n_8_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vld1.32     {d0[0]}, [DUMMY] +    vsli.u64    d0, d0, #8 +    vsli.u64    d0, d0, #16 +    vsli.u64    d0, d0, #32 +    vorr        d1, d0, d0 +    vorr        q1, q0, q0 +.endm + +.macro pixman_composite_src_n_8_cleanup +.endm + +generate_composite_function \ +    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \ +    FLAG_DST_WRITEONLY, \ +    32, /* number of pixels, processed in a single block */ \ +    0,  /* prefetch distance */ \ +    pixman_composite_src_n_8_init, \ +    pixman_composite_src_n_8_cleanup, \ +    pixman_composite_src_n_8_process_pixblock_head, \ +    pixman_composite_src_n_8_process_pixblock_tail, \ +    pixman_composite_src_n_8_process_pixblock_tail_head, \ +    0, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_0565_process_pixblock_head +.endm + +.macro pixman_composite_src_n_0565_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_0565_process_pixblock_tail_head +    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_0565_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vld1.32     {d0[0]}, [DUMMY] +    vsli.u64    d0, d0, #16 +    vsli.u64    d0, d0, #32 +    vorr        d1, d0, d0 +    vorr        q1, q0, q0 +.endm + +.macro pixman_composite_src_n_0565_cleanup +.endm + +generate_composite_function \ +    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \ +    FLAG_DST_WRITEONLY, \ +    16, /* number of pixels, processed in a single block */ \ +    0,  /* prefetch distance */ \ +    pixman_composite_src_n_0565_init, \ +    pixman_composite_src_n_0565_cleanup, \ +    pixman_composite_src_n_0565_process_pixblock_head, \ +    pixman_composite_src_n_0565_process_pixblock_tail, \ +    pixman_composite_src_n_0565_process_pixblock_tail_head, \ +    0, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail_head +    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 
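The solid-fill init macros above all rely on the same replication trick: successive vsli shift-inserts double the width of the replicated pattern until it fills a 64-bit register, and the vorr copies then spread it across the other registers of the store block. A scalar sketch of the 8bpp case (the 0565 and 8888 variants just start from a wider unit and skip the earlier steps):

    #include <stdint.h>

    /* Replicate an 8-bit solid fill value across 64 bits the way
     * pixman_composite_src_n_8_init does with vsli.u64 #8/#16/#32. */
    static uint64_t replicate_byte (uint8_t c)
    {
        uint64_t v = c;

        v |= v << 8;     /* like vsli.u64 d0, d0, #8  */
        v |= v << 16;    /* like vsli.u64 d0, d0, #16 */
        v |= v << 32;    /* like vsli.u64 d0, d0, #32 */
        return v;
    }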
+.endm + +.macro pixman_composite_src_n_8888_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vld1.32     {d0[0]}, [DUMMY] +    vsli.u64    d0, d0, #32 +    vorr        d1, d0, d0 +    vorr        q1, q0, q0 +.endm + +.macro pixman_composite_src_n_8888_cleanup +.endm + +generate_composite_function \ +    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ +    FLAG_DST_WRITEONLY, \ +    8, /* number of pixels, processed in a single block */ \ +    0, /* prefetch distance */ \ +    pixman_composite_src_n_8888_init, \ +    pixman_composite_src_n_8888_cleanup, \ +    pixman_composite_src_n_8888_process_pixblock_head, \ +    pixman_composite_src_n_8888_process_pixblock_tail, \ +    pixman_composite_src_n_8888_process_pixblock_tail_head, \ +    0, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail_head +    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! +    fetch_src_pixblock +    cache_preload 8, 8 +.endm + +generate_composite_function \ +    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ +    FLAG_DST_WRITEONLY, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_src_8888_8888_process_pixblock_head, \ +    pixman_composite_src_8888_8888_process_pixblock_tail, \ +    pixman_composite_src_8888_8888_process_pixblock_tail_head, \ +    0, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_x888_8888_process_pixblock_head +    vorr     q0, q0, q2 +    vorr     q1, q1, q2 +.endm + +.macro pixman_composite_src_x888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_x888_8888_process_pixblock_tail_head +    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 
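+    /* q2 holds 0xff000000 in every lane (set up by init), so the vorr
+     * below forces the alpha byte to 0xff while copying x888 to 8888 */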
+    fetch_src_pixblock +    vorr     q0, q0, q2 +    vorr     q1, q1, q2 +    cache_preload 8, 8 +.endm + +.macro pixman_composite_src_x888_8888_init +    vmov.u8  q2, #0xFF +    vshl.u32 q2, q2, #24 +.endm + +generate_composite_function \ +    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ +    FLAG_DST_WRITEONLY, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    pixman_composite_src_x888_8888_init, \ +    default_cleanup, \ +    pixman_composite_src_x888_8888_process_pixblock_head, \ +    pixman_composite_src_x888_8888_process_pixblock_tail, \ +    pixman_composite_src_x888_8888_process_pixblock_tail_head, \ +    0, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8_8888_process_pixblock_head +    /* expecting solid source in {d0, d1, d2, d3} */ +    /* mask is in d24 (d25, d26, d27 are unused) */ + +    /* in */ +    vmull.u8    q8, d24, d0 +    vmull.u8    q9, d24, d1 +    vmull.u8    q10, d24, d2 +    vmull.u8    q11, d24, d3 +    vrsra.u16   q8, q8, #8 +    vrsra.u16   q9, q9, #8 +    vrsra.u16   q10, q10, #8 +    vrsra.u16   q11, q11, #8 +.endm + +.macro pixman_composite_src_n_8_8888_process_pixblock_tail +    vrshrn.u16  d28, q8, #8 +    vrshrn.u16  d29, q9, #8 +    vrshrn.u16  d30, q10, #8 +    vrshrn.u16  d31, q11, #8 +.endm + +.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head +    fetch_mask_pixblock +                                    PF add PF_X, PF_X, #8 +        vrshrn.u16  d28, q8, #8 +                                    PF tst PF_CTL, #0x0F +        vrshrn.u16  d29, q9, #8 +                                    PF addne PF_X, PF_X, #8 +        vrshrn.u16  d30, q10, #8 +                                    PF subne PF_CTL, PF_CTL, #1 +        vrshrn.u16  d31, q11, #8 +                                    PF cmp PF_X, ORIG_W +    vmull.u8    q8, d24, d0 +                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] +    vmull.u8    q9, d24, d1 +                                    PF subge PF_X, PF_X, ORIG_W +    vmull.u8    q10, d24, d2 +                                    PF subges PF_CTL, PF_CTL, #0x10 +    vmull.u8    q11, d24, d3 +                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! 
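+    /*
+     * the vrsra.u16/vrshrn.u16 pairs reduce the 16-bit source*mask
+     * products back to 8 bits, effectively a rounded division by 255
+     * (the usual pixman approximation)
+     */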
+    vrsra.u16   q8, q8, #8 +    vrsra.u16   q9, q9, #8 +    vrsra.u16   q10, q10, #8 +    vrsra.u16   q11, q11, #8 +.endm + +.macro pixman_composite_src_n_8_8888_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vld1.32     {d3[0]}, [DUMMY] +    vdup.8      d0, d3[0] +    vdup.8      d1, d3[1] +    vdup.8      d2, d3[2] +    vdup.8      d3, d3[3] +.endm + +.macro pixman_composite_src_n_8_8888_cleanup +.endm + +generate_composite_function \ +    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \ +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_src_n_8_8888_init, \ +    pixman_composite_src_n_8_8888_cleanup, \ +    pixman_composite_src_n_8_8888_process_pixblock_head, \ +    pixman_composite_src_n_8_8888_process_pixblock_tail, \ +    pixman_composite_src_n_8_8888_process_pixblock_tail_head, \ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8_8_process_pixblock_head +    vmull.u8    q0, d24, d16 +    vmull.u8    q1, d25, d16 +    vmull.u8    q2, d26, d16 +    vmull.u8    q3, d27, d16 +    vrsra.u16   q0, q0,  #8 +    vrsra.u16   q1, q1,  #8 +    vrsra.u16   q2, q2,  #8 +    vrsra.u16   q3, q3,  #8 +.endm + +.macro pixman_composite_src_n_8_8_process_pixblock_tail +    vrshrn.u16  d28, q0, #8 +    vrshrn.u16  d29, q1, #8 +    vrshrn.u16  d30, q2, #8 +    vrshrn.u16  d31, q3, #8 +.endm + +.macro pixman_composite_src_n_8_8_process_pixblock_tail_head +    fetch_mask_pixblock +                                    PF add PF_X, PF_X, #8 +        vrshrn.u16  d28, q0, #8 +                                    PF tst PF_CTL, #0x0F +        vrshrn.u16  d29, q1, #8 +                                    PF addne PF_X, PF_X, #8 +        vrshrn.u16  d30, q2, #8 +                                    PF subne PF_CTL, PF_CTL, #1 +        vrshrn.u16  d31, q3, #8 +                                    PF cmp PF_X, ORIG_W +    vmull.u8    q0,  d24, d16 +                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] +    vmull.u8    q1,  d25, d16 +                                    PF subge PF_X, PF_X, ORIG_W +    vmull.u8    q2,  d26, d16 +                                    PF subges PF_CTL, PF_CTL, #0x10 +    vmull.u8    q3,  d27, d16 +                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! +        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! 
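+    /*
+     * the interleaved PF lines implement the advanced prefetcher:
+     * PF_X runs ahead of the output position and, once it crosses
+     * ORIG_W, the conditional ldrgeb bumps the prefetch pointer to
+     * the next scanline; with prefetch disabled they assemble to
+     * nothing
+     */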
+    vrsra.u16   q0, q0,  #8 +    vrsra.u16   q1, q1,  #8 +    vrsra.u16   q2, q2,  #8 +    vrsra.u16   q3, q3,  #8 +.endm + +.macro pixman_composite_src_n_8_8_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vld1.32     {d16[0]}, [DUMMY] +    vdup.8      d16, d16[3] +.endm + +.macro pixman_composite_src_n_8_8_cleanup +.endm + +generate_composite_function \ +    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \ +    FLAG_DST_WRITEONLY, \ +    32, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_src_n_8_8_init, \ +    pixman_composite_src_n_8_8_cleanup, \ +    pixman_composite_src_n_8_8_process_pixblock_head, \ +    pixman_composite_src_n_8_8_process_pixblock_tail, \ +    pixman_composite_src_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8_8888_process_pixblock_head +    /* expecting deinterleaved source data in {d8, d9, d10, d11} */ +    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ +    /* and destination data in {d4, d5, d6, d7} */ +    /* mask is in d24 (d25, d26, d27 are unused) */ + +    /* in */ +    vmull.u8    q6, d24, d8 +    vmull.u8    q7, d24, d9 +    vmull.u8    q8, d24, d10 +    vmull.u8    q9, d24, d11 +    vrshr.u16   q10, q6, #8 +    vrshr.u16   q11, q7, #8 +    vrshr.u16   q12, q8, #8 +    vrshr.u16   q13, q9, #8 +    vraddhn.u16 d0, q6, q10 +    vraddhn.u16 d1, q7, q11 +    vraddhn.u16 d2, q8, q12 +    vraddhn.u16 d3, q9, q13 +    vmvn.8      d25, d3  /* get inverted alpha */ +    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */ +    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ +    /* now do alpha blending */ +    vmull.u8    q8, d25, d4 +    vmull.u8    q9, d25, d5 +    vmull.u8    q10, d25, d6 +    vmull.u8    q11, d25, d7 +.endm + +.macro pixman_composite_over_n_8_8888_process_pixblock_tail +    vrshr.u16   q14, q8, #8 +    vrshr.u16   q15, q9, #8 +    vrshr.u16   q6, q10, #8 +    vrshr.u16   q7, q11, #8 +    vraddhn.u16 d28, q14, q8 +    vraddhn.u16 d29, q15, q9 +    vraddhn.u16 d30, q6, q10 +    vraddhn.u16 d31, q7, q11 +    vqadd.u8    q14, q0, q14 +    vqadd.u8    q15, q1, q15 +.endm + +.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head +        vrshr.u16   q14, q8, #8 +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! +        vrshr.u16   q15, q9, #8 +    fetch_mask_pixblock +        vrshr.u16   q6, q10, #8 +                                    PF add PF_X, PF_X, #8 +        vrshr.u16   q7, q11, #8 +                                    PF tst PF_CTL, #0x0F +        vraddhn.u16 d28, q14, q8 +                                    PF addne PF_X, PF_X, #8 +        vraddhn.u16 d29, q15, q9 +                                    PF subne PF_CTL, PF_CTL, #1 +        vraddhn.u16 d30, q6, q10 +                                    PF cmp PF_X, ORIG_W +        vraddhn.u16 d31, q7, q11 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +    vmull.u8    q6, d24, d8 +                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] +    vmull.u8    q7, d24, d9 +                                    PF subge PF_X, PF_X, ORIG_W +    vmull.u8    q8, d24, d10 +                                    PF subges PF_CTL, PF_CTL, #0x10 +    vmull.u8    q9, d24, d11 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
+        vqadd.u8    q14, q0, q14 +                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! +        vqadd.u8    q15, q1, q15 +    vrshr.u16   q10, q6, #8 +    vrshr.u16   q11, q7, #8 +    vrshr.u16   q12, q8, #8 +    vrshr.u16   q13, q9, #8 +    vraddhn.u16 d0, q6, q10 +    vraddhn.u16 d1, q7, q11 +    vraddhn.u16 d2, q8, q12 +    vraddhn.u16 d3, q9, q13 +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! +    vmvn.8      d25, d3 +    vmull.u8    q8, d25, d4 +    vmull.u8    q9, d25, d5 +    vmull.u8    q10, d25, d6 +    vmull.u8    q11, d25, d7 +.endm + +.macro pixman_composite_over_n_8_8888_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vpush       {d8-d15} +    vld1.32     {d11[0]}, [DUMMY] +    vdup.8      d8, d11[0] +    vdup.8      d9, d11[1] +    vdup.8      d10, d11[2] +    vdup.8      d11, d11[3] +.endm + +.macro pixman_composite_over_n_8_8888_cleanup +    vpop        {d8-d15} +.endm + +generate_composite_function \ +    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_n_8_8888_init, \ +    pixman_composite_over_n_8_8888_cleanup, \ +    pixman_composite_over_n_8_8888_process_pixblock_head, \ +    pixman_composite_over_n_8_8888_process_pixblock_tail, \ +    pixman_composite_over_n_8_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8_8_process_pixblock_head +    vmull.u8    q0,  d24, d8 +    vmull.u8    q1,  d25, d8 +    vmull.u8    q6,  d26, d8 +    vmull.u8    q7,  d27, d8 +    vrshr.u16   q10, q0,  #8 +    vrshr.u16   q11, q1,  #8 +    vrshr.u16   q12, q6,  #8 +    vrshr.u16   q13, q7,  #8 +    vraddhn.u16 d0,  q0,  q10 +    vraddhn.u16 d1,  q1,  q11 +    vraddhn.u16 d2,  q6,  q12 +    vraddhn.u16 d3,  q7,  q13 +    vmvn.8      q12, q0 +    vmvn.8      q13, q1 +    vmull.u8    q8,  d24, d4 +    vmull.u8    q9,  d25, d5 +    vmull.u8    q10, d26, d6 +    vmull.u8    q11, d27, d7 +.endm + +.macro pixman_composite_over_n_8_8_process_pixblock_tail +    vrshr.u16   q14, q8,  #8 +    vrshr.u16   q15, q9,  #8 +    vrshr.u16   q12, q10, #8 +    vrshr.u16   q13, q11, #8 +    vraddhn.u16 d28, q14, q8 +    vraddhn.u16 d29, q15, q9 +    vraddhn.u16 d30, q12, q10 +    vraddhn.u16 d31, q13, q11 +    vqadd.u8    q14, q0,  q14 +    vqadd.u8    q15, q1,  q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8_8_process_pixblock_tail_head +    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]! +    pixman_composite_over_n_8_8_process_pixblock_tail +    fetch_mask_pixblock +    cache_preload 32, 32 +    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! 
+    pixman_composite_over_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8_8_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vpush       {d8-d15} +    vld1.32     {d8[0]}, [DUMMY] +    vdup.8      d8, d8[3] +.endm + +.macro pixman_composite_over_n_8_8_cleanup +    vpop        {d8-d15} +.endm + +generate_composite_function \ +    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ +    FLAG_DST_READWRITE, \ +    32, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_n_8_8_init, \ +    pixman_composite_over_n_8_8_cleanup, \ +    pixman_composite_over_n_8_8_process_pixblock_head, \ +    pixman_composite_over_n_8_8_process_pixblock_tail, \ +    pixman_composite_over_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head +    /* +     * 'combine_mask_ca' replacement +     * +     * input:  solid src (n) in {d8,  d9,  d10, d11} +     *         dest in          {d4,  d5,  d6,  d7 } +     *         mask in          {d24, d25, d26, d27} +     * output: updated src in   {d0,  d1,  d2,  d3 } +     *         updated mask in  {d24, d25, d26, d3 } +     */ +    vmull.u8    q0,  d24, d8 +    vmull.u8    q1,  d25, d9 +    vmull.u8    q6,  d26, d10 +    vmull.u8    q7,  d27, d11 +    vmull.u8    q9,  d11, d25 +    vmull.u8    q12, d11, d24 +    vmull.u8    q13, d11, d26 +    vrshr.u16   q8,  q0,  #8 +    vrshr.u16   q10, q1,  #8 +    vrshr.u16   q11, q6,  #8 +    vraddhn.u16 d0,  q0,  q8 +    vraddhn.u16 d1,  q1,  q10 +    vraddhn.u16 d2,  q6,  q11 +    vrshr.u16   q11, q12, #8 +    vrshr.u16   q8,  q9,  #8 +    vrshr.u16   q6,  q13, #8 +    vrshr.u16   q10, q7,  #8 +    vraddhn.u16 d24, q12, q11 +    vraddhn.u16 d25, q9,  q8 +    vraddhn.u16 d26, q13, q6 +    vraddhn.u16 d3,  q7,  q10 +    /* +     * 'combine_over_ca' replacement +     * +     * output: updated dest in {d28, d29, d30, d31} +     */ +    vmvn.8      q12, q12 +    vmvn.8      d26, d26 +    vmull.u8    q8,  d24, d4 +    vmull.u8    q9,  d25, d5 +    vmvn.8      d27, d3 +    vmull.u8    q10, d26, d6 +    vmull.u8    q11, d27, d7 +.endm + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail +    /* ... continue 'combine_over_ca' replacement */ +    vrshr.u16   q14, q8,  #8 +    vrshr.u16   q15, q9,  #8 +    vrshr.u16   q6,  q10, #8 +    vrshr.u16   q7,  q11, #8 +    vraddhn.u16 d28, q14, q8 +    vraddhn.u16 d29, q15, q9 +    vraddhn.u16 d30, q6,  q10 +    vraddhn.u16 d31, q7,  q11 +    vqadd.u8    q14, q0,  q14 +    vqadd.u8    q15, q1,  q15 +.endm + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head +        vrshr.u16   q14, q8, #8 +        vrshr.u16   q15, q9, #8 +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! +        vrshr.u16   q6, q10, #8 +        vrshr.u16   q7, q11, #8 +        vraddhn.u16 d28, q14, q8 +        vraddhn.u16 d29, q15, q9 +        vraddhn.u16 d30, q6, q10 +        vraddhn.u16 d31, q7, q11 +    fetch_mask_pixblock +        vqadd.u8    q14, q0, q14 +        vqadd.u8    q15, q1, q15 +    cache_preload 8, 8 +    pixman_composite_over_n_8888_8888_ca_process_pixblock_head +    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! 
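+    /*
+     * component alpha OVER, per channel: s' = s * m, m' = alpha(s) * m
+     * (both with rounded division by 255), then
+     * dst = s' + (255 - m') * dst / 255 using a saturating add
+     */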
+.endm + +.macro pixman_composite_over_n_8888_8888_ca_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vpush       {d8-d15} +    vld1.32     {d11[0]}, [DUMMY] +    vdup.8      d8, d11[0] +    vdup.8      d9, d11[1] +    vdup.8      d10, d11[2] +    vdup.8      d11, d11[3] +.endm + +.macro pixman_composite_over_n_8888_8888_ca_cleanup +    vpop        {d8-d15} +.endm + +generate_composite_function \ +    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_n_8888_8888_ca_init, \ +    pixman_composite_over_n_8888_8888_ca_cleanup, \ +    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ +    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ +    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head +    /* +     * 'combine_mask_ca' replacement +     * +     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A] +     *         mask in          {d24, d25, d26}       [B, G, R] +     * output: updated src in   {d0,  d1,  d2 }       [B, G, R] +     *         updated mask in  {d24, d25, d26}       [B, G, R] +     */ +    vmull.u8    q0,  d24, d8 +    vmull.u8    q1,  d25, d9 +    vmull.u8    q6,  d26, d10 +    vmull.u8    q9,  d11, d25 +    vmull.u8    q12, d11, d24 +    vmull.u8    q13, d11, d26 +    vrshr.u16   q8,  q0,  #8 +    vrshr.u16   q10, q1,  #8 +    vrshr.u16   q11, q6,  #8 +    vraddhn.u16 d0,  q0,  q8 +    vraddhn.u16 d1,  q1,  q10 +    vraddhn.u16 d2,  q6,  q11 +    vrshr.u16   q11, q12, #8 +    vrshr.u16   q8,  q9,  #8 +    vrshr.u16   q6,  q13, #8 +    vraddhn.u16 d24, q12, q11 +    vraddhn.u16 d25, q9,  q8 +    /* +     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format +     * and put data into d16 - blue, d17 - green, d18 - red +     */ +       vshrn.u16   d17, q2,  #3 +       vshrn.u16   d18, q2,  #8 +    vraddhn.u16 d26, q13, q6 +       vsli.u16    q2,  q2,  #5 +       vsri.u8     d18, d18, #5 +       vsri.u8     d17, d17, #6 +    /* +     * 'combine_over_ca' replacement +     * +     * output: updated dest in d16 - blue, d17 - green, d18 - red +     */ +    vmvn.8      q12, q12 +       vshrn.u16   d16, q2,  #2 +    vmvn.8      d26, d26 +    vmull.u8    q6,  d16, d24 +    vmull.u8    q7,  d17, d25 +    vmull.u8    q11, d18, d26 +.endm + +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail +    /* ... continue 'combine_over_ca' replacement */ +    vrshr.u16   q10, q6,  #8 +    vrshr.u16   q14, q7,  #8 +    vrshr.u16   q15, q11, #8 +    vraddhn.u16 d16, q10, q6 +    vraddhn.u16 d17, q14, q7 +    vraddhn.u16 d18, q15, q11 +    vqadd.u8    q8,  q0,  q8 +    vqadd.u8    d18, d2,  d18 +    /* +     * convert the results in d16, d17, d18 to r5g6b5 and store +     * them into {d28, d29} +     */ +    vshll.u8    q14, d18, #8 +    vshll.u8    q10, d17, #8 +    vshll.u8    q15, d16, #8 +    vsri.u16    q14, q10, #5 +    vsri.u16    q14, q15, #11 +.endm + +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head +    fetch_mask_pixblock +        vrshr.u16   q10, q6, #8 +        vrshr.u16   q14, q7, #8 +    vld1.16     {d4, d5}, [DST_R, :128]! 
+        vrshr.u16   q15, q11, #8 +        vraddhn.u16 d16, q10, q6 +        vraddhn.u16 d17, q14, q7 +        vraddhn.u16 d22, q15, q11 +            /* process_pixblock_head */ +            /* +             * 'combine_mask_ca' replacement +             * +             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A] +             *         mask in          {d24, d25, d26}       [B, G, R] +             * output: updated src in   {d0,  d1,  d2 }       [B, G, R] +             *         updated mask in  {d24, d25, d26}       [B, G, R] +             */ +            vmull.u8    q6,  d26, d10 +        vqadd.u8    q8,  q0, q8 +            vmull.u8    q0,  d24, d8 +        vqadd.u8    d22, d2, d22 +            vmull.u8    q1,  d25, d9 +        /* +         * convert the result in d16, d17, d22 to r5g6b5 and store +         * it into {d28, d29} +         */ +        vshll.u8    q14, d22, #8 +        vshll.u8    q10, d17, #8 +        vshll.u8    q15, d16, #8 +            vmull.u8    q9,  d11, d25 +        vsri.u16    q14, q10, #5 +            vmull.u8    q12, d11, d24 +            vmull.u8    q13, d11, d26 +        vsri.u16    q14, q15, #11 +    cache_preload 8, 8 +            vrshr.u16   q8,  q0,  #8 +            vrshr.u16   q10, q1,  #8 +            vrshr.u16   q11, q6,  #8 +            vraddhn.u16 d0,  q0,  q8 +            vraddhn.u16 d1,  q1,  q10 +            vraddhn.u16 d2,  q6,  q11 +            vrshr.u16   q11, q12, #8 +            vrshr.u16   q8,  q9,  #8 +            vrshr.u16   q6,  q13, #8 +            vraddhn.u16 d24, q12, q11 +            vraddhn.u16 d25, q9,  q8 +                /* +                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar +	         * 8-bit format and put data into d16 - blue, d17 - green, +	         * d18 - red +                 */ +                vshrn.u16   d17, q2,  #3 +                vshrn.u16   d18, q2,  #8 +            vraddhn.u16 d26, q13, q6 +                vsli.u16    q2,  q2,  #5 +                vsri.u8     d17, d17, #6 +                vsri.u8     d18, d18, #5 +            /* +             * 'combine_over_ca' replacement +             * +             * output: updated dest in d16 - blue, d17 - green, d18 - red +             */ +            vmvn.8      q12, q12 +                vshrn.u16   d16, q2,  #2 +            vmvn.8      d26, d26 +            vmull.u8    q7,  d17, d25 +            vmull.u8    q6,  d16, d24 +            vmull.u8    q11, d18, d26 +    vst1.16     {d28, d29}, [DST_W, :128]! 
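+    /*
+     * same component-alpha OVER as above, but the r5g6b5 destination
+     * is first widened to 8 bits per channel (vshrn/vsri replicate the
+     * top bits into the low bits) and the result is packed back with
+     * vshll #8 followed by vsri #5 / #11
+     */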
+.endm + +.macro pixman_composite_over_n_8888_0565_ca_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vpush       {d8-d15} +    vld1.32     {d11[0]}, [DUMMY] +    vdup.8      d8, d11[0] +    vdup.8      d9, d11[1] +    vdup.8      d10, d11[2] +    vdup.8      d11, d11[3] +.endm + +.macro pixman_composite_over_n_8888_0565_ca_cleanup +    vpop        {d8-d15} +.endm + +generate_composite_function \ +    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_n_8888_0565_ca_init, \ +    pixman_composite_over_n_8888_0565_ca_cleanup, \ +    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \ +    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \ +    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_in_n_8_process_pixblock_head +    /* expecting source data in {d0, d1, d2, d3} */ +    /* and destination data in {d4, d5, d6, d7} */ +    vmull.u8    q8,  d4,  d3 +    vmull.u8    q9,  d5,  d3 +    vmull.u8    q10, d6,  d3 +    vmull.u8    q11, d7,  d3 +.endm + +.macro pixman_composite_in_n_8_process_pixblock_tail +    vrshr.u16   q14, q8,  #8 +    vrshr.u16   q15, q9,  #8 +    vrshr.u16   q12, q10, #8 +    vrshr.u16   q13, q11, #8 +    vraddhn.u16 d28, q8,  q14 +    vraddhn.u16 d29, q9,  q15 +    vraddhn.u16 d30, q10, q12 +    vraddhn.u16 d31, q11, q13 +.endm + +.macro pixman_composite_in_n_8_process_pixblock_tail_head +    pixman_composite_in_n_8_process_pixblock_tail +    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]! +    cache_preload 32, 32 +    pixman_composite_in_n_8_process_pixblock_head +    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! 
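+    /* IN with a solid source and a8 destination: dst = dst * alpha(src),
+     * where alpha(src) is the constant broadcast into d3 by the init
+     * macro (rounded division by 255 as usual) */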
+.endm + +.macro pixman_composite_in_n_8_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vld1.32     {d3[0]}, [DUMMY] +    vdup.8      d3, d3[3] +.endm + +.macro pixman_composite_in_n_8_cleanup +.endm + +generate_composite_function \ +    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \ +    FLAG_DST_READWRITE, \ +    32, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_in_n_8_init, \ +    pixman_composite_in_n_8_cleanup, \ +    pixman_composite_in_n_8_process_pixblock_head, \ +    pixman_composite_in_n_8_process_pixblock_tail, \ +    pixman_composite_in_n_8_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    24  /* mask_basereg  */ + +.macro pixman_composite_add_n_8_8_process_pixblock_head +    /* expecting source data in {d8, d9, d10, d11} */ +    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ +    /* and destination data in {d4, d5, d6, d7} */ +    /* mask is in d24, d25, d26, d27 */ +    vmull.u8    q0, d24, d11 +    vmull.u8    q1, d25, d11 +    vmull.u8    q6, d26, d11 +    vmull.u8    q7, d27, d11 +    vrshr.u16   q10, q0, #8 +    vrshr.u16   q11, q1, #8 +    vrshr.u16   q12, q6, #8 +    vrshr.u16   q13, q7, #8 +    vraddhn.u16 d0, q0, q10 +    vraddhn.u16 d1, q1, q11 +    vraddhn.u16 d2, q6, q12 +    vraddhn.u16 d3, q7, q13 +    vqadd.u8    q14, q0, q2 +    vqadd.u8    q15, q1, q3 +.endm + +.macro pixman_composite_add_n_8_8_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_n_8_8_process_pixblock_tail_head +    pixman_composite_add_n_8_8_process_pixblock_tail +    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! +    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]! 
+    fetch_mask_pixblock +    cache_preload 32, 32 +    pixman_composite_add_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_add_n_8_8_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vpush       {d8-d15} +    vld1.32     {d11[0]}, [DUMMY] +    vdup.8      d11, d11[3] +.endm + +.macro pixman_composite_add_n_8_8_cleanup +    vpop        {d8-d15} +.endm + +generate_composite_function \ +    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ +    FLAG_DST_READWRITE, \ +    32, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_add_n_8_8_init, \ +    pixman_composite_add_n_8_8_cleanup, \ +    pixman_composite_add_n_8_8_process_pixblock_head, \ +    pixman_composite_add_n_8_8_process_pixblock_tail, \ +    pixman_composite_add_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8_8_8_process_pixblock_head +    /* expecting source data in {d0, d1, d2, d3} */ +    /* destination data in {d4, d5, d6, d7} */ +    /* mask in {d24, d25, d26, d27} */ +    vmull.u8    q8, d24, d0 +    vmull.u8    q9, d25, d1 +    vmull.u8    q10, d26, d2 +    vmull.u8    q11, d27, d3 +    vrshr.u16   q0, q8, #8 +    vrshr.u16   q1, q9, #8 +    vrshr.u16   q12, q10, #8 +    vrshr.u16   q13, q11, #8 +    vraddhn.u16 d0, q0, q8 +    vraddhn.u16 d1, q1, q9 +    vraddhn.u16 d2, q12, q10 +    vraddhn.u16 d3, q13, q11 +    vqadd.u8    q14, q0, q2 +    vqadd.u8    q15, q1, q3 +.endm + +.macro pixman_composite_add_8_8_8_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_8_8_8_process_pixblock_tail_head +    pixman_composite_add_8_8_8_process_pixblock_tail +    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! +    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]! 
+    fetch_mask_pixblock +    fetch_src_pixblock +    cache_preload 32, 32 +    pixman_composite_add_8_8_8_process_pixblock_head +.endm + +.macro pixman_composite_add_8_8_8_init +.endm + +.macro pixman_composite_add_8_8_8_cleanup +.endm + +generate_composite_function \ +    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ +    FLAG_DST_READWRITE, \ +    32, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_add_8_8_8_init, \ +    pixman_composite_add_8_8_8_cleanup, \ +    pixman_composite_add_8_8_8_process_pixblock_head, \ +    pixman_composite_add_8_8_8_process_pixblock_tail, \ +    pixman_composite_add_8_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_head +    /* expecting source data in {d0, d1, d2, d3} */ +    /* destination data in {d4, d5, d6, d7} */ +    /* mask in {d24, d25, d26, d27} */ +    vmull.u8    q8,  d27, d0 +    vmull.u8    q9,  d27, d1 +    vmull.u8    q10, d27, d2 +    vmull.u8    q11, d27, d3 +    /* 1 cycle bubble */ +    vrsra.u16   q8,  q8,  #8 +    vrsra.u16   q9,  q9,  #8 +    vrsra.u16   q10, q10, #8 +    vrsra.u16   q11, q11, #8 +.endm + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail +    /* 2 cycle bubble */ +    vrshrn.u16  d28, q8,  #8 +    vrshrn.u16  d29, q9,  #8 +    vrshrn.u16  d30, q10, #8 +    vrshrn.u16  d31, q11, #8 +    vqadd.u8    q14, q2,  q14 +    /* 1 cycle bubble */ +    vqadd.u8    q15, q3,  q15 +.endm + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head +    fetch_src_pixblock +        vrshrn.u16  d28, q8,  #8 +    fetch_mask_pixblock +        vrshrn.u16  d29, q9,  #8 +    vmull.u8    q8,  d27, d0 +        vrshrn.u16  d30, q10, #8 +    vmull.u8    q9,  d27, d1 +        vrshrn.u16  d31, q11, #8 +    vmull.u8    q10, d27, d2 +        vqadd.u8    q14, q2,  q14 +    vmull.u8    q11, d27, d3 +        vqadd.u8    q15, q3,  q15 +    vrsra.u16   q8,  q8,  #8 +    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! +    vrsra.u16   q9,  q9,  #8 +        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! 
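+    /*
+     * ADD: dst = saturate(dst + src * alpha(mask) / 255); vqadd.u8
+     * provides the saturation, the vrsra/vrshrn pairs the rounded
+     * division by 255
+     */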
+    vrsra.u16   q10, q10, #8 + +    cache_preload 8, 8 + +    vrsra.u16   q11, q11, #8 +.endm + +generate_composite_function \ +    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ +    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +generate_composite_function \ +    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    27  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_add_n_8_8888_init +    add         DUMMY, sp, #ARGS_STACK_OFFSET +    vld1.32     {d3[0]}, [DUMMY] +    vdup.8      d0, d3[0] +    vdup.8      d1, d3[1] +    vdup.8      d2, d3[2] +    vdup.8      d3, d3[3] +.endm + +.macro pixman_composite_add_n_8_8888_cleanup +.endm + +generate_composite_function \ +    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_add_n_8_8888_init, \ +    pixman_composite_add_n_8_8888_cleanup, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    27  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_add_8888_n_8888_init +    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8) +    vld1.32     {d27[0]}, [DUMMY] +    vdup.8      d27, d27[3] +.endm + +.macro pixman_composite_add_8888_n_8888_cleanup +.endm + +generate_composite_function \ +    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_add_8888_n_8888_init, \ +    pixman_composite_add_8888_n_8888_cleanup, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \ +    
pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    27  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head +    /* expecting source data in {d0, d1, d2, d3} */ +    /* destination data in {d4, d5, d6, d7} */ +    /* solid mask is in d15 */ + +    /* 'in' */ +    vmull.u8    q8, d15, d3 +    vmull.u8    q6, d15, d2 +    vmull.u8    q5, d15, d1 +    vmull.u8    q4, d15, d0 +    vrshr.u16   q13, q8, #8 +    vrshr.u16   q12, q6, #8 +    vrshr.u16   q11, q5, #8 +    vrshr.u16   q10, q4, #8 +    vraddhn.u16 d3, q8, q13 +    vraddhn.u16 d2, q6, q12 +    vraddhn.u16 d1, q5, q11 +    vraddhn.u16 d0, q4, q10 +    vmvn.8      d24, d3  /* get inverted alpha */ +    /* now do alpha blending */ +    vmull.u8    q8, d24, d4 +    vmull.u8    q9, d24, d5 +    vmull.u8    q10, d24, d6 +    vmull.u8    q11, d24, d7 +.endm + +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail +    vrshr.u16   q14, q8, #8 +    vrshr.u16   q15, q9, #8 +    vrshr.u16   q12, q10, #8 +    vrshr.u16   q13, q11, #8 +    vraddhn.u16 d28, q14, q8 +    vraddhn.u16 d29, q15, q9 +    vraddhn.u16 d30, q12, q10 +    vraddhn.u16 d31, q13, q11 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head +    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail +    fetch_src_pixblock +    cache_preload 8, 8 +    fetch_mask_pixblock +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head +    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +generate_composite_function_single_scanline \ +    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init_need_all_regs, \ +    default_cleanup_need_all_regs, \ +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ +    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    12  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_over_8888_n_8888_process_pixblock_head +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail +    vqadd.u8    q14, q0, q14 +    vqadd.u8    q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head +    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! +    pixman_composite_over_8888_n_8888_process_pixblock_tail +    fetch_src_pixblock +    cache_preload 8, 8 +    pixman_composite_over_8888_n_8888_process_pixblock_head +    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! 
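+    /*
+     * OVER with a solid a8 mask: the source is scaled by the constant
+     * mask alpha held in d15 (loaded by init), then blended as
+     * dst = s' + (255 - alpha(s')) * dst / 255 with a saturating add
+     */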
+.endm + +.macro pixman_composite_over_8888_n_8888_init +    add         DUMMY, sp, #48 +    vpush       {d8-d15} +    vld1.32     {d15[0]}, [DUMMY] +    vdup.8      d15, d15[3] +.endm + +.macro pixman_composite_over_8888_n_8888_cleanup +    vpop        {d8-d15} +.endm + +generate_composite_function \ +    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_8888_n_8888_init, \ +    pixman_composite_over_8888_n_8888_cleanup, \ +    pixman_composite_over_8888_n_8888_process_pixblock_head, \ +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \ +    pixman_composite_over_8888_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head +    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! +    pixman_composite_over_8888_n_8888_process_pixblock_tail +    fetch_src_pixblock +    cache_preload 8, 8 +    fetch_mask_pixblock +    pixman_composite_over_8888_n_8888_process_pixblock_head +    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +generate_composite_function \ +    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    default_init_need_all_regs, \ +    default_cleanup_need_all_regs, \ +    pixman_composite_over_8888_n_8888_process_pixblock_head, \ +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \ +    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    12  /* mask_basereg  */ + +generate_composite_function_single_scanline \ +    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init_need_all_regs, \ +    default_cleanup_need_all_regs, \ +    pixman_composite_over_8888_n_8888_process_pixblock_head, \ +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \ +    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    12  /* mask_basereg  */ + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head +    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! +    pixman_composite_over_8888_n_8888_process_pixblock_tail +    fetch_src_pixblock +    cache_preload 8, 8 +    fetch_mask_pixblock +    pixman_composite_over_8888_n_8888_process_pixblock_head +    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! 
+.endm + +generate_composite_function \ +    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    default_init_need_all_regs, \ +    default_cleanup_need_all_regs, \ +    pixman_composite_over_8888_n_8888_process_pixblock_head, \ +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \ +    pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    15  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_0888_process_pixblock_head +.endm + +.macro pixman_composite_src_0888_0888_process_pixblock_tail +.endm + +.macro pixman_composite_src_0888_0888_process_pixblock_tail_head +    vst3.8 {d0, d1, d2}, [DST_W]! +    fetch_src_pixblock +    cache_preload 8, 8 +.endm + +generate_composite_function \ +    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ +    FLAG_DST_WRITEONLY, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_src_0888_0888_process_pixblock_head, \ +    pixman_composite_src_0888_0888_process_pixblock_tail, \ +    pixman_composite_src_0888_0888_process_pixblock_tail_head, \ +    0, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_head +    vswp   d0, d2 +.endm + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail +.endm + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head +    vst4.8 {d0, d1, d2, d3}, [DST_W]! +    fetch_src_pixblock +    vswp   d0, d2 +    cache_preload 8, 8 +.endm + +.macro pixman_composite_src_0888_8888_rev_init +    veor   d3, d3, d3 +.endm + +generate_composite_function \ +    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    pixman_composite_src_0888_8888_rev_init, \ +    default_cleanup, \ +    pixman_composite_src_0888_8888_rev_process_pixblock_head, \ +    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ +    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ +    0, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_head +    vshll.u8    q8, d1, #8 +    vshll.u8    q9, d2, #8 +.endm + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail +    vshll.u8    q14, d0, #8 +    vsri.u16    q14, q8, #5 +    vsri.u16    q14, q9, #11 +.endm + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head +        vshll.u8    q14, d0, #8 +    fetch_src_pixblock +        vsri.u16    q14, q8, #5 +        vsri.u16    q14, q9, #11 +    vshll.u8    q8, d1, #8 +        vst1.16 {d28, d29}, [DST_W, :128]! 
+    vshll.u8    q9, d2, #8 +.endm + +generate_composite_function \ +    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ +    FLAG_DST_WRITEONLY, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_src_0888_0565_rev_process_pixblock_head, \ +    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ +    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_head +    vmull.u8    q8, d3, d0 +    vmull.u8    q9, d3, d1 +    vmull.u8    q10, d3, d2 +.endm + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail +    vrshr.u16   q11, q8, #8 +    vswp        d3, d31 +    vrshr.u16   q12, q9, #8 +    vrshr.u16   q13, q10, #8 +    vraddhn.u16 d30, q11, q8 +    vraddhn.u16 d29, q12, q9 +    vraddhn.u16 d28, q13, q10 +.endm + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head +        vrshr.u16   q11, q8, #8 +        vswp        d3, d31 +        vrshr.u16   q12, q9, #8 +        vrshr.u16   q13, q10, #8 +    fetch_src_pixblock +        vraddhn.u16 d30, q11, q8 +                                    PF add PF_X, PF_X, #8 +                                    PF tst PF_CTL, #0xF +                                    PF addne PF_X, PF_X, #8 +                                    PF subne PF_CTL, PF_CTL, #1 +        vraddhn.u16 d29, q12, q9 +        vraddhn.u16 d28, q13, q10 +    vmull.u8    q8, d3, d0 +    vmull.u8    q9, d3, d1 +    vmull.u8    q10, d3, d2 +        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +                                    PF cmp PF_X, ORIG_W +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +                                    PF subge PF_X, PF_X, ORIG_W +                                    PF subges PF_CTL, PF_CTL, #0x10 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
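+    /*
+     * pixbuf source: the colour channels are premultiplied by the
+     * source alpha on the fly (d3 * d0..d2, rounded /255) while
+     * vswp d3, d31 carries the alpha through to the output unchanged
+     */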
+.endm + +generate_composite_function \ +    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_src_pixbuf_8888_process_pixblock_head, \ +    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ +    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head +    vmull.u8    q8, d3, d0 +    vmull.u8    q9, d3, d1 +    vmull.u8    q10, d3, d2 +.endm + +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail +    vrshr.u16   q11, q8, #8 +    vswp        d3, d31 +    vrshr.u16   q12, q9, #8 +    vrshr.u16   q13, q10, #8 +    vraddhn.u16 d28, q11, q8 +    vraddhn.u16 d29, q12, q9 +    vraddhn.u16 d30, q13, q10 +.endm + +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head +        vrshr.u16   q11, q8, #8 +        vswp        d3, d31 +        vrshr.u16   q12, q9, #8 +        vrshr.u16   q13, q10, #8 +    fetch_src_pixblock +        vraddhn.u16 d28, q11, q8 +                                    PF add PF_X, PF_X, #8 +                                    PF tst PF_CTL, #0xF +                                    PF addne PF_X, PF_X, #8 +                                    PF subne PF_CTL, PF_CTL, #1 +        vraddhn.u16 d29, q12, q9 +        vraddhn.u16 d30, q13, q10 +    vmull.u8    q8, d3, d0 +    vmull.u8    q9, d3, d1 +    vmull.u8    q10, d3, d2 +        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +                                    PF cmp PF_X, ORIG_W +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +                                    PF subge PF_X, PF_X, ORIG_W +                                    PF subges PF_CTL, PF_CTL, #0x10 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
+.endm + +generate_composite_function \ +    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    10, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ +    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ +    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    0, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_over_0565_8_0565_process_pixblock_head +    /* mask is in d15 */ +    convert_0565_to_x888 q4, d2, d1, d0 +    convert_0565_to_x888 q5, d6, d5, d4 +    /* source pixel data is in      {d0, d1, d2, XX} */ +    /* destination pixel data is in {d4, d5, d6, XX} */ +    vmvn.8      d7,  d15 +    vmull.u8    q6,  d15, d2 +    vmull.u8    q5,  d15, d1 +    vmull.u8    q4,  d15, d0 +    vmull.u8    q8,  d7,  d4 +    vmull.u8    q9,  d7,  d5 +    vmull.u8    q13, d7,  d6 +    vrshr.u16   q12, q6,  #8 +    vrshr.u16   q11, q5,  #8 +    vrshr.u16   q10, q4,  #8 +    vraddhn.u16 d2,  q6,  q12 +    vraddhn.u16 d1,  q5,  q11 +    vraddhn.u16 d0,  q4,  q10 +.endm + +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail +    vrshr.u16   q14, q8,  #8 +    vrshr.u16   q15, q9,  #8 +    vrshr.u16   q12, q13, #8 +    vraddhn.u16 d28, q14, q8 +    vraddhn.u16 d29, q15, q9 +    vraddhn.u16 d30, q12, q13 +    vqadd.u8    q0,  q0,  q14 +    vqadd.u8    q1,  q1,  q15 +    /* 32bpp result is in {d0, d1, d2, XX} */ +    convert_8888_to_0565 d2, d1, d0, q14, q15, q3 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head +    fetch_mask_pixblock +    pixman_composite_over_0565_8_0565_process_pixblock_tail +    fetch_src_pixblock +    vld1.16    {d10, d11}, [DST_R, :128]! +    cache_preload 8, 8 +    pixman_composite_over_0565_8_0565_process_pixblock_head +    vst1.16    {d28, d29}, [DST_W, :128]! 
+.endm + +generate_composite_function \ +    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ +    FLAG_DST_READWRITE, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    default_init_need_all_regs, \ +    default_cleanup_need_all_regs, \ +    pixman_composite_over_0565_8_0565_process_pixblock_head, \ +    pixman_composite_over_0565_8_0565_process_pixblock_tail, \ +    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    10,  /* dst_r_basereg */ \ +    8,  /* src_basereg   */ \ +    15  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_over_0565_n_0565_init +    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8) +    vpush       {d8-d15} +    vld1.32     {d15[0]}, [DUMMY] +    vdup.8      d15, d15[3] +.endm + +.macro pixman_composite_over_0565_n_0565_cleanup +    vpop        {d8-d15} +.endm + +generate_composite_function \ +    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ +    FLAG_DST_READWRITE, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    pixman_composite_over_0565_n_0565_init, \ +    pixman_composite_over_0565_n_0565_cleanup, \ +    pixman_composite_over_0565_8_0565_process_pixblock_head, \ +    pixman_composite_over_0565_8_0565_process_pixblock_tail, \ +    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    10, /* dst_r_basereg */ \ +    8,  /* src_basereg   */ \ +    15  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_add_0565_8_0565_process_pixblock_head +    /* mask is in d15 */ +    convert_0565_to_x888 q4, d2, d1, d0 +    convert_0565_to_x888 q5, d6, d5, d4 +    /* source pixel data is in      {d0, d1, d2, XX} */ +    /* destination pixel data is in {d4, d5, d6, XX} */ +    vmull.u8    q6,  d15, d2 +    vmull.u8    q5,  d15, d1 +    vmull.u8    q4,  d15, d0 +    vrshr.u16   q12, q6,  #8 +    vrshr.u16   q11, q5,  #8 +    vrshr.u16   q10, q4,  #8 +    vraddhn.u16 d2,  q6,  q12 +    vraddhn.u16 d1,  q5,  q11 +    vraddhn.u16 d0,  q4,  q10 +.endm + +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail +    vqadd.u8    q0,  q0,  q2 +    vqadd.u8    q1,  q1,  q3 +    /* 32bpp result is in {d0, d1, d2, XX} */ +    convert_8888_to_0565 d2, d1, d0, q14, q15, q3 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head +    fetch_mask_pixblock +    pixman_composite_add_0565_8_0565_process_pixblock_tail +    fetch_src_pixblock +    vld1.16    {d10, d11}, [DST_R, :128]! +    cache_preload 8, 8 +    pixman_composite_add_0565_8_0565_process_pixblock_head +    vst1.16    {d28, d29}, [DST_W, :128]! 
+.endm + +generate_composite_function \ +    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ +    FLAG_DST_READWRITE, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    default_init_need_all_regs, \ +    default_cleanup_need_all_regs, \ +    pixman_composite_add_0565_8_0565_process_pixblock_head, \ +    pixman_composite_add_0565_8_0565_process_pixblock_tail, \ +    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    10, /* dst_r_basereg */ \ +    8,  /* src_basereg   */ \ +    15  /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8_0565_process_pixblock_head +    /* mask is in d15 */ +    convert_0565_to_x888 q5, d6, d5, d4 +    /* destination pixel data is in {d4, d5, d6, xx} */ +    vmvn.8      d24, d15 /* get inverted alpha */ +    /* now do alpha blending */ +    vmull.u8    q8, d24, d4 +    vmull.u8    q9, d24, d5 +    vmull.u8    q10, d24, d6 +.endm + +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail +    vrshr.u16   q14, q8, #8 +    vrshr.u16   q15, q9, #8 +    vrshr.u16   q12, q10, #8 +    vraddhn.u16 d0, q14, q8 +    vraddhn.u16 d1, q15, q9 +    vraddhn.u16 d2, q12, q10 +    /* 32bpp result is in {d0, d1, d2, XX} */ +    convert_8888_to_0565 d2, d1, d0, q14, q15, q3 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head +    fetch_src_pixblock +    pixman_composite_out_reverse_8_0565_process_pixblock_tail +    vld1.16    {d10, d11}, [DST_R, :128]! +    cache_preload 8, 8 +    pixman_composite_out_reverse_8_0565_process_pixblock_head +    vst1.16    {d28, d29}, [DST_W, :128]! +.endm + +generate_composite_function \ +    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ +    FLAG_DST_READWRITE, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    default_init_need_all_regs, \ +    default_cleanup_need_all_regs, \ +    pixman_composite_out_reverse_8_0565_process_pixblock_head, \ +    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ +    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    10, /* dst_r_basereg */ \ +    15, /* src_basereg   */ \ +    0   /* mask_basereg  */ + +/******************************************************************************/ + +.macro pixman_composite_out_reverse_8_8888_process_pixblock_head +    /* src is in d0 */ +    /* destination pixel data is in {d4, d5, d6, d7} */ +    vmvn.8      d1, d0 /* get inverted alpha */ +    /* now do alpha blending */ +    vmull.u8    q8, d1, d4 +    vmull.u8    q9, d1, d5 +    vmull.u8    q10, d1, d6 +    vmull.u8    q11, d1, d7 +.endm + +.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail +    vrshr.u16   q14, q8, #8 +    vrshr.u16   q15, q9, #8 +    vrshr.u16   q12, q10, #8 +    vrshr.u16   q13, q11, #8 +    vraddhn.u16 d28, q14, q8 +    vraddhn.u16 d29, q15, q9 +    vraddhn.u16 d30, q12, q10 +    vraddhn.u16 d31, q13, q11 +    /* 32bpp result is in {d28, d29, d30, d31} */ +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head +    fetch_src_pixblock +    pixman_composite_out_reverse_8_8888_process_pixblock_tail +    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]! 
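+    /* OUT_REVERSE with an a8 source: dst = dst * (255 - src) / 255 per
+     * channel; vmvn in the head computes the inverted source alpha */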
+    cache_preload 8, 8 +    pixman_composite_out_reverse_8_8888_process_pixblock_head +    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +generate_composite_function \ +    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    5, /* prefetch distance */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_out_reverse_8_8888_process_pixblock_head, \ +    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \ +    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4, /* dst_r_basereg */ \ +    0, /* src_basereg   */ \ +    0   /* mask_basereg  */ + +/******************************************************************************/ + +generate_composite_function_nearest_scanline \ +    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_over_8888_8888_process_pixblock_head, \ +    pixman_composite_over_8888_8888_process_pixblock_tail, \ +    pixman_composite_over_8888_8888_process_pixblock_tail_head + +generate_composite_function_nearest_scanline \ +    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_over_8888_0565_process_pixblock_head, \ +    pixman_composite_over_8888_0565_process_pixblock_tail, \ +    pixman_composite_over_8888_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    0,  /* src_basereg   */ \ +    24  /* mask_basereg  */ + +generate_composite_function_nearest_scanline \ +    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \ +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_src_8888_0565_process_pixblock_head, \ +    pixman_composite_src_8888_0565_process_pixblock_tail, \ +    pixman_composite_src_8888_0565_process_pixblock_tail_head + +generate_composite_function_nearest_scanline \ +    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init, \ +    default_cleanup, \ +    pixman_composite_src_0565_8888_process_pixblock_head, \ +    pixman_composite_src_0565_8888_process_pixblock_tail, \ +    pixman_composite_src_0565_8888_process_pixblock_tail_head + +generate_composite_function_nearest_scanline \ +    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \ +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init_need_all_regs, \ +    default_cleanup_need_all_regs, \ +    pixman_composite_over_8888_8_0565_process_pixblock_head, \ +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \ +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    4,  /* dst_r_basereg */ \ +    8,  /* src_basereg   */ \ +    24  /* mask_basereg  */ + +generate_composite_function_nearest_scanline \ +    
pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \ +    FLAG_DST_READWRITE, \ +    8, /* number of pixels, processed in a single block */ \ +    default_init_need_all_regs, \ +    default_cleanup_need_all_regs, \ +    pixman_composite_over_0565_8_0565_process_pixblock_head, \ +    pixman_composite_over_0565_8_0565_process_pixblock_tail, \ +    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ +    28, /* dst_w_basereg */ \ +    10,  /* dst_r_basereg */ \ +    8,  /* src_basereg   */ \ +    15  /* mask_basereg  */ + +/******************************************************************************/ + +/* + * Bilinear scaling support code which tries to provide pixel fetching, color + * format conversion, and interpolation as separate macros which can be used + * as the basic building blocks for constructing bilinear scanline functions. + */ + +.macro bilinear_load_8888 reg1, reg2, tmp +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #2 +    vld1.32   {reg1}, [TMP1], STRIDE +    vld1.32   {reg2}, [TMP1] +.endm + +.macro bilinear_load_0565 reg1, reg2, tmp +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #1 +    vld1.32   {reg2[0]}, [TMP1], STRIDE +    vld1.32   {reg2[1]}, [TMP1] +    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp +.endm + +.macro bilinear_load_and_vertical_interpolate_two_8888 \ +                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 + +    bilinear_load_8888 reg1, reg2, tmp1 +    vmull.u8  acc1, reg1, d28 +    vmlal.u8  acc1, reg2, d29 +    bilinear_load_8888 reg3, reg4, tmp2 +    vmull.u8  acc2, reg3, d28 +    vmlal.u8  acc2, reg4, d29 +.endm + +.macro bilinear_load_and_vertical_interpolate_four_8888 \ +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + +    bilinear_load_and_vertical_interpolate_two_8888 \ +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi +    bilinear_load_and_vertical_interpolate_two_8888 \ +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi +.endm + +.macro bilinear_load_and_vertical_interpolate_two_0565 \ +                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi + +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #1 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #1 +    vld1.32   {acc2lo[0]}, [TMP1], STRIDE +    vld1.32   {acc2hi[0]}, [TMP2], STRIDE +    vld1.32   {acc2lo[1]}, [TMP1] +    vld1.32   {acc2hi[1]}, [TMP2] +    convert_0565_to_x888 acc2, reg3, reg2, reg1 +    vzip.u8   reg1, reg3 +    vzip.u8   reg2, reg4 +    vzip.u8   reg3, reg4 +    vzip.u8   reg1, reg2 +    vmull.u8  acc1, reg1, d28 +    vmlal.u8  acc1, reg2, d29 +    vmull.u8  acc2, reg3, d28 +    vmlal.u8  acc2, reg4, d29 +.endm + +.macro bilinear_load_and_vertical_interpolate_four_0565 \ +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #1 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #1 +    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE +    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE +    vld1.32   {xacc2lo[1]}, [TMP1] +    vld1.32   {xacc2hi[1]}, [TMP2] +    
convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #1 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #1 +    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE +    vzip.u8   xreg1, xreg3 +    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE +    vzip.u8   xreg2, xreg4 +    vld1.32   {yacc2lo[1]}, [TMP1] +    vzip.u8   xreg3, xreg4 +    vld1.32   {yacc2hi[1]}, [TMP2] +    vzip.u8   xreg1, xreg2 +    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 +    vmull.u8  xacc1, xreg1, d28 +    vzip.u8   yreg1, yreg3 +    vmlal.u8  xacc1, xreg2, d29 +    vzip.u8   yreg2, yreg4 +    vmull.u8  xacc2, xreg3, d28 +    vzip.u8   yreg3, yreg4 +    vmlal.u8  xacc2, xreg4, d29 +    vzip.u8   yreg1, yreg2 +    vmull.u8  yacc1, yreg1, d28 +    vmlal.u8  yacc1, yreg2, d29 +    vmull.u8  yacc2, yreg3, d28 +    vmlal.u8  yacc2, yreg4, d29 +.endm + +.macro bilinear_store_8888 numpix, tmp1, tmp2 +.if numpix == 4 +    vst1.32   {d0, d1}, [OUT, :128]! +.elseif numpix == 2 +    vst1.32   {d0}, [OUT, :64]! +.elseif numpix == 1 +    vst1.32   {d0[0]}, [OUT, :32]! +.else +    .error bilinear_store_8888 numpix is unsupported +.endif +.endm + +.macro bilinear_store_0565 numpix, tmp1, tmp2 +    vuzp.u8 d0, d1 +    vuzp.u8 d2, d3 +    vuzp.u8 d1, d3 +    vuzp.u8 d0, d2 +    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 +.if numpix == 4 +    vst1.16   {d2}, [OUT, :64]! +.elseif numpix == 2 +    vst1.32   {d2[0]}, [OUT, :32]! +.elseif numpix == 1 +    vst1.16   {d2[0]}, [OUT, :16]! +.else +    .error bilinear_store_0565 numpix is unsupported +.endif +.endm + +.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt +    bilinear_load_&src_fmt d0, d1, d2 +    vmull.u8  q1, d0, d28 +    vmlal.u8  q1, d1, d29 +    /* 5 cycles bubble */ +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q0, d2, d30 +    vmlal.u16 q0, d3, d30 +    /* 5 cycles bubble */ +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    /* 3 cycles bubble */ +    vmovn.u16 d0, q0 +    /* 1 cycle bubble */ +    bilinear_store_&dst_fmt 1, q2, q3 +.endm + +.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt +    bilinear_load_and_vertical_interpolate_two_&src_fmt \ +                q1, q11, d0, d1, d20, d21, d22, d23 +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q0, d2, d30 +    vmlal.u16 q0, d3, d30 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q10, d22, d31 +    vmlal.u16 q10, d23, d31 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vadd.u16  q12, q12, q13 +    vmovn.u16 d0, q0 +    bilinear_store_&dst_fmt 2, q2, q3 +.endm + +.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt +    bilinear_load_and_vertical_interpolate_four_&src_fmt \ +                q1, q11, d0, d1, d20, d21, d22, d23 \ +                q3, q9,  d4, d5, d16, d17, d18, d19 +    pld       [TMP1, PF_OFFS] +    sub       TMP1, TMP1, STRIDE +    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q0, d2, d30 +    vmlal.u16 q0, d3, d30 +    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q10, d22, d31 +    vmlal.u16 q10, d23, d31 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q2, d6, d30 +    vmlal.u16 q2, d7, d30 +    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS +    pld  
     [TMP2, PF_OFFS] +    vmlsl.u16 q8, d18, d31 +    vmlal.u16 q8, d19, d31 +    vadd.u16  q12, q12, q13 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vmovn.u16 d0, q0 +    vmovn.u16 d1, q2 +    vadd.u16  q12, q12, q13 +    bilinear_store_&dst_fmt 4, q2, q3 +.endm + +.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head +.else +    bilinear_interpolate_four_pixels src_fmt, dst_fmt +.endif +.endm + +.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail +.endif +.endm + +.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head +.else +    bilinear_interpolate_four_pixels src_fmt, dst_fmt +.endif +.endm + +.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head +.else +    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +.endif +.endm + +.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail +.else +    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt +.endif +.endm + +.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head +.else +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +.endif +.endm + +.set BILINEAR_FLAG_UNROLL_4,          0 +.set BILINEAR_FLAG_UNROLL_8,          1 +.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 + +/* + * Main template macro for generating NEON optimized bilinear scanline + * functions. 
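+ *
+ * Rough sketch of the arithmetic (added; not from the upstream sources).
+ * Per color channel, every destination pixel is computed approximately as
+ *
+ *     s      = 1 << BILINEAR_INTERPOLATION_BITS;          (wt + wb == s)
+ *     wx     = (x & 0xffff) >> (16 - BILINEAR_INTERPOLATION_BITS);
+ *     left   = tl * wt + bl * wb;
+ *     right  = tr * wt + br * wb;
+ *     *dst++ = (left * (s - wx) + right * wx) >> (2 * BILINEAR_INTERPOLATION_BITS);
+ *     x     += ux;
+ *
+ * where tl/tr and bl/br are adjacent pixels on the top and bottom source
+ * rows and x/ux are 16.16 fixed-point values. Judging from the register
+ * assignments below (OUT = r0, TOP = r1, BOTTOM = r2, WT = r3, then WB, X,
+ * UX, WIDTH taken from the stack), the generated functions appear to be
+ * called from C roughly as
+ *
+ *     fname (dst, top_row, bottom_row, wt, wb, x, unit_x, width);
+ *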
+ * + * Bilinear scanline scaler macro template uses the following arguments: + *  fname             - name of the function to generate + *  src_fmt           - source color format (8888 or 0565) + *  dst_fmt           - destination color format (8888 or 0565) + *  bpp_shift         - (1 << bpp_shift) is the size of source pixel in bytes + *  prefetch_distance - prefetch in the source image by that many + *                      pixels ahead + */ + +.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ +                                       src_bpp_shift, dst_bpp_shift, \ +                                       prefetch_distance, flags + +pixman_asm_function fname +    OUT       .req      r0 +    TOP       .req      r1 +    BOTTOM    .req      r2 +    WT        .req      r3 +    WB        .req      r4 +    X         .req      r5 +    UX        .req      r6 +    WIDTH     .req      ip +    TMP1      .req      r3 +    TMP2      .req      r4 +    PF_OFFS   .req      r7 +    TMP3      .req      r8 +    TMP4      .req      r9 +    STRIDE    .req      r2 + +    mov       ip, sp +    push      {r4, r5, r6, r7, r8, r9} +    mov       PF_OFFS, #prefetch_distance +    ldmia     ip, {WB, X, UX, WIDTH} +    mul       PF_OFFS, PF_OFFS, UX + +.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 +    vpush     {d8-d15} +.endif + +    sub       STRIDE, BOTTOM, TOP +    .unreq    BOTTOM + +    cmp       WIDTH, #0 +    ble       3f + +    vdup.u16  q12, X +    vdup.u16  q13, UX +    vdup.u8   d28, WT +    vdup.u8   d29, WB +    vadd.u16  d25, d25, d26 + +    /* ensure good destination alignment  */ +    cmp       WIDTH, #1 +    blt       0f +    tst       OUT, #(1 << dst_bpp_shift) +    beq       0f +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vadd.u16  q12, q12, q13 +    bilinear_interpolate_last_pixel src_fmt, dst_fmt +    sub       WIDTH, WIDTH, #1 +0: +    vadd.u16  q13, q13, q13 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vadd.u16  q12, q12, q13 + +    cmp       WIDTH, #2 +    blt       0f +    tst       OUT, #(1 << (dst_bpp_shift + 1)) +    beq       0f +    bilinear_interpolate_two_pixels src_fmt, dst_fmt +    sub       WIDTH, WIDTH, #2 +0: +.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 +/*********** 8 pixels per iteration *****************/ +    cmp       WIDTH, #4 +    blt       0f +    tst       OUT, #(1 << (dst_bpp_shift + 2)) +    beq       0f +    bilinear_interpolate_four_pixels src_fmt, dst_fmt +    sub       WIDTH, WIDTH, #4 +0: +    subs      WIDTH, WIDTH, #8 +    blt       1f +    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) +    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt +    subs      WIDTH, WIDTH, #8 +    blt       5f +0: +    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt +    subs      WIDTH, WIDTH, #8 +    bge       0b +5: +    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt +1: +    tst       WIDTH, #4 +    beq       2f +    bilinear_interpolate_four_pixels src_fmt, dst_fmt +2: +.else +/*********** 4 pixels per iteration *****************/ +    subs      WIDTH, WIDTH, #4 +    blt       1f +    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) +    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt +    subs      WIDTH, WIDTH, #4 +    blt       5f +0: +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +    subs      WIDTH, WIDTH, #4 +    bge       0b +5: +    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt +1: 
+/****************************************************/ +.endif +    /* handle the remaining trailing pixels */ +    tst       WIDTH, #2 +    beq       2f +    bilinear_interpolate_two_pixels src_fmt, dst_fmt +2: +    tst       WIDTH, #1 +    beq       3f +    bilinear_interpolate_last_pixel src_fmt, dst_fmt +3: +.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 +    vpop      {d8-d15} +.endif +    pop       {r4, r5, r6, r7, r8, r9} +    bx        lr + +    .unreq    OUT +    .unreq    TOP +    .unreq    WT +    .unreq    WB +    .unreq    X +    .unreq    UX +    .unreq    WIDTH +    .unreq    TMP1 +    .unreq    TMP2 +    .unreq    PF_OFFS +    .unreq    TMP3 +    .unreq    TMP4 +    .unreq    STRIDE +.endfunc + +.endm + +/*****************************************************************************/ + +.set have_bilinear_interpolate_four_pixels_8888_8888, 1 + +.macro bilinear_interpolate_four_pixels_8888_8888_head +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #2 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #2 + +    vld1.32   {d22}, [TMP1], STRIDE +    vld1.32   {d23}, [TMP1] +    mov       TMP3, X, asr #16 +    add       X, X, UX +    add       TMP3, TOP, TMP3, asl #2 +    vmull.u8  q8, d22, d28 +    vmlal.u8  q8, d23, d29 + +    vld1.32   {d22}, [TMP2], STRIDE +    vld1.32   {d23}, [TMP2] +    mov       TMP4, X, asr #16 +    add       X, X, UX +    add       TMP4, TOP, TMP4, asl #2 +    vmull.u8  q9, d22, d28 +    vmlal.u8  q9, d23, d29 + +    vld1.32   {d22}, [TMP3], STRIDE +    vld1.32   {d23}, [TMP3] +    vmull.u8  q10, d22, d28 +    vmlal.u8  q10, d23, d29 + +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q0, d16, d30 +    vmlal.u16 q0, d17, d30 + +    pld       [TMP4, PF_OFFS] +    vld1.32   {d16}, [TMP4], STRIDE +    vld1.32   {d17}, [TMP4] +    pld       [TMP4, PF_OFFS] +    vmull.u8  q11, d16, d28 +    vmlal.u8  q11, d17, d29 + +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q1, d18, d31 +.endm + +.macro bilinear_interpolate_four_pixels_8888_8888_tail +    vmlal.u16 q1, d19, d31 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q2, d20, d30 +    vmlal.u16 q2, d21, d30 +    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q3, d22, d31 +    vmlal.u16 q3, d23, d31 +    vadd.u16  q12, q12, q13 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) +    vmovn.u16 d6, q0 +    vmovn.u16 d7, q2 +    vadd.u16  q12, q12, q13 +    vst1.32   {d6, d7}, [OUT, :128]! 
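+    /* Added note (not from the upstream sources): in this fast path the
+     * _head macro issues the source loads and the vertical interpolation
+     * for a group of four pixels, the _tail macro finishes the horizontal
+     * interpolation and stores the group, and the _tail_head macro below
+     * interleaves both so that the loads of the next group overlap the
+     * arithmetic of the previous one. */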
+.endm + +.macro bilinear_interpolate_four_pixels_8888_8888_tail_head +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #2 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #2 +        vmlal.u16 q1, d19, d31 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS +        vmlsl.u16 q2, d20, d30 +        vmlal.u16 q2, d21, d30 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS +    vld1.32   {d20}, [TMP1], STRIDE +        vmlsl.u16 q3, d22, d31 +        vmlal.u16 q3, d23, d31 +    vld1.32   {d21}, [TMP1] +    vmull.u8  q8, d20, d28 +    vmlal.u8  q8, d21, d29 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vld1.32   {d22}, [TMP2], STRIDE +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) +        vadd.u16  q12, q12, q13 +    vld1.32   {d23}, [TMP2] +    vmull.u8  q9, d22, d28 +    mov       TMP3, X, asr #16 +    add       X, X, UX +    add       TMP3, TOP, TMP3, asl #2 +    mov       TMP4, X, asr #16 +    add       X, X, UX +    add       TMP4, TOP, TMP4, asl #2 +    vmlal.u8  q9, d23, d29 +    vld1.32   {d22}, [TMP3], STRIDE +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vld1.32   {d23}, [TMP3] +    vmull.u8  q10, d22, d28 +    vmlal.u8  q10, d23, d29 +        vmovn.u16 d6, q0 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS +        vmovn.u16 d7, q2 +    vmlsl.u16 q0, d16, d30 +    vmlal.u16 q0, d17, d30 +    pld       [TMP4, PF_OFFS] +    vld1.32   {d16}, [TMP4], STRIDE +        vadd.u16  q12, q12, q13 +    vld1.32   {d17}, [TMP4] +    pld       [TMP4, PF_OFFS] +    vmull.u8  q11, d16, d28 +    vmlal.u8  q11, d17, d29 +        vst1.32   {d6, d7}, [OUT, :128]! 
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q1, d18, d31 +.endm + +/*****************************************************************************/ + +.set have_bilinear_interpolate_eight_pixels_8888_0565, 1 + +.macro bilinear_interpolate_eight_pixels_8888_0565_head +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #2 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #2 +    vld1.32   {d20}, [TMP1], STRIDE +    vld1.32   {d21}, [TMP1] +    vmull.u8  q8, d20, d28 +    vmlal.u8  q8, d21, d29 +    vld1.32   {d22}, [TMP2], STRIDE +    vld1.32   {d23}, [TMP2] +    vmull.u8  q9, d22, d28 +    mov       TMP3, X, asr #16 +    add       X, X, UX +    add       TMP3, TOP, TMP3, asl #2 +    mov       TMP4, X, asr #16 +    add       X, X, UX +    add       TMP4, TOP, TMP4, asl #2 +    vmlal.u8  q9, d23, d29 +    vld1.32   {d22}, [TMP3], STRIDE +    vld1.32   {d23}, [TMP3] +    vmull.u8  q10, d22, d28 +    vmlal.u8  q10, d23, d29 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q0, d16, d30 +    vmlal.u16 q0, d17, d30 +    pld       [TMP4, PF_OFFS] +    vld1.32   {d16}, [TMP4], STRIDE +    vld1.32   {d17}, [TMP4] +    pld       [TMP4, PF_OFFS] +    vmull.u8  q11, d16, d28 +    vmlal.u8  q11, d17, d29 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q1, d18, d31 + +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #2 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #2 +        vmlal.u16 q1, d19, d31 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS +        vmlsl.u16 q2, d20, d30 +        vmlal.u16 q2, d21, d30 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS +    vld1.32   {d20}, [TMP1], STRIDE +        vmlsl.u16 q3, d22, d31 +        vmlal.u16 q3, d23, d31 +    vld1.32   {d21}, [TMP1] +    vmull.u8  q8, d20, d28 +    vmlal.u8  q8, d21, d29 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vld1.32   {d22}, [TMP2], STRIDE +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) +        vadd.u16  q12, q12, q13 +    vld1.32   {d23}, [TMP2] +    vmull.u8  q9, d22, d28 +    mov       TMP3, X, asr #16 +    add       X, X, UX +    add       TMP3, TOP, TMP3, asl #2 +    mov       TMP4, X, asr #16 +    add       X, X, UX +    add       TMP4, TOP, TMP4, asl #2 +    vmlal.u8  q9, d23, d29 +    vld1.32   {d22}, [TMP3], STRIDE +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vld1.32   {d23}, [TMP3] +    vmull.u8  q10, d22, d28 +    vmlal.u8  q10, d23, d29 +        vmovn.u16 d8, q0 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS +        vmovn.u16 d9, q2 +    vmlsl.u16 q0, d16, d30 +    vmlal.u16 q0, d17, d30 +    pld       [TMP4, PF_OFFS] +    vld1.32   {d16}, [TMP4], STRIDE +        vadd.u16  q12, q12, q13 +    vld1.32   {d17}, [TMP4] +    pld       [TMP4, PF_OFFS] +    vmull.u8  q11, d16, d28 +    vmlal.u8  q11, d17, d29 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q1, d18, d31 +.endm + +.macro bilinear_interpolate_eight_pixels_8888_0565_tail +    vmlal.u16 q1, d19, d31 +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q2, d20, d30 +    
vmlal.u16 q2, d21, d30 +    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q3, d22, d31 +    vmlal.u16 q3, d23, d31 +    vadd.u16  q12, q12, q13 +    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) +    vmovn.u16 d10, q0 +    vmovn.u16 d11, q2 +    vadd.u16  q12, q12, q13 + +    vuzp.u8   d8, d9 +    vuzp.u8   d10, d11 +    vuzp.u8   d9, d11 +    vuzp.u8   d8, d10 +    vshll.u8  q6, d9, #8 +    vshll.u8  q5, d10, #8 +    vshll.u8  q7, d8, #8 +    vsri.u16  q5, q6, #5 +    vsri.u16  q5, q7, #11 +    vst1.32   {d10, d11}, [OUT, :128]! +.endm + +.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #2 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #2 +        vmlal.u16 q1, d19, d31 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +            vuzp.u8 d8, d9 +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS +        vmlsl.u16 q2, d20, d30 +        vmlal.u16 q2, d21, d30 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS +    vld1.32   {d20}, [TMP1], STRIDE +        vmlsl.u16 q3, d22, d31 +        vmlal.u16 q3, d23, d31 +    vld1.32   {d21}, [TMP1] +    vmull.u8  q8, d20, d28 +    vmlal.u8  q8, d21, d29 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vld1.32   {d22}, [TMP2], STRIDE +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) +        vadd.u16  q12, q12, q13 +    vld1.32   {d23}, [TMP2] +    vmull.u8  q9, d22, d28 +    mov       TMP3, X, asr #16 +    add       X, X, UX +    add       TMP3, TOP, TMP3, asl #2 +    mov       TMP4, X, asr #16 +    add       X, X, UX +    add       TMP4, TOP, TMP4, asl #2 +    vmlal.u8  q9, d23, d29 +    vld1.32   {d22}, [TMP3], STRIDE +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vld1.32   {d23}, [TMP3] +    vmull.u8  q10, d22, d28 +    vmlal.u8  q10, d23, d29 +        vmovn.u16 d10, q0 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS +        vmovn.u16 d11, q2 +    vmlsl.u16 q0, d16, d30 +    vmlal.u16 q0, d17, d30 +    pld       [TMP4, PF_OFFS] +    vld1.32   {d16}, [TMP4], STRIDE +        vadd.u16  q12, q12, q13 +    vld1.32   {d17}, [TMP4] +    pld       [TMP4, PF_OFFS] +    vmull.u8  q11, d16, d28 +    vmlal.u8  q11, d17, d29 +            vuzp.u8 d10, d11 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS +    vmlsl.u16 q1, d18, d31 + +    mov       TMP1, X, asr #16 +    add       X, X, UX +    add       TMP1, TOP, TMP1, asl #2 +    mov       TMP2, X, asr #16 +    add       X, X, UX +    add       TMP2, TOP, TMP2, asl #2 +        vmlal.u16 q1, d19, d31 +            vuzp.u8 d9, d11 +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS +            vuzp.u8 d8, d10 +        vmlsl.u16 q2, d20, d30 +        vmlal.u16 q2, d21, d30 +        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS +    vld1.32   {d20}, [TMP1], STRIDE +        vmlsl.u16 q3, d22, d31 +        vmlal.u16 q3, d23, d31 +    vld1.32   {d21}, [TMP1] +    vmull.u8  q8, d20, d28 +    vmlal.u8  q8, d21, d29 +            vshll.u8  
q6, d9, #8 +            vshll.u8  q5, d10, #8 +            vshll.u8  q7, d8, #8 +        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) +            vsri.u16  q5, q6, #5 +        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) +            vsri.u16  q5, q7, #11 +        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) +    vld1.32   {d22}, [TMP2], STRIDE +        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) +        vadd.u16  q12, q12, q13 +    vld1.32   {d23}, [TMP2] +    vmull.u8  q9, d22, d28 +    mov       TMP3, X, asr #16 +    add       X, X, UX +    add       TMP3, TOP, TMP3, asl #2 +    mov       TMP4, X, asr #16 +    add       X, X, UX +    add       TMP4, TOP, TMP4, asl #2 +    vmlal.u8  q9, d23, d29 +    vld1.32   {d22}, [TMP3], STRIDE +        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) +    vld1.32   {d23}, [TMP3] +    vmull.u8  q10, d22, d28 +    vmlal.u8  q10, d23, d29 +        vmovn.u16 d8, q0 +    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS +        vmovn.u16 d9, q2 +    vmlsl.u16 q0, d16, d30 +    vmlal.u16 q0, d17, d30 +    pld       [TMP4, PF_OFFS] +    vld1.32   {d16}, [TMP4], STRIDE +        vadd.u16  q12, q12, q13 +    vld1.32   {d17}, [TMP4] +    pld       [TMP4, PF_OFFS] +    vmull.u8  q11, d16, d28 +    vmlal.u8  q11, d17, d29 +    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS +            vst1.32   {d10, d11}, [OUT, :128]! +    vmlsl.u16 q1, d18, d31 +.endm +/*****************************************************************************/ + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \ +    2, 2, 28, BILINEAR_FLAG_UNROLL_4 + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \ +    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \ +    1, 2, 28, BILINEAR_FLAG_UNROLL_4 + +generate_bilinear_scanline_func \ +    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \ +    1, 1, 28, BILINEAR_FLAG_UNROLL_4 diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-neon-asm.h b/libs/pixman-0.40.0/pixman/pixman-arm-neon-asm.h new file mode 100644 index 0000000..bdcf6a9 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm-neon-asm.h @@ -0,0 +1,1184 @@ +/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions, based on a common template.
+ * Any combinations of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp, 32bpp color formats are supported.
+ *
+ * This macro takes care of:
+ *  - handling of leading and trailing unaligned pixels
+ *  - doing most of the work related to L2 cache preload
+ *  - encouraging the use of software pipelining for better instruction
+ *    scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros, which should implement basic code chunks responsible for
+ * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
+ * examples.
+ *
+ * TODO:
+ *  - try overlapped pixel method (from Ian Rickards) when processing
+ *    exactly two blocks of pixels
+ *  - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune the behavior of the generated functions.
+ */
+.set FLAG_DST_WRITEONLY,       0
+.set FLAG_DST_READWRITE,       1
+.set FLAG_DEINTERLEAVE_32BPP,  2
+
+/*
+ * Offset in stack where mask and source pointer/stride can be accessed
+ * from 'init' macro. This is useful for doing special handling for solid mask.
+ */
+.set ARGS_STACK_OFFSET,        40
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
+    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
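+    /* Added usage note (not from the upstream sources): the pixldst*
+     * helpers above and the pixld/pixst dispatchers that follow select the
+     * appropriate vld1/vst1 (or vld3/vld4) form from the total byte count
+     * and element size, adding an alignment hint when the caller permits
+     * one. For example, 'pixld 8, 16, 4, SRC' should expand to a single
+     * 'vld1.16 {d6, d7}, [SRC]!' loading eight r5g6b5 pixels. */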
+.endm + +.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits +.if numbytes == 32 +    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ +                              %(basereg+6), %(basereg+7), mem_operand, abits +.elseif numbytes == 16 +    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits +.elseif numbytes == 8 +    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits +.elseif numbytes == 4 +    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) +        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits +    .elseif elem_size == 16 +        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits +        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits +    .else +        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits +        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits +        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits +        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits +    .endif +.elseif numbytes == 2 +    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) +        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits +    .else +        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits +        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits +    .endif +.elseif numbytes == 1 +    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits +.else +    .error "unsupported size: numbytes" +.endif +.endm + +.macro pixld numpix, bpp, basereg, mem_operand, abits=0 +.if bpp > 0 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) +    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ +                      %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) +    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.else +    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits +.endif +.endif +.endm + +.macro pixst numpix, bpp, basereg, mem_operand, abits=0 +.if bpp > 0 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) +    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ +                      %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) +    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix 
== 1)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixld numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixst numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP1, mem_operand, TMP1, asl #1
+    mov     TMP2, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[0]}, [TMP1, :16]
+    mov     TMP1, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[1]}, [TMP2, :16]
+    mov     TMP2, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[2]}, [TMP1, :16]
+    vld1.16 {d&reg1&[3]}, [TMP2, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    vld1.32 {d&reg1&[1]}, [TMP2, :32]
+.else
+    .error "unsupported"
+.endif
+.endm
+
+.macro pixld2_s elem_size, reg1, reg2, mem_operand
+.if 0 /* elem_size == 32 */
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    sub     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg2&[0]}, [TMP2, :32]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[1]}, [TMP1, :32]
+    vld1.32 {d&reg2&[1]}, [TMP2, :32]
+.else
+    pixld1_s elem_size, reg1, mem_operand
+    pixld1_s elem_size, reg2, mem_operand
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32
+    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
+    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
+    pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16
+    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
+.elseif numbytes == 8
+    pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4
+    .if elem_size == 32
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .elseif elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 4, mem_operand
+        pixld0_s elem_size, %(basereg+0), 5, mem_operand
+        pixld0_s elem_size, %(basereg+0), 6, mem_operand
+        pixld0_s elem_size, %(basereg+0), 7, mem_operand
+    .endif
+.elseif numbytes == 2
+    .if elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .endif
+.elseif numbytes == 1
+    pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0
+    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif
+.endm
+
+.macro vuzp8 reg1, reg2
+    vuzp.8 d&reg1, d&reg2
+.endm
+
+.macro vzip8 reg1, reg2
+    vzip.8 d&reg1, d&reg2
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vuzp8 %(basereg+0), %(basereg+1)
+    vuzp8 %(basereg+2), %(basereg+3)
+    vuzp8 %(basereg+1), %(basereg+3)
+    vuzp8 %(basereg+0), %(basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vzip8 %(basereg+0), %(basereg+2)
+    vzip8 %(basereg+1), %(basereg+3)
+    vzip8 %(basereg+2), %(basereg+3)
+    vzip8 %(basereg+0), %(basereg+1)
+.endif
+.endm
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * cache preload logic is mostly independent of the rest of the pixel
+ * processing code. It starts at the top left pixel and moves forward
+ * across pixels and can jump across scanlines. Prefetch distance is
+ * handled in an 'incremental' way: it starts from 0 and advances to the
+ * optimal distance over time. After reaching optimal prefetch distance,
+ * it is kept constant. There are some checks which prevent prefetching
+ * unneeded pixel lines below the image (but it still can prefetch a bit
+ * more data on the right side of the image - not a big issue and may
+ * be actually helpful when rendering text glyphs). An additional trick is
+ * the use of an LDR instruction for prefetch instead of PLD when moving to
+ * the next line: there is a high chance of getting a TLB miss in this case,
+ * and PLD would be useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working with
+ * fully cached data). But in reality, due to having a separate pipeline and
+ * instruction queue for the NEON unit in the ARM Cortex-A8, normal ARM code
+ * can execute simultaneously with NEON and be completely shadowed by it. Thus
+ * we get no performance overhead at all (*). This looks like a very nice
+ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
+ * but still can implement some rather advanced prefetch logic in software
+ * for almost zero cost!
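+ *
+ * Rough sketch of the logic (added; not from the upstream sources), in
+ * C-like pseudo code:
+ *
+ *     pf_x += std_increment;                        advance with the processed pixels
+ *     if (pf_ctl & 0xf) {                           still ramping the distance up
+ *         pf_x += boost_increment;
+ *         pf_ctl--;
+ *     }
+ *     prefetch (pf_src + (pf_x << src_bpp_shift));  likewise for dst and mask
+ *     if (pf_x >= orig_w) {                         crossed the end of the scanline
+ *         pf_x  -= orig_w;
+ *         pf_ctl -= 0x10;                           one less scanline left
+ *         if (pf_ctl >= 0)                          advance to the next line, using
+ *             pf_src += src_stride;                 an LDR instead of a PLD
+ *     }
+ *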
+ * + * (*) The overhead of the prefetcher is visible when running some trivial + * pixels processing like simple copy. Anyway, having prefetch is a must + * when working with the graphics data. + */ +.macro PF a, x:vararg +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) +    a x +.endif +.endm + +.macro cache_preload std_increment, boost_increment +.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) +.if regs_shortage +    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ +.endif +.if std_increment != 0 +    PF add PF_X, PF_X, #std_increment +.endif +    PF tst PF_CTL, #0xF +    PF addne PF_X, PF_X, #boost_increment +    PF subne PF_CTL, PF_CTL, #1 +    PF cmp PF_X, ORIG_W +.if src_bpp_shift >= 0 +    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +.endif +.if dst_r_bpp != 0 +    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +.endif +.if mask_bpp_shift >= 0 +    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] +.endif +    PF subge PF_X, PF_X, ORIG_W +    PF subges PF_CTL, PF_CTL, #0x10 +.if src_bpp_shift >= 0 +    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +.endif +.if dst_r_bpp != 0 +    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! +.endif +.if mask_bpp_shift >= 0 +    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! +.endif +.endif +.endm + +.macro cache_preload_simple +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) +.if src_bpp > 0 +    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] +.endif +.if dst_r_bpp > 0 +    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] +.endif +.if mask_bpp > 0 +    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] +.endif +.endif +.endm + +.macro fetch_mask_pixblock +    pixld       pixblock_size, mask_bpp, \ +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK +.endm + +/* + * Macro which is used to process leading pixels until destination + * pointer is properly aligned (at 16 bytes boundary). When destination + * buffer uses 16bpp format, this is unnecessary, or even pointless. 
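+ *
+ * For example (added note, not from the upstream sources): with a 32bpp
+ * destination that is misaligned by 12 bytes, one pixel (address bit 2 set)
+ * and then two pixels (address bit 3 set) are composited through the normal
+ * head/tail path before the 16-byte aligned main loop is entered.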
+ */ +.macro ensure_destination_ptr_alignment process_pixblock_head, \ +                                        process_pixblock_tail, \ +                                        process_pixblock_tail_head +.if dst_w_bpp != 24 +    tst         DST_R, #0xF +    beq         2f + +.irp lowbit, 1, 2, 4, 8, 16 +local skip1 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if lowbit < 16 /* we don't need more than 16-byte alignment */ +    tst         DST_R, #lowbit +    beq         1f +.endif +    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC +    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK +.if dst_r_bpp > 0 +    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R +.else +    add         DST_R, DST_R, #lowbit +.endif +    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) +    sub         W, W, #(lowbit * 8 / dst_w_bpp) +1: +.endif +.endr +    pixdeinterleave src_bpp, src_basereg +    pixdeinterleave mask_bpp, mask_basereg +    pixdeinterleave dst_r_bpp, dst_r_basereg + +    process_pixblock_head +    cache_preload 0, pixblock_size +    cache_preload_simple +    process_pixblock_tail + +    pixinterleave dst_w_bpp, dst_w_basereg +.irp lowbit, 1, 2, 4, 8, 16 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if lowbit < 16 /* we don't need more than 16-byte alignment */ +    tst         DST_W, #lowbit +    beq         1f +.endif +    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W +1: +.endif +.endr +.endif +2: +.endm + +/* + * Special code for processing up to (pixblock_size - 1) remaining + * trailing pixels. As SIMD processing performs operation on + * pixblock_size pixels, anything smaller than this has to be loaded + * and stored in a special way. Loading and storing of pixel data is + * performed in such a way that we fill some 'slots' in the NEON + * registers (some slots naturally are unused), then perform compositing + * operation as usual. In the end, the data is taken from these 'slots' + * and saved to memory. 
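+ *
+ * For example (added; not from the upstream sources): with pixblock_size
+ * set to 8 and five leftover pixels, the chunk loop loads a chunk of 4 and
+ * a chunk of 1 into otherwise unused register slots, runs one normal
+ * head/tail pass, and then stores the same 4 + 1 chunks back, roughly:
+ *
+ *     for (chunk = pixblock_size / 2; chunk > 0; chunk /= 2)
+ *         if (width & chunk)
+ *             load (and later store) a chunk of that many pixels;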
+ * + * cache_preload_flag - allows to suppress prefetch if + *                      set to 0 + * dst_aligned_flag   - selects whether destination buffer + *                      is aligned + */ +.macro process_trailing_pixels cache_preload_flag, \ +                               dst_aligned_flag, \ +                               process_pixblock_head, \ +                               process_pixblock_tail, \ +                               process_pixblock_tail_head +    tst         W, #(pixblock_size - 1) +    beq         2f +.irp chunk_size, 16, 8, 4, 2, 1 +.if pixblock_size > chunk_size +    tst         W, #chunk_size +    beq         1f +    pixld_src   chunk_size, src_bpp, src_basereg, SRC +    pixld       chunk_size, mask_bpp, mask_basereg, MASK +.if dst_aligned_flag != 0 +    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R +.else +    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R +.endif +.if cache_preload_flag != 0 +    PF add      PF_X, PF_X, #chunk_size +.endif +1: +.endif +.endr +    pixdeinterleave src_bpp, src_basereg +    pixdeinterleave mask_bpp, mask_basereg +    pixdeinterleave dst_r_bpp, dst_r_basereg + +    process_pixblock_head +.if cache_preload_flag != 0 +    cache_preload 0, pixblock_size +    cache_preload_simple +.endif +    process_pixblock_tail +    pixinterleave dst_w_bpp, dst_w_basereg +.irp chunk_size, 16, 8, 4, 2, 1 +.if pixblock_size > chunk_size +    tst         W, #chunk_size +    beq         1f +.if dst_aligned_flag != 0 +    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.else +    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.endif +1: +.endif +.endr +2: +.endm + +/* + * Macro, which performs all the needed operations to switch to the next + * scanline and start the next loop iteration unless all the scanlines + * are already processed. 
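+ *
+ * In effect (added note, not from the upstream sources) this performs the
+ * usual
+ *     dst += dst_stride - width;    and similarly for src and mask
+ * pointer adjustment (scaled to bytes with the per-format bpp shifts),
+ * reloads the width, decrements the scanline counter and branches back to
+ * the start of the loop while scanlines remain.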
+ */ +.macro advance_to_next_scanline start_of_loop_label +.if regs_shortage +    ldrd        W, [sp] /* load W and H (width and height) from stack */ +.else +    mov         W, ORIG_W +.endif +    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift +.if src_bpp != 0 +    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift +.endif +.if mask_bpp != 0 +    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift +.endif +.if (dst_w_bpp != 24) +    sub         DST_W, DST_W, W, lsl #dst_bpp_shift +.endif +.if (src_bpp != 24) && (src_bpp != 0) +    sub         SRC, SRC, W, lsl #src_bpp_shift +.endif +.if (mask_bpp != 24) && (mask_bpp != 0) +    sub         MASK, MASK, W, lsl #mask_bpp_shift +.endif +    subs        H, H, #1 +    mov         DST_R, DST_W +.if regs_shortage +    str         H, [sp, #4] /* save updated height to stack */ +.endif +    bge         start_of_loop_label +.endm + +/* + * Registers are allocated in the following way by default: + * d0, d1, d2, d3     - reserved for loading source pixel data + * d4, d5, d6, d7     - reserved for loading destination pixel data + * d24, d25, d26, d27 - reserved for loading mask pixel data + * d28, d29, d30, d31 - final destination pixel data for writeback to memory + */ +.macro generate_composite_function fname, \ +                                   src_bpp_, \ +                                   mask_bpp_, \ +                                   dst_w_bpp_, \ +                                   flags, \ +                                   pixblock_size_, \ +                                   prefetch_distance, \ +                                   init, \ +                                   cleanup, \ +                                   process_pixblock_head, \ +                                   process_pixblock_tail, \ +                                   process_pixblock_tail_head, \ +                                   dst_w_basereg_ = 28, \ +                                   dst_r_basereg_ = 4, \ +                                   src_basereg_   = 0, \ +                                   mask_basereg_  = 24 + +    pixman_asm_function fname + +    push        {r4-r12, lr}        /* save all registers */ + +/* + * Select prefetch type for this function. If prefetch distance is + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch + * has to be used instead of ADVANCED. 
+ */ +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT +.if prefetch_distance == 0 +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE +.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ +        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE +.endif + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ +    .set src_bpp, src_bpp_ +    .set mask_bpp, mask_bpp_ +    .set dst_w_bpp, dst_w_bpp_ +    .set pixblock_size, pixblock_size_ +    .set dst_w_basereg, dst_w_basereg_ +    .set dst_r_basereg, dst_r_basereg_ +    .set src_basereg, src_basereg_ +    .set mask_basereg, mask_basereg_ + +    .macro pixld_src x:vararg +        pixld x +    .endm +    .macro fetch_src_pixblock +        pixld_src   pixblock_size, src_bpp, \ +                    (src_basereg - pixblock_size * src_bpp / 64), SRC +    .endm +/* + * Assign symbolic names to registers + */ +    W           .req        r0      /* width (is updated during processing) */ +    H           .req        r1      /* height (is updated during processing) */ +    DST_W       .req        r2      /* destination buffer pointer for writes */ +    DST_STRIDE  .req        r3      /* destination image stride */ +    SRC         .req        r4      /* source buffer pointer */ +    SRC_STRIDE  .req        r5      /* source image stride */ +    DST_R       .req        r6      /* destination buffer pointer for reads */ + +    MASK        .req        r7      /* mask pointer */ +    MASK_STRIDE .req        r8      /* mask stride */ + +    PF_CTL      .req        r9      /* combined lines counter and prefetch */ +                                    /* distance increment counter */ +    PF_X        .req        r10     /* pixel index in a scanline for current */ +                                    /* pretetch position */ +    PF_SRC      .req        r11     /* pointer to source scanline start */ +                                    /* for prefetch purposes */ +    PF_DST      .req        r12     /* pointer to destination scanline start */ +                                    /* for prefetch purposes */ +    PF_MASK     .req        r14     /* pointer to mask scanline start */ +                                    /* for prefetch purposes */ +/* + * Check whether we have enough registers for all the local variables. + * If we don't have enough registers, original width and height are + * kept on top of stack (and 'regs_shortage' variable is set to indicate + * this for the rest of code). Even if there are enough registers, the + * allocation scheme may be a bit different depending on whether source + * or mask is not used. 
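+ *
+ * (Added note, not from the upstream sources.) In the worst case - advanced
+ * prefetch enabled with both a source and a mask present - ORIG_W has to
+ * share r1 with the height counter, so the original width and the updated
+ * height are kept on the stack instead and 'regs_shortage' is set; this is
+ * what the push {r0, r1} / str H, [sp, #4] sequences further down handle.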
+ */ +.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED) +    ORIG_W      .req        r10     /* saved original width */ +    DUMMY       .req        r12     /* temporary register */ +    .set        regs_shortage, 0 +.elseif mask_bpp == 0 +    ORIG_W      .req        r7      /* saved original width */ +    DUMMY       .req        r8      /* temporary register */ +    .set        regs_shortage, 0 +.elseif src_bpp == 0 +    ORIG_W      .req        r4      /* saved original width */ +    DUMMY       .req        r5      /* temporary register */ +    .set        regs_shortage, 0 +.else +    ORIG_W      .req        r1      /* saved original width */ +    DUMMY       .req        r1      /* temporary register */ +    .set        regs_shortage, 1 +.endif + +    .set mask_bpp_shift, -1 +.if src_bpp == 32 +    .set src_bpp_shift, 2 +.elseif src_bpp == 24 +    .set src_bpp_shift, 0 +.elseif src_bpp == 16 +    .set src_bpp_shift, 1 +.elseif src_bpp == 8 +    .set src_bpp_shift, 0 +.elseif src_bpp == 0 +    .set src_bpp_shift, -1 +.else +    .error "requested src bpp (src_bpp) is not supported" +.endif +.if mask_bpp == 32 +    .set mask_bpp_shift, 2 +.elseif mask_bpp == 24 +    .set mask_bpp_shift, 0 +.elseif mask_bpp == 8 +    .set mask_bpp_shift, 0 +.elseif mask_bpp == 0 +    .set mask_bpp_shift, -1 +.else +    .error "requested mask bpp (mask_bpp) is not supported" +.endif +.if dst_w_bpp == 32 +    .set dst_bpp_shift, 2 +.elseif dst_w_bpp == 24 +    .set dst_bpp_shift, 0 +.elseif dst_w_bpp == 16 +    .set dst_bpp_shift, 1 +.elseif dst_w_bpp == 8 +    .set dst_bpp_shift, 0 +.else +    .error "requested dst bpp (dst_w_bpp) is not supported" +.endif + +.if (((flags) & FLAG_DST_READWRITE) != 0) +    .set dst_r_bpp, dst_w_bpp +.else +    .set dst_r_bpp, 0 +.endif +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) +    .set DEINTERLEAVE_32BPP_ENABLED, 1 +.else +    .set DEINTERLEAVE_32BPP_ENABLED, 0 +.endif + +.if prefetch_distance < 0 || prefetch_distance > 15 +    .error "invalid prefetch distance (prefetch_distance)" +.endif + +.if src_bpp > 0 +    ldr         SRC, [sp, #40] +.endif +.if mask_bpp > 0 +    ldr         MASK, [sp, #48] +.endif +    PF mov      PF_X, #0 +.if src_bpp > 0 +    ldr         SRC_STRIDE, [sp, #44] +.endif +.if mask_bpp > 0 +    ldr         MASK_STRIDE, [sp, #52] +.endif +    mov         DST_R, DST_W + +.if src_bpp == 24 +    sub         SRC_STRIDE, SRC_STRIDE, W +    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1 +.endif +.if mask_bpp == 24 +    sub         MASK_STRIDE, MASK_STRIDE, W +    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1 +.endif +.if dst_w_bpp == 24 +    sub         DST_STRIDE, DST_STRIDE, W +    sub         DST_STRIDE, DST_STRIDE, W, lsl #1 +.endif + +/* + * Setup advanced prefetcher initial state + */ +    PF mov      PF_SRC, SRC +    PF mov      PF_DST, DST_R +    PF mov      PF_MASK, MASK +    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ +    PF mov      PF_CTL, H, lsl #4 +    PF add      PF_CTL, #(prefetch_distance - 0x10) + +    init +.if regs_shortage +    push        {r0, r1} +.endif +    subs        H, H, #1 +.if regs_shortage +    str         H, [sp, #4] /* save updated height to stack */ +.else +    mov         ORIG_W, W +.endif +    blt         9f +    cmp         W, #(pixblock_size * 2) +    blt         8f +/* + * This is the start of the pipelined loop, which if optimized for + * long scanlines + */ +0: +    ensure_destination_ptr_alignment process_pixblock_head, \ +                                     process_pixblock_tail, \ +                   
                  process_pixblock_tail_head + +    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ +    pixld_a     pixblock_size, dst_r_bpp, \ +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R +    fetch_src_pixblock +    pixld       pixblock_size, mask_bpp, \ +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK +    PF add      PF_X, PF_X, #pixblock_size +    process_pixblock_head +    cache_preload 0, pixblock_size +    cache_preload_simple +    subs        W, W, #(pixblock_size * 2) +    blt         2f +1: +    process_pixblock_tail_head +    cache_preload_simple +    subs        W, W, #pixblock_size +    bge         1b +2: +    process_pixblock_tail +    pixst_a     pixblock_size, dst_w_bpp, \ +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + +    /* Process the remaining trailing pixels in the scanline */ +    process_trailing_pixels 1, 1, \ +                            process_pixblock_head, \ +                            process_pixblock_tail, \ +                            process_pixblock_tail_head +    advance_to_next_scanline 0b + +.if regs_shortage +    pop         {r0, r1} +.endif +    cleanup +    pop         {r4-r12, pc}  /* exit */ +/* + * This is the start of the loop, designed to process images with small width + * (less than pixblock_size * 2 pixels). In this case neither pipelining + * nor prefetch are used. + */ +8: +    /* Process exactly pixblock_size pixels if needed */ +    tst         W, #pixblock_size +    beq         1f +    pixld       pixblock_size, dst_r_bpp, \ +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R +    fetch_src_pixblock +    pixld       pixblock_size, mask_bpp, \ +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK +    process_pixblock_head +    process_pixblock_tail +    pixst       pixblock_size, dst_w_bpp, \ +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W +1: +    /* Process the remaining trailing pixels in the scanline */ +    process_trailing_pixels 0, 0, \ +                            process_pixblock_head, \ +                            process_pixblock_tail, \ +                            process_pixblock_tail_head +    advance_to_next_scanline 8b +9: +.if regs_shortage +    pop         {r0, r1} +.endif +    cleanup +    pop         {r4-r12, pc}  /* exit */ + +    .purgem     fetch_src_pixblock +    .purgem     pixld_src + +    .unreq      SRC +    .unreq      MASK +    .unreq      DST_R +    .unreq      DST_W +    .unreq      ORIG_W +    .unreq      W +    .unreq      H +    .unreq      SRC_STRIDE +    .unreq      DST_STRIDE +    .unreq      MASK_STRIDE +    .unreq      PF_CTL +    .unreq      PF_X +    .unreq      PF_SRC +    .unreq      PF_DST +    .unreq      PF_MASK +    .unreq      DUMMY +    .endfunc +.endm + +/* + * A simplified variant of function generation template for a single + * scanline processing (for implementing pixman combine functions) + */ +.macro generate_composite_function_scanline        use_nearest_scaling, \ +                                                   fname, \ +                                                   src_bpp_, \ +                                                   mask_bpp_, \ +                                                   dst_w_bpp_, \ +                                                   flags, \ +                                                   pixblock_size_, \ +                                                   init, \ +              
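/*
 * The "head (tail_head) ... (tail_head) tail" comment above describes a
 * software-pipelined loop: while the arithmetic for pixel block N is being
 * finished, the loads for block N + 1 are already in flight.  A hedged C
 * outline of that control flow (the stage functions are placeholders for
 * the process_pixblock_* macros):
 */
static void stage_head (void)      { /* loads + first half of the math          */ }
static void stage_tail (void)      { /* second half of the math + store         */ }
static void stage_tail_head (void) { /* tail of block N fused with head of N+1  */ }

static void pipelined_scanline (int w, int pixblock_size)
{
    stage_head ();                       /* prologue: start the first block  */
    w -= 2 * pixblock_size;
    if (w >= 0)
    {
        do
        {
            stage_tail_head ();          /* steady state, one block per pass */
            w -= pixblock_size;
        }
        while (w >= 0);
    }
    stage_tail ();                       /* epilogue: drain the last block   */
    /* the remaining trailing pixels are handled afterwards by
       process_trailing_pixels */
}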
                                     cleanup, \ +                                                   process_pixblock_head, \ +                                                   process_pixblock_tail, \ +                                                   process_pixblock_tail_head, \ +                                                   dst_w_basereg_ = 28, \ +                                                   dst_r_basereg_ = 4, \ +                                                   src_basereg_   = 0, \ +                                                   mask_basereg_  = 24 + +    pixman_asm_function fname + +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ +    .set src_bpp, src_bpp_ +    .set mask_bpp, mask_bpp_ +    .set dst_w_bpp, dst_w_bpp_ +    .set pixblock_size, pixblock_size_ +    .set dst_w_basereg, dst_w_basereg_ +    .set dst_r_basereg, dst_r_basereg_ +    .set src_basereg, src_basereg_ +    .set mask_basereg, mask_basereg_ + +.if use_nearest_scaling != 0 +    /* +     * Assign symbolic names to registers for nearest scaling +     */ +    W           .req        r0 +    DST_W       .req        r1 +    SRC         .req        r2 +    VX          .req        r3 +    UNIT_X      .req        ip +    MASK        .req        lr +    TMP1        .req        r4 +    TMP2        .req        r5 +    DST_R       .req        r6 +    SRC_WIDTH_FIXED .req        r7 + +    .macro pixld_src x:vararg +        pixld_s x +    .endm + +    ldr         UNIT_X, [sp] +    push        {r4-r8, lr} +    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)] +    .if mask_bpp != 0 +    ldr         MASK, [sp, #(24 + 8)] +    .endif +.else +    /* +     * Assign symbolic names to registers +     */ +    W           .req        r0      /* width (is updated during processing) */ +    DST_W       .req        r1      /* destination buffer pointer for writes */ +    SRC         .req        r2      /* source buffer pointer */ +    DST_R       .req        ip      /* destination buffer pointer for reads */ +    MASK        .req        r3      /* mask pointer */ + +    .macro pixld_src x:vararg +        pixld x +    .endm +.endif + +.if (((flags) & FLAG_DST_READWRITE) != 0) +    .set dst_r_bpp, dst_w_bpp +.else +    .set dst_r_bpp, 0 +.endif +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) +    .set DEINTERLEAVE_32BPP_ENABLED, 1 +.else +    .set DEINTERLEAVE_32BPP_ENABLED, 0 +.endif + +    .macro fetch_src_pixblock +        pixld_src   pixblock_size, src_bpp, \ +                    (src_basereg - pixblock_size * src_bpp / 64), SRC +    .endm + +    init +    mov         DST_R, DST_W + +    cmp         W, #pixblock_size +    blt         8f + +    ensure_destination_ptr_alignment process_pixblock_head, \ +                                     process_pixblock_tail, \ +                                     process_pixblock_tail_head + +    subs        W, W, #pixblock_size +    blt         7f + +    /* Implement "head (tail_head) ... 
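/*
 * The "ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)]" above reads a stack argument
 * after "push {r4-r8, lr}" has moved the stack pointer down by six words.
 * A small C sketch of that offset arithmetic (helper name is hypothetical):
 */
static int stack_arg_offset_after_push (int regs_pushed, int offset_before_push)
{
    /* each pushed ARM register occupies 4 bytes */
    return regs_pushed * 4 + offset_before_push;
}
/* stack_arg_offset_after_push (6, 4) == 28 == 24 + 4: the argument that sat
   at [sp, #4] on entry (after UNIT_X at [sp, #0]) is found at [sp, #28] once
   r4-r8 and lr have been pushed; MASK likewise moves from [sp, #8] to
   [sp, #(24 + 8)]. */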
(tail_head) tail" loop pattern */ +    pixld_a     pixblock_size, dst_r_bpp, \ +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R +    fetch_src_pixblock +    pixld       pixblock_size, mask_bpp, \ +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK +    process_pixblock_head +    subs        W, W, #pixblock_size +    blt         2f +1: +    process_pixblock_tail_head +    subs        W, W, #pixblock_size +    bge         1b +2: +    process_pixblock_tail +    pixst_a     pixblock_size, dst_w_bpp, \ +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W +7: +    /* Process the remaining trailing pixels in the scanline (dst aligned) */ +    process_trailing_pixels 0, 1, \ +                            process_pixblock_head, \ +                            process_pixblock_tail, \ +                            process_pixblock_tail_head + +    cleanup +.if use_nearest_scaling != 0 +    pop         {r4-r8, pc}  /* exit */ +.else +    bx          lr  /* exit */ +.endif +8: +    /* Process the remaining trailing pixels in the scanline (dst unaligned) */ +    process_trailing_pixels 0, 0, \ +                            process_pixblock_head, \ +                            process_pixblock_tail, \ +                            process_pixblock_tail_head + +    cleanup + +.if use_nearest_scaling != 0 +    pop         {r4-r8, pc}  /* exit */ + +    .unreq      DST_R +    .unreq      SRC +    .unreq      W +    .unreq      VX +    .unreq      UNIT_X +    .unreq      TMP1 +    .unreq      TMP2 +    .unreq      DST_W +    .unreq      MASK +    .unreq      SRC_WIDTH_FIXED + +.else +    bx          lr  /* exit */ + +    .unreq      SRC +    .unreq      MASK +    .unreq      DST_R +    .unreq      DST_W +    .unreq      W +.endif + +    .purgem     fetch_src_pixblock +    .purgem     pixld_src + +    .endfunc +.endm + +.macro generate_composite_function_single_scanline x:vararg +    generate_composite_function_scanline 0, x +.endm + +.macro generate_composite_function_nearest_scanline x:vararg +    generate_composite_function_scanline 1, x +.endm + +/* Default prologue/epilogue, nothing special needs to be done */ + +.macro default_init +.endm + +.macro default_cleanup +.endm + +/* + * Prologue/epilogue variant which additionally saves/restores d8-d15 + * registers (they need to be saved/restored by callee according to ABI). + * This is required if the code needs to use all the NEON registers. + */ + +.macro default_init_need_all_regs +    vpush       {d8-d15} +.endm + +.macro default_cleanup_need_all_regs +    vpop        {d8-d15} +.endm + +/******************************************************************************/ + +/* + * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) + * into a planar a8r8g8b8 format (with a, r, g, b color components + * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). + * + * Warning: the conversion is destructive and the original + *          value (in) is lost. 
+ */ +.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b +    vshrn.u16   out_r, in,    #8 +    vshrn.u16   out_g, in,    #3 +    vsli.u16    in,    in,    #5 +    vmov.u8     out_a, #255 +    vsri.u8     out_r, out_r, #5 +    vsri.u8     out_g, out_g, #6 +    vshrn.u16   out_b, in,    #2 +.endm + +.macro convert_0565_to_x888 in, out_r, out_g, out_b +    vshrn.u16   out_r, in,    #8 +    vshrn.u16   out_g, in,    #3 +    vsli.u16    in,    in,    #5 +    vsri.u8     out_r, out_r, #5 +    vsri.u8     out_g, out_g, #6 +    vshrn.u16   out_b, in,    #2 +.endm + +/* + * Conversion from planar a8r8g8b8 format (with a, r, g, b color components + * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 + * pixels packed in 128-bit register (out). Requires two temporary 128-bit + * registers (tmp1, tmp2) + */ +.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 +    vshll.u8    tmp1, in_g, #8 +    vshll.u8    out, in_r, #8 +    vshll.u8    tmp2, in_b, #8 +    vsri.u16    out, tmp1, #5 +    vsri.u16    out, tmp2, #11 +.endm + +/* + * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels + * returned in (out0, out1) registers pair. Requires one temporary + * 64-bit register (tmp). 'out1' and 'in' may overlap, the original + * value from 'in' is lost + */ +.macro convert_four_0565_to_x888_packed in, out0, out1, tmp +    vshl.u16    out0, in,   #5  /* G top 6 bits */ +    vshl.u16    tmp,  in,   #11 /* B top 5 bits */ +    vsri.u16    in,   in,   #5  /* R is ready in top bits */ +    vsri.u16    out0, out0, #6  /* G is ready in top bits */ +    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */ +    vshr.u16    out1, in,   #8  /* R is in place */ +    vsri.u16    out0, tmp,  #8  /* G & B is in place */ +    vzip.u16    out0, out1      /* everything is in place */ +.endm diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-neon.c b/libs/pixman-0.40.0/pixman/pixman-arm-neon.c new file mode 100644 index 0000000..be761c9 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm-neon.c @@ -0,0 +1,472 @@ +/* + * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of ARM Ltd not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  ARM Ltd makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
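/*
 * A per-pixel C equivalent (illustrative only) of the conversions done by the
 * NEON macros above on 4 or 8 pixels at a time.  The vsri/vsli shuffling
 * implements the usual bit replication when widening 5- and 6-bit channels,
 * and plain truncation when narrowing back to r5g6b5.
 */
#include <stdint.h>

static uint32_t convert_0565_to_8888_scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5)  & 0x3f;
    uint32_t b =  p        & 0x1f;

    r = (r << 3) | (r >> 2);            /* replicate top bits into low bits  */
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000u | (r << 16) | (g << 8) | b;   /* a8r8g8b8, a = 255   */
}

static uint16_t convert_8888_to_0565_scalar (uint32_t p)
{
    return (uint16_t) (((p >> 8) & 0xf800) |         /* top 5 bits of R     */
                       ((p >> 5) & 0x07e0) |         /* top 6 bits of G     */
                       ((p >> 3) & 0x001f));         /* top 5 bits of B     */
}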
+ * + * Author:  Ian Rickards (ian.rickards@arm.com) + * Author:  Jonathan Morton (jonathan.morton@movial.com) + * Author:  Markku Vire (markku.vire@movial.com) + * + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <string.h> +#include "pixman-private.h" +#include "pixman-arm-common.h" + +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_8888, +                                   uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_x888_8888, +                                   uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_0565, +                                   uint16_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0888, +                                   uint8_t, 3, uint8_t, 3) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_0565, +                                   uint32_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_8888, +                                   uint16_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_8888_rev, +                                   uint8_t, 3, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev, +                                   uint8_t, 3, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888, +                                   uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_rpixbuf_8888, +                                   uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8, +                                   uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888, +                                   uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_0565, +                                   uint32_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888, +                                   uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565, +                                   uint8_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_8888, +                                   uint8_t, 1, uint32_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565, +                                 uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888, +                                 uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888, +                                 uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8, +                                 uint8_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565, +                                      uint8_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888, +                                      uint8_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca, +                                      uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_0565_ca, +				      uint32_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8, +                                      uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8, +                                      uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, 
add_n_8_8888, +                                      uint8_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888, +                                      uint8_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8, +                                      uint8_t, 1, uint8_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888, +                                     uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_0565, +                                     uint32_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_0565_n_0565, +                                     uint16_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, add_8888_n_8888, +                                     uint32_t, 1, uint32_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8, +                                        uint8_t, 1, uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565, +                                        uint16_t, 1, uint8_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8_8888, +                                        uint32_t, 1, uint8_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888, +                                        uint32_t, 1, uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888, +                                        uint32_t, 1, uint8_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8888_8888, +                                        uint32_t, 1, uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_0565, +                                        uint32_t, 1, uint8_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565, +                                        uint16_t, 1, uint8_t, 1, uint16_t, 1) + +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER, +                                        uint32_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER, +                                        uint32_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC, +                                        uint32_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC, +                                        uint16_t, uint32_t) + +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565, +                                           OVER, uint32_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565, +                                           OVER, uint16_t, uint16_t) + +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC, +                                         uint32_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC, +                                         uint32_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC, +                                         uint16_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC, +                                         uint16_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER, +                                         uint32_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST 
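/*
 * Each PIXMAN_ARM_BIND_* line above generates a small C wrapper (see
 * pixman-arm-common.h) that pulls line pointers and strides out of the
 * composite request and forwards them to the assembly entry point.  A
 * hypothetical sketch of the SRC_DST shape only; the argument order matches
 * the pixman_composite_src_*_asm_neon calls visible in arm_neon_blt further
 * down:
 */
#include <stdint.h>

typedef void (*composite_asm_fn) (int32_t w, int32_t h,
                                  uint32_t *dst, int32_t dst_stride,
                                  uint32_t *src, int32_t src_stride);

static void bind_src_dst_sketch (composite_asm_fn asm_entry,
                                 uint32_t *dst_line, int32_t dst_stride,
                                 uint32_t *src_line, int32_t src_stride,
                                 int32_t width, int32_t height)
{
    /* the real wrapper derives these six values from the images being
       composited; it then simply tail-calls the assembly routine */
    asm_entry (width, height, dst_line, dst_stride, src_line, src_stride);
}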
(SKIP_ZERO_SRC, neon, 8888_8888, ADD, +                                         uint32_t, uint32_t) + +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC, +                                            uint32_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC, +                                            uint32_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC, +                                            uint16_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_0565, SRC, +                                            uint16_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OVER, +                                            uint32_t, uint32_t) +PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD, +                                            uint32_t, uint32_t) + +void +pixman_composite_src_n_8_asm_neon (int32_t   w, +                                   int32_t   h, +                                   uint8_t  *dst, +                                   int32_t   dst_stride, +                                   uint8_t   src); + +void +pixman_composite_src_n_0565_asm_neon (int32_t   w, +                                      int32_t   h, +                                      uint16_t *dst, +                                      int32_t   dst_stride, +                                      uint16_t  src); + +void +pixman_composite_src_n_8888_asm_neon (int32_t   w, +                                      int32_t   h, +                                      uint32_t *dst, +                                      int32_t   dst_stride, +                                      uint32_t  src); + +static pixman_bool_t +arm_neon_fill (pixman_implementation_t *imp, +               uint32_t *               bits, +               int                      stride, +               int                      bpp, +               int                      x, +               int                      y, +               int                      width, +               int                      height, +	       uint32_t                 _xor) +{ +    /* stride is always multiple of 32bit units in pixman */ +    uint32_t byte_stride = stride * sizeof(uint32_t); + +    switch (bpp) +    { +    case 8: +	pixman_composite_src_n_8_asm_neon ( +		width, +		height, +		(uint8_t *)(((char *) bits) + y * byte_stride + x), +		byte_stride, +		_xor & 0xff); +	return TRUE; +    case 16: +	pixman_composite_src_n_0565_asm_neon ( +		width, +		height, +		(uint16_t *)(((char *) bits) + y * byte_stride + x * 2), +		byte_stride / 2, +		_xor & 0xffff); +	return TRUE; +    case 32: +	pixman_composite_src_n_8888_asm_neon ( +		width, +		height, +		(uint32_t *)(((char *) bits) + y * byte_stride + x * 4), +		byte_stride / 4, +		_xor); +	return TRUE; +    default: +	return FALSE; +    } +} + +static pixman_bool_t +arm_neon_blt (pixman_implementation_t *imp, +              uint32_t *               src_bits, +              uint32_t *               dst_bits, +              int                      src_stride, +              int                      dst_stride, +              int                      src_bpp, +              int                      dst_bpp, +              int                      src_x, +              int                      src_y, +              int                      dest_x, +              int                      dest_y, +              int                      
width, +              int                      height) +{ +    if (src_bpp != dst_bpp) +	return FALSE; + +    switch (src_bpp) +    { +    case 16: +	pixman_composite_src_0565_0565_asm_neon ( +		width, height, +		(uint16_t *)(((char *) dst_bits) + +		dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2, +		(uint16_t *)(((char *) src_bits) + +		src_y * src_stride * 4 + src_x * 2), src_stride * 2); +	return TRUE; +    case 32: +	pixman_composite_src_8888_8888_asm_neon ( +		width, height, +		(uint32_t *)(((char *) dst_bits) + +		dest_y * dst_stride * 4 + dest_x * 4), dst_stride, +		(uint32_t *)(((char *) src_bits) + +		src_y * src_stride * 4 + src_x * 4), src_stride); +	return TRUE; +    default: +	return FALSE; +    } +} + +static const pixman_fast_path_t arm_neon_fast_paths[] = +{ +    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     b5g6r5,   neon_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565), +    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565), +    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565), +    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565), +    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     a8r8g8b8, neon_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     x8r8g8b8, neon_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     a8b8g8r8, neon_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     x8b8g8r8, neon_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     a8r8g8b8, neon_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     a8b8g8r8, neon_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (SRC,  r8g8b8,   null,     r8g8b8,   neon_composite_src_0888_0888), +    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     x8r8g8b8, neon_composite_src_0888_8888_rev), +    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     r5g6b5,   neon_composite_src_0888_0565_rev), +    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8r8g8b8, neon_composite_src_pixbuf_8888), +    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8b8g8r8, neon_composite_src_rpixbuf_8888), +    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8r8g8b8, neon_composite_src_rpixbuf_8888), +    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8b8g8r8, neon_composite_src_pixbuf_8888), +    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8r8g8b8, neon_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8r8g8b8, neon_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8b8g8r8, neon_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8b8g8r8, neon_composite_src_n_8_8888), +    
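/*
 * arm_neon_fill and arm_neon_blt above share the same addressing scheme:
 * pixman strides are counted in 32-bit words, so the byte stride is
 * stride * 4, and the first pixel to touch lives at
 * base + y * byte_stride + x * bytes_per_pixel.  A C sketch of that
 * arithmetic (helper name is illustrative):
 */
#include <stdint.h>
#include <stddef.h>

static void *
first_pixel_address (uint32_t *bits, int stride_words, int bpp, int x, int y)
{
    size_t byte_stride = (size_t) stride_words * sizeof (uint32_t);

    /* matches e.g. "(uint16_t *)(((char *) bits) + y * byte_stride + x * 2)"
       in the bpp == 16 case of arm_neon_fill */
    return (char *) bits + (size_t) y * byte_stride + (size_t) x * (bpp / 8);
}

/* Note that the stride handed to the assembly is converted back into pixels:
   byte_stride / 2 for 16 bpp and byte_stride / 4 for 32 bpp. */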
PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8,       neon_composite_src_n_8_8), + +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       neon_composite_over_n_8_8), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   neon_composite_over_n_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   neon_composite_over_n_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8r8g8b8, neon_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8r8g8b8, neon_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8b8g8r8, neon_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8b8g8r8, neon_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     r5g6b5,   neon_composite_over_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     a8r8g8b8, neon_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     x8r8g8b8, neon_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, neon_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5,   neon_composite_over_n_8888_0565_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5,   neon_composite_over_n_8888_0565_ca), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    a8r8g8b8, neon_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, neon_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    r5g6b5,   neon_composite_over_8888_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid,    b5g6r5,   neon_composite_over_8888_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   solid,    r5g6b5,   neon_composite_over_0565_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   solid,    b5g6r5,   neon_composite_over_0565_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       a8r8g8b8, neon_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       x8r8g8b8, neon_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       a8b8g8r8, neon_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       x8b8g8r8, neon_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       r5g6b5,   neon_composite_over_8888_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   neon_composite_over_8888_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   neon_composite_over_0565_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   neon_composite_over_0565_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   neon_composite_over_8888_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   neon_composite_over_8888_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     a8r8g8b8, neon_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, neon_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, neon_composite_over_8888_8888), +   
 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, neon_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8), +    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, neon_composite_add_n_8_8888), +    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, neon_composite_add_n_8_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8), +    PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   neon_composite_add_0565_8_0565), +    PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   neon_composite_add_0565_8_0565), +    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, neon_composite_add_8888_8_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, neon_composite_add_8888_8_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, neon_composite_add_8888_n_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, neon_composite_add_8888_n_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8_8), +    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888), +    PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8,       neon_composite_in_n_8), +    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888), +    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888), +    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565), +    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565), +    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8, neon_composite_out_reverse_8_8888), +    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8, neon_composite_out_reverse_8_8888), + +    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888), + +    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565), + +    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565), +    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565), +    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565), +    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565), + +    SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888), +    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888), +    /* Note: NONE repeat is not supported yet */ +    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888), +    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888), +    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888), +    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888), + +    
PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565), +    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565), + +    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565), +    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565), + +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888), + +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565), + +    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565), + +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888), + +    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8_8888), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_8_0565), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_8_0565), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8_x888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), + +    { PIXMAN_OP_NONE }, +}; + +#define BIND_COMBINE_U(name)                                             \ +void                                                                     \ +pixman_composite_scanline_##name##_mask_asm_neon (int32_t         w,     \ +                                                  const uint32_t *dst,   \ +                                                  const uint32_t *src,   \ +                                                  const uint32_t *mask); \ +                                                                         \ +void                                                                     \ +pixman_composite_scanline_##name##_asm_neon (int32_t         w,          \ +                                             const uint32_t *dst,        \ +                                             const uint32_t *src);       \ +                                                                         \ +static void                                                              \ +neon_combine_##name##_u (pixman_implementation_t *imp,                   \ +                         pixman_op_t              op,                    \ +                         uint32_t *               dest,                  \ +                         const uint32_t *         src,                   \ +                         const uint32_t *         mask,                  \ +                         int                  
    width)                 \ +{                                                                        \ +    if (mask)                                                            \ +	pixman_composite_scanline_##name##_mask_asm_neon (width, dest,   \ +	                                                  src, mask);    \ +    else                                                                 \ +	pixman_composite_scanline_##name##_asm_neon (width, dest, src);  \ +} + +BIND_COMBINE_U (over) +BIND_COMBINE_U (add) +BIND_COMBINE_U (out_reverse) + +pixman_implementation_t * +_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback) +{ +    pixman_implementation_t *imp = +	_pixman_implementation_create (fallback, arm_neon_fast_paths); + +    imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u; +    imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u; +    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u; + +    imp->blt = arm_neon_blt; +    imp->fill = arm_neon_fill; + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm-scaled.S b/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm-scaled.S new file mode 100644 index 0000000..e050292 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm-scaled.S @@ -0,0 +1,156 @@ +/* + * Copyright © 2008 Mozilla Corporation + * Copyright © 2010 Nokia Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Mozilla Corporation not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Mozilla Corporation makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author:  Jeff Muizelaar (jeff@infidigm.net) + * + */ + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +	.text +	.arch armv6 +	.object_arch armv4 +	.arm +	.altmacro +	.p2align 2 + +#include "pixman-arm-asm.h" + +/* + * Note: This code is only using armv5te instructions (not even armv6), + *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to + *       be split into a few variants, tuned for each microarchitecture. + * + * TODO: In order to get good performance on ARM9/ARM11 cores (which don't + * have efficient write combining), it needs to be changed to use 16-byte + * aligned writes using STM instruction. 
+ * + * Nearest scanline scaler macro template uses the following arguments: + *  fname                     - name of the function to generate + *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes + *  t                         - type suffix for LDR/STR instructions + *  prefetch_distance         - prefetch in the source image by that many + *                              pixels ahead + *  prefetch_braking_distance - stop prefetching when that many pixels are + *                              remaining before the end of scanline + */ + +.macro generate_nearest_scanline_func fname, bpp_shift, t,      \ +                                      prefetch_distance,        \ +                                      prefetch_braking_distance + +pixman_asm_function fname +	W		.req	r0 +	DST		.req	r1 +	SRC		.req	r2 +	VX		.req	r3 +	UNIT_X		.req	ip +	TMP1		.req	r4 +	TMP2		.req	r5 +	VXMASK		.req	r6 +	PF_OFFS		.req	r7 +	SRC_WIDTH_FIXED	.req	r8 + +	ldr	UNIT_X, [sp] +	push	{r4, r5, r6, r7, r8, r10} +	mvn	VXMASK, #((1 << bpp_shift) - 1) +	ldr	SRC_WIDTH_FIXED, [sp, #28] + +	/* define helper macro */ +	.macro	scale_2_pixels +		ldr&t	TMP1, [SRC, TMP1] +		and	TMP2, VXMASK, VX, asr #(16 - bpp_shift) +		adds	VX, VX, UNIT_X +		str&t	TMP1, [DST], #(1 << bpp_shift) +9:		subpls	VX, VX, SRC_WIDTH_FIXED +		bpl	9b + +		ldr&t	TMP2, [SRC, TMP2] +		and	TMP1, VXMASK, VX, asr #(16 - bpp_shift) +		adds	VX, VX, UNIT_X +		str&t	TMP2, [DST], #(1 << bpp_shift) +9:		subpls	VX, VX, SRC_WIDTH_FIXED +		bpl	9b +	.endm + +	/* now do the scaling */ +	and	TMP1, VXMASK, VX, asr #(16 - bpp_shift) +	adds	VX, VX, UNIT_X +9:	subpls	VX, VX, SRC_WIDTH_FIXED +	bpl	9b +	subs	W, W, #(8 + prefetch_braking_distance) +	blt	2f +	/* calculate prefetch offset */ +	mov	PF_OFFS, #prefetch_distance +	mla	PF_OFFS, UNIT_X, PF_OFFS, VX +1:	/* main loop, process 8 pixels per iteration with prefetch */ +	pld	[SRC, PF_OFFS, asr #(16 - bpp_shift)] +	add	PF_OFFS, UNIT_X, lsl #3 +	scale_2_pixels +	scale_2_pixels +	scale_2_pixels +	scale_2_pixels +	subs	W, W, #8 +	bge	1b +2: +	subs	W, W, #(4 - 8 - prefetch_braking_distance) +	blt	2f +1:	/* process the remaining pixels */ +	scale_2_pixels +	scale_2_pixels +	subs	W, W, #4 +	bge	1b +2: +	tst	W, #2 +	beq	2f +	scale_2_pixels +2: +	tst	W, #1 +	ldrne&t	TMP1, [SRC, TMP1] +	strne&t	TMP1, [DST] +	/* cleanup helper macro */ +	.purgem	scale_2_pixels +	.unreq	DST +	.unreq	SRC +	.unreq	W +	.unreq	VX +	.unreq	UNIT_X +	.unreq	TMP1 +	.unreq	TMP2 +	.unreq	VXMASK +	.unreq	PF_OFFS +	.unreq  SRC_WIDTH_FIXED +	/* return */ +	pop	{r4, r5, r6, r7, r8, r10} +	bx	lr +.endfunc +.endm + +generate_nearest_scanline_func \ +    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 + +generate_nearest_scanline_func \ +    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32 diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.S b/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.S new file mode 100644 index 0000000..a74a0a8 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.S @@ -0,0 +1,1179 @@ +/* + * Copyright © 2012 Raspberry Pi Foundation + * Copyright © 2012 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or 
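/*
 * The scanline scaler above walks the source with a 16.16 fixed-point
 * coordinate: VX is the current source x, UNIT_X the per-destination-pixel
 * step, and SRC_WIDTH_FIXED is used to wrap VX for repeat handling, while
 * prefetches run prefetch_distance pixels ahead and stop
 * prefetch_braking_distance pixels before the end of the scanline.  A minimal
 * scalar sketch of the core stepping (wrap-around, prefetch and the
 * 2-pixels-per-iteration unrolling are omitted):
 */
#include <stdint.h>

static void
nearest_scanline_sketch (uint16_t *dst, const uint16_t *src,
                         int w, int32_t vx, int32_t unit_x)
{
    while (w-- > 0)
    {
        *dst++ = src[vx >> 16];   /* integer part of the fixed-point coord */
        vx += unit_x;
    }
}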
publicity pertaining to distribution of the software without + * specific, written prior permission.  The copyright holders make no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author:  Ben Avison (bavison@riscosopen.org) + * + */ + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +	.text +	.arch armv6 +	.object_arch armv4 +	.arm +	.altmacro +	.p2align 2 + +#include "pixman-arm-asm.h" +#include "pixman-arm-simd-asm.h" + +/* A head macro should do all processing which results in an output of up to + * 16 bytes, as far as the final load instruction. The corresponding tail macro + * should complete the processing of the up-to-16 bytes. The calling macro will + * sometimes choose to insert a preload or a decrement of X between them. + *   cond           ARM condition code for code block + *   numbytes       Number of output bytes that should be generated this time + *   firstreg       First WK register in which to place output + *   unaligned_src  Whether to use non-wordaligned loads of source image + *   unaligned_mask Whether to use non-wordaligned loads of mask image + *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output + */ + +.macro blit_init +        line_saved_regs STRIDE_D, STRIDE_S +.endm + +.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +        pixld   cond, numbytes, firstreg, SRC, unaligned_src +.endm + +.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment +    WK4     .req    STRIDE_D +    WK5     .req    STRIDE_S +    WK6     .req    MASK +    WK7     .req    STRIDE_M +110:    pixld   , 16, 0, SRC, unaligned_src +        pixld   , 16, 4, SRC, unaligned_src +        pld     [SRC, SCRATCH] +        pixst   , 16, 0, DST +        pixst   , 16, 4, DST +        subs    X, X, #32*8/src_bpp +        bhs     110b +    .unreq  WK4 +    .unreq  WK5 +    .unreq  WK6 +    .unreq  WK7 +.endm + +generate_composite_function \ +    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \ +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ +    4, /* prefetch distance */ \ +    blit_init, \ +    nop_macro, /* newline */ \ +    nop_macro, /* cleanup */ \ +    blit_process_head, \ +    nop_macro, /* process tail */ \ +    blit_inner_loop + +generate_composite_function \ +    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \ +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ +    4, /* prefetch distance */ \ +    blit_init, \ +    nop_macro, /* newline */ \ +    nop_macro, /* cleanup */ \ +    blit_process_head, \ +    nop_macro, /* process tail */ \ +    blit_inner_loop + +generate_composite_function \ +    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \ +    
FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \ +    3, /* prefetch distance */ \ +    blit_init, \ +    nop_macro, /* newline */ \ +    nop_macro, /* cleanup */ \ +    blit_process_head, \ +    nop_macro, /* process tail */ \ +    blit_inner_loop + +/******************************************************************************/ + +.macro src_n_8888_init +        ldr     SRC, [sp, #ARGS_STACK_OFFSET] +        mov     STRIDE_S, SRC +        mov     MASK, SRC +        mov     STRIDE_M, SRC +.endm + +.macro src_n_0565_init +        ldrh    SRC, [sp, #ARGS_STACK_OFFSET] +        orr     SRC, SRC, lsl #16 +        mov     STRIDE_S, SRC +        mov     MASK, SRC +        mov     STRIDE_M, SRC +.endm + +.macro src_n_8_init +        ldrb    SRC, [sp, #ARGS_STACK_OFFSET] +        orr     SRC, SRC, lsl #8 +        orr     SRC, SRC, lsl #16 +        mov     STRIDE_S, SRC +        mov     MASK, SRC +        mov     STRIDE_M, SRC +.endm + +.macro fill_process_tail  cond, numbytes, firstreg +    WK4     .req    SRC +    WK5     .req    STRIDE_S +    WK6     .req    MASK +    WK7     .req    STRIDE_M +        pixst   cond, numbytes, 4, DST +    .unreq  WK4 +    .unreq  WK5 +    .unreq  WK6 +    .unreq  WK7 +.endm + +generate_composite_function \ +    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \ +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ +    0, /* prefetch distance doesn't apply */ \ +    src_n_8888_init \ +    nop_macro, /* newline */ \ +    nop_macro /* cleanup */ \ +    nop_macro /* process head */ \ +    fill_process_tail + +generate_composite_function \ +    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \ +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ +    0, /* prefetch distance doesn't apply */ \ +    src_n_0565_init \ +    nop_macro, /* newline */ \ +    nop_macro /* cleanup */ \ +    nop_macro /* process head */ \ +    fill_process_tail + +generate_composite_function \ +    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \ +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ +    0, /* prefetch distance doesn't apply */ \ +    src_n_8_init \ +    nop_macro, /* newline */ \ +    nop_macro /* cleanup */ \ +    nop_macro /* process head */ \ +    fill_process_tail + +/******************************************************************************/ + +.macro src_x888_8888_pixel, cond, reg +        orr&cond WK®, WK®, #0xFF000000 +.endm + +.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +        pixld   cond, numbytes, firstreg, SRC, unaligned_src +.endm + +.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg +        src_x888_8888_pixel cond, %(firstreg+0) + .if numbytes >= 8 +        src_x888_8888_pixel cond, %(firstreg+1) +  .if numbytes == 16 +        src_x888_8888_pixel cond, %(firstreg+2) +        src_x888_8888_pixel cond, %(firstreg+3) +  .endif + .endif +.endm + +generate_composite_function \ +    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \ +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ +    3, /* prefetch distance */ \ +    nop_macro, /* init */ \ +    nop_macro, /* newline */ \ +    nop_macro, /* cleanup */ \ +    
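/*
 * src_n_8_init and src_n_0565_init above broadcast the solid colour across a
 * full register (and then across four registers) so that every store writes
 * as many pixels as possible.  The ORR-based replication in scalar C:
 */
#include <stdint.h>

static uint32_t splat_8 (uint8_t v)      /* "orr SRC, SRC, lsl #8" ...  */
{
    uint32_t s = v;
    s |= s << 8;     /* 0x000000vv -> 0x0000vvvv */
    s |= s << 16;    /* 0x0000vvvv -> 0xvvvvvvvv */
    return s;
}

static uint32_t splat_0565 (uint16_t v)  /* "orr SRC, SRC, lsl #16"     */
{
    uint32_t s = v;
    s |= s << 16;    /* two r5g6b5 pixels per 32-bit word */
    return s;
}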
pixman_composite_src_x888_8888_process_head, \ +    pixman_composite_src_x888_8888_process_tail + +/******************************************************************************/ + +.macro src_0565_8888_init +        /* Hold loop invariants in MASK and STRIDE_M */ +        ldr     MASK, =0x07E007E0 +        mov     STRIDE_M, #0xFF000000 +        /* Set GE[3:0] to 1010 so SEL instructions do what we want */ +        ldr     SCRATCH, =0x80008000 +        uadd8   SCRATCH, SCRATCH, SCRATCH +.endm + +.macro src_0565_8888_2pixels, reg1, reg2 +        and     SCRATCH, WK®1, MASK             @ 00000GGGGGG0000000000gggggg00000 +        bic     WK®2, WK®1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb +        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg +        mov     WK®1, WK®2, lsl #16          @ rrrrr000000bbbbb0000000000000000 +        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG +        bic     WK®2, WK®2, WK®1, lsr #16 @ RRRRR000000BBBBB0000000000000000 +        orr     WK®1, WK®1, WK®1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000 +        orr     WK®2, WK®2, WK®2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000 +        pkhtb   WK®1, WK®1, WK®1, asr #5  @ rrrrrrrr--------bbbbbbbb-------- +        sel     WK®1, WK®1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb-------- +        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg +        pkhtb   WK®2, WK®2, WK®2, asr #5  @ RRRRRRRR--------BBBBBBBB-------- +        sel     WK®2, WK®2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB-------- +        orr     WK®1, STRIDE_M, WK®1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb +        orr     WK®2, STRIDE_M, WK®2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB +.endm + +/* This version doesn't need STRIDE_M, but is one instruction longer. +   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case? 
+        and     SCRATCH, WK®1, MASK             @ 00000GGGGGG0000000000gggggg00000 +        bic     WK®1, WK®1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb +        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg +        mov     WK®2, WK®1, lsr #16          @ 0000000000000000RRRRR000000BBBBB +        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000 +        bic     WK®1, WK®1, WK®2, lsl #16 @ 0000000000000000rrrrr000000bbbbb +        mov     WK®2, WK®2, lsl #3           @ 0000000000000RRRRR000000BBBBB000 +        mov     WK®1, WK®1, lsl #3           @ 0000000000000rrrrr000000bbbbb000 +        orr     WK®2, WK®2, WK®2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB +        orr     WK®1, WK®1, WK®1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb +        pkhbt   WK®2, WK®2, WK®2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB +        pkhbt   WK®1, WK®1, WK®1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb +        sel     WK®2, SCRATCH, WK®2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB +        sel     WK®1, SCRATCH, WK®1          @ --------rrrrrrrrggggggggbbbbbbbb +        orr     WK®2, WK®2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB +        orr     WK®1, WK®1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb +*/ + +.macro src_0565_8888_1pixel, reg +        bic     SCRATCH, WK®, MASK              @ 0000000000000000rrrrr000000bbbbb +        and     WK®, WK®, MASK               @ 000000000000000000000gggggg00000 +        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000 +        mov     WK®, WK®, lsl #5             @ 0000000000000000gggggg0000000000 +        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb +        orr     WK®, WK®, WK®, lsr #6     @ 000000000000000gggggggggggg00000 +        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb +        sel     WK®, WK®, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb +        orr     WK®, WK®, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb +.endm + +.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + .if numbytes == 16 +        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src + .elseif numbytes == 8 +        pixld   , 4, firstreg, SRC, unaligned_src + .elseif numbytes == 4 +        pixld   , 2, firstreg, SRC, unaligned_src + .endif +.endm + +.macro src_0565_8888_process_tail   cond, numbytes, firstreg + .if numbytes == 16 +        src_0565_8888_2pixels firstreg, %(firstreg+1) +        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3) + .elseif numbytes == 8 +        src_0565_8888_2pixels firstreg, %(firstreg+1) + .else +        src_0565_8888_1pixel firstreg + .endif +.endm + +generate_composite_function \ +    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \ +    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ +    3, /* prefetch distance */ \ +    src_0565_8888_init, \ +    nop_macro, /* newline */ \ +    nop_macro, /* cleanup */ \ +    src_0565_8888_process_head, \ +    src_0565_8888_process_tail + +/******************************************************************************/ + +.macro src_x888_0565_init +        /* Hold loop invariant in MASK */ +        ldr     MASK, =0x001F001F +        line_saved_regs  STRIDE_S, ORIG_W +.endm + +.macro src_x888_0565_1pixel  s, d +        and     WK&d, MASK, WK&s, lsr #3           @ 00000000000rrrrr00000000000bbbbb +        and     STRIDE_S, WK&s, #0xFC00            @ 
0000000000000000gggggg0000000000 +        orr     WK&d, WK&d, WK&d, lsr #5           @ 00000000000-----rrrrr000000bbbbb +        orr     WK&d, WK&d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb +        /* Top 16 bits are discarded during the following STRH */ +.endm + +.macro src_x888_0565_2pixels  slo, shi, d, tmp +        and     SCRATCH, WK&shi, #0xFC00           @ 0000000000000000GGGGGG0000000000 +        and     WK&tmp, MASK, WK&shi, lsr #3       @ 00000000000RRRRR00000000000BBBBB +        and     WK&shi, MASK, WK&slo, lsr #3       @ 00000000000rrrrr00000000000bbbbb +        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB +        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5    @ 00000000000-----RRRRRGGGGGGBBBBB +        and     SCRATCH, WK&slo, #0xFC00           @ 0000000000000000gggggg0000000000 +        orr     WK&shi, WK&shi, WK&shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb +        orr     WK&shi, WK&shi, SCRATCH, lsr #5    @ 00000000000-----rrrrrggggggbbbbb +        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb +.endm + +.macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +        WK4     .req    STRIDE_S +        WK5     .req    STRIDE_M +        WK6     .req    WK3 +        WK7     .req    ORIG_W + .if numbytes == 16 +        pixld   , 16, 4, SRC, 0 +        src_x888_0565_2pixels  4, 5, 0, 0 +        pixld   , 8, 4, SRC, 0 +        src_x888_0565_2pixels  6, 7, 1, 1 +        pixld   , 8, 6, SRC, 0 + .else +        pixld   , numbytes*2, 4, SRC, 0 + .endif +.endm + +.macro src_x888_0565_process_tail   cond, numbytes, firstreg + .if numbytes == 16 +        src_x888_0565_2pixels  4, 5, 2, 2 +        src_x888_0565_2pixels  6, 7, 3, 4 + .elseif numbytes == 8 +        src_x888_0565_2pixels  4, 5, 1, 1 +        src_x888_0565_2pixels  6, 7, 2, 2 + .elseif numbytes == 4 +        src_x888_0565_2pixels  4, 5, 1, 1 + .else +        src_x888_0565_1pixel  4, 1 + .endif + .if numbytes == 16 +        pixst   , numbytes, 0, DST + .else +        pixst   , numbytes, 1, DST + .endif +        .unreq  WK4 +        .unreq  WK5 +        .unreq  WK6 +        .unreq  WK7 +.endm + +generate_composite_function \ +    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \ +    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \ +    3, /* prefetch distance */ \ +    src_x888_0565_init, \ +    nop_macro, /* newline */ \ +    nop_macro, /* cleanup */ \ +    src_x888_0565_process_head, \ +    src_x888_0565_process_tail + +/******************************************************************************/ + +.macro add_8_8_8pixels  cond, dst1, dst2 +        uqadd8&cond  WK&dst1, WK&dst1, MASK +        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M +.endm + +.macro add_8_8_4pixels  cond, dst +        uqadd8&cond  WK&dst, WK&dst, MASK +.endm + +.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload +    WK4     .req    MASK +    WK5     .req    STRIDE_M + .if numbytes == 16 +        pixld   cond, 8, 4, SRC, unaligned_src +        pixld   cond, 16, firstreg, DST, 0 +        add_8_8_8pixels cond, firstreg, %(firstreg+1) +        pixld   cond, 8, 4, SRC, unaligned_src + .else +        pixld   cond, numbytes, 4, SRC, unaligned_src +        pixld   cond, numbytes, firstreg, DST, 0 + .endif +    .unreq  WK4 +    .unreq  WK5 +.endm + +.macro add_8_8_process_tail  cond, numbytes, firstreg + .if 
numbytes == 16
+        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .else
+        add_8_8_4pixels cond, firstreg
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    2, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    add_8_8_process_head, \
+    add_8_8_process_tail
+
+/******************************************************************************/
+
+.macro over_8888_8888_init
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    STRIDE_M
+    WK7     .req    ORIG_W
+        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
+        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
+        teq     WK&reg0, #0
+ .if numbytes > 4
+        teqeq   WK&reg1, #0
+  .if numbytes > 8
+        teqeq   WK&reg2, #0
+        teqeq   WK&reg3, #0
+  .endif
+ .endif
+.endm
+
+.macro over_8888_8888_prepare  next
+        mov     WK&next, WK&next, lsr #24
+.endm
+
+.macro over_8888_8888_1pixel src, dst, offset, next
+        /* src = destination component multiplier */
+        rsb     WK&src, WK&src, #255
+        /* Split even/odd bytes of dst into SCRATCH/dst */
+        uxtb16  SCRATCH, WK&dst
+        uxtb16  WK&dst, WK&dst, ror #8
+        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
+        mla     SCRATCH, SCRATCH, WK&src, MASK
+        mla     WK&dst, WK&dst, WK&src, MASK
+        /* Where we would have had a stall between the result of the first MLA and the shifter input,
+         * reload the complete source pixel */
+        ldr     WK&src, [SRC, #offset]
+        /* Multiply by 257/256 to approximate 256/255 */
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        /* In this stall, start processing the next pixel */
+ .if offset < -4
+        mov     WK&next, WK&next, lsr #24
+ .endif
+        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+        /* Recombine even/odd bytes of multiplied destination */
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     WK&dst, SCRATCH, WK&dst
+        /* Saturated add of source to multiplied destination */
+        uqadd8  WK&dst, WK&dst, WK&src
+.endm
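+
+/* For reference, the per-pixel arithmetic in over_8888_8888_1pixel is the
+ * usual premultiplied OVER operator.  Roughly, per pixel (an illustrative C
+ * sketch only; the real code works on the split even/odd bytes and uses the
+ * 0x80 bias plus the 257/256 trick in place of a division by 255):
+ *
+ *     uint32_t over (uint32_t src, uint32_t dst)
+ *     {
+ *         uint32_t ia = 255 - (src >> 24);
+ *         uint32_t result = 0;
+ *         int i;
+ *         for (i = 0; i < 32; i += 8)
+ *         {
+ *             uint32_t d = (dst >> i) & 0xff;
+ *             uint32_t s = (src >> i) & 0xff;
+ *             uint32_t c = s + (d * ia + 128) / 255;
+ *             if (c > 255) c = 255;          /* uqadd8 saturates per byte */
+ *             result |= c << i;
+ *         }
+ *         return result;
+ *     }
+ */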
+
+.macro over_8888_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    STRIDE_M
+    WK7     .req    ORIG_W
+        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+        beq     10f
+        over_8888_8888_prepare  %(4+firstreg)
+ .set PROCESS_REG, firstreg
+ .set PROCESS_OFF, -numbytes
+ .rept numbytes / 4
+        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+  .set PROCESS_OFF, PROCESS_OFF+4
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
+    2, /* prefetch distance */ \
+    over_8888_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_8888_8888_process_head, \
+    over_8888_8888_process_tail
+
+/******************************************************************************/
+
+/* Multiply each byte of a word by a byte.
+ * Useful when there aren't any obvious ways to fill the stalls with other instructions.
+ * word  Register containing 4 bytes
+ * byte  Register containing byte multiplier (bits 8-31 must be 0)
+ * tmp   Scratch register
+ * half  Register containing the constant 0x00800080
+ * GE[3:0] bits must contain 0101
+ */
+.macro mul_8888_8  word, byte, tmp, half
+        /* Split even/odd bytes of word apart */
+        uxtb16  tmp, word
+        uxtb16  word, word, ror #8
+        /* Multiply bytes together with rounding, then by 257/256 */
+        mla     tmp, tmp, byte, half
+        mla     word, word, byte, half /* 1 stall follows */
+        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
+        uxtab16 word, word, word, ror #8
+        /* Recombine bytes */
+        mov     tmp, tmp, ror #8
+        sel     word, tmp, word
+.endm
+
+/******************************************************************************/
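+
+/* The mul_8888_8 macro relies on the usual trick for dividing a byte product
+ * by 255: add the 0x80 rounding bias, then multiply by 257/256 by adding the
+ * value shifted right by 8 and keeping the high byte.  An illustrative C
+ * sketch (the helper name exists only in this sketch):
+ *
+ *     uint8_t mul_div_255 (uint8_t a, uint8_t b)
+ *     {
+ *         uint32_t x = a * b + 0x80;
+ *         return (x + (x >> 8)) >> 8;
+ *     }
+ *
+ * which matches a rounded a*b/255 for all 8-bit inputs; the UXTAB16/ROR #8
+ * pairs above do the same thing on two bytes per register at once.
+ */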
+
+.macro over_8888_n_8888_init
+        /* Mask is constant */
+        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
+        /* Hold loop invariant in STRIDE_M */
+        ldr     STRIDE_M, =0x00800080
+        /* We only want the alpha bits of the constant mask */
+        mov     MASK, MASK, lsr #24
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, STRIDE_M, STRIDE_M
+        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    Y
+    WK5     .req    STRIDE_D
+    WK6     .req    STRIDE_S
+    WK7     .req    ORIG_W
+        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+.macro over_8888_n_8888_1pixel src, dst
+        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
+        sub     WK7, WK6, WK&src, lsr #24
+        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
+        uqadd8  WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    Y
+    WK5     .req    STRIDE_D
+    WK6     .req    STRIDE_S
+    WK7     .req    ORIG_W
+        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+        beq     10f
+        mov     WK6, #255
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+  .if numbytes == 16 && PROCESS_REG == 2
+        /* We're using WK6 and WK7 as temporaries, so half way through
+         * 4 pixels, reload the second two source pixels but this time
+         * into WK4 and WK5 */
+        ldmdb   SRC, {WK4, WK5}
+  .endif
+        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
+    2, /* prefetch distance */ \
+    over_8888_n_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_8888_n_8888_process_head, \
+    over_8888_n_8888_process_tail
+
+/******************************************************************************/
+
+.macro over_n_8_8888_init
+        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
+        ldr     SCRATCH, =0x00800080
+        uxtb16  STRIDE_S, SRC
+        uxtb16  SRC, SRC, ror #8
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8_8888_newline
+        ldr     STRIDE_D, =0x00800080
+        b       1f
+ .ltorg
+1:
+.endm
+
+.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    STRIDE_M
+        pixld   , numbytes/4, 4, MASK, unaligned_mask
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+.endm
+
+.macro over_n_8_8888_1pixel src, dst
+        uxtb    Y, WK4, ror #src*8
+        /* Trailing part of multiplication of source */
+        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
+        mla     Y, SRC, Y, STRIDE_D
+        mov     ORIG_W, #255
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 Y, Y, Y, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sub     ORIG_W, ORIG_W, Y, lsr #24
+        sel     Y, SCRATCH, Y
+        /* Then multiply the destination */
+        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
+        uqadd8  WK&dst, WK&dst, Y
+.endm
+
+.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    STRIDE_M
+        teq     WK4, #0
+        beq     10f
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
+    2, /* prefetch distance */ \
+    over_n_8_8888_init, \
+    over_n_8_8888_newline, \
+    nop_macro, /* cleanup */ \
+    over_n_8_8888_process_head, \
+    over_n_8_8888_process_tail
+
+/******************************************************************************/
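+
+/* The over_n_8_8888 code above composites a constant (solid) source through
+ * an a8 mask: each source channel is first scaled by the mask byte m, and the
+ * result is then OVER-composited onto the destination.  Per channel, roughly
+ * (illustrative formula only):
+ *
+ *     s' = src * m / 255
+ *     d  = s' + dst * (255 - alpha(s')) / 255
+ *
+ * using the same 0x80 bias and 257/256 approximation as mul_8888_8.
+ */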
+
+.macro over_reverse_n_8888_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        ldr     MASK, =0x00800080
+        /* Split source pixel into RB/AG parts */
+        uxtb16  STRIDE_S, SRC
+        uxtb16  STRIDE_M, SRC, ror #8
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        line_saved_regs  STRIDE_D, ORIG_W
+.endm
+
+.macro over_reverse_n_8888_newline
+        mov     STRIDE_D, #0xFF
+.endm
+
+.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   , numbytes, firstreg, DST, 0
+.endm
+
+.macro over_reverse_n_8888_1pixel  d, is_only
+        teq     WK&d, #0
+        beq     8f       /* replace with source */
+        bics    ORIG_W, STRIDE_D, WK&d, lsr #24
+ .if is_only == 1
+        beq     49f      /* skip store */
+ .else
+        beq     9f       /* write same value back */
+ .endif
+        mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
+        mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     ORIG_W, SCRATCH, ORIG_W
+        uqadd8  WK&d, WK&d, ORIG_W
+        b       9f
+8:      mov     WK&d, SRC
+9:
+.endm
+
+.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
+ .if numbytes == 4
+        over_reverse_n_8888_1pixel  reg1, 1
+ .else
+        and     SCRATCH, WK&reg1, WK&reg2
+  .if numbytes == 16
+        and     SCRATCH, SCRATCH, WK&reg3
+        and     SCRATCH, SCRATCH, WK&reg4
+  .endif
+        mvns    SCRATCH, SCRATCH, asr #24
+        beq     49f /* skip store if all opaque */
+        over_reverse_n_8888_1pixel  reg1, 0
+        over_reverse_n_8888_1pixel  reg2, 0
+  .if numbytes == 16
+        over_reverse_n_8888_1pixel  reg3, 0
+        over_reverse_n_8888_1pixel  reg4, 0
+  .endif
+ .endif
+        pixst   , numbytes, reg1, DST
+49:
+.endm
+
+.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
+        over_reverse_n_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+.endm
+
+generate_composite_function \
+    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+    3, /* prefetch distance */ \
+    over_reverse_n_8888_init, \
+    over_reverse_n_8888_newline, \
+    nop_macro, /* cleanup */ \
+    over_reverse_n_8888_process_head, \
+    over_reverse_n_8888_process_tail
+
+/******************************************************************************/
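+
+/* OVER_REVERSE with a solid source keeps the destination on top:
+ *
+ *     result = dst + src * (255 - alpha(dst)) / 255    (per channel, saturated)
+ *
+ * which is why the code above can take two shortcuts: a fully opaque
+ * destination pixel is left untouched, and a fully transparent one is simply
+ * replaced by the (premultiplied) solid source.
+ */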
+
+.macro over_white_8888_8888_ca_init
+        HALF    .req    SRC
+        TMP0    .req    STRIDE_D
+        TMP1    .req    STRIDE_S
+        TMP2    .req    STRIDE_M
+        TMP3    .req    ORIG_W
+        WK4     .req    SCRATCH
+        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
+        ldr     SCRATCH, =0x800080
+        mov     HALF, #0x80
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+        .set DST_PRELOAD_BIAS, 8
+.endm
+
+.macro over_white_8888_8888_ca_cleanup
+        .set DST_PRELOAD_BIAS, 0
+        .unreq  HALF
+        .unreq  TMP0
+        .unreq  TMP1
+        .unreq  TMP2
+        .unreq  TMP3
+        .unreq  WK4
+.endm
+
+.macro over_white_8888_8888_ca_combine  m, d
+        uxtb16  TMP1, TMP0                /* rb_notmask */
+        uxtb16  TMP2, d                   /* rb_dest; 1 stall follows */
+        smlatt  TMP3, TMP2, TMP1, HALF    /* red */
+        smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
+        uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
+        uxtb16  TMP1, d, ror #8           /* ag_dest; 1 stall follows */
+        smlatt  d, TMP1, TMP0, HALF       /* alpha */
+        smlabb  TMP1, TMP1, TMP0, HALF    /* green */
+        pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
+        pkhbt   TMP1, TMP1, d, lsl #16    /* ag */
+        uxtab16 TMP0, TMP0, TMP0, ror #8
+        uxtab16 TMP1, TMP1, TMP1, ror #8
+        mov     TMP0, TMP0, ror #8
+        sel     d, TMP0, TMP1
+        uqadd8  d, d, m                   /* d is a late result */
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_head
+        pixld   , 4, 1, MASK, 0
+        pixld   , 4, 3, DST, 0
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_tail
+        mvn     TMP0, WK1
+        teq     WK1, WK1, asr #32
+        bne     01f
+        bcc     03f
+        mov     WK3, WK1
+        b       02f
+01:     over_white_8888_8888_ca_combine WK1, WK3
+02:     pixst   , 4, 3, DST
+03:
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_head
+        pixld   , 8, 1, MASK, 0
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_tail
+        pixld   , 8, 3, DST
+        mvn     TMP0, WK1
+        teq     WK1, WK1, asr #32
+        bne     01f
+        movcs   WK3, WK1
+        bcs     02f
+        teq     WK2, #0
+        beq     05f
+        b       02f
+01:     over_white_8888_8888_ca_combine WK1, WK3
+02:     mvn     TMP0, WK2
+        teq     WK2, WK2, asr #32
+        bne     03f
+        movcs   WK4, WK2
+        b       04f
+03:     over_white_8888_8888_ca_combine WK2, WK4
+04:     pixst   , 8, 3, DST
+05:
+.endm
+
+.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 4
+        over_white_8888_8888_ca_1pixel_head
+ .else
+  .if numbytes == 16
+        over_white_8888_8888_ca_2pixels_head
+        over_white_8888_8888_ca_2pixels_tail
+  .endif
+        over_white_8888_8888_ca_2pixels_head
+ .endif
+.endm
+
+.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
+ .if numbytes == 4
+        over_white_8888_8888_ca_1pixel_tail
+ .else
+        over_white_8888_8888_ca_2pixels_tail
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+    2, /* prefetch distance */ \
+    over_white_8888_8888_ca_init, \
+    nop_macro, /* newline */ \
+    over_white_8888_8888_ca_cleanup, \
+    over_white_8888_8888_ca_process_head, \
+    over_white_8888_8888_ca_process_tail
+
+
+.macro over_n_8888_8888_ca_init
+        /* Set up constants. 
RB_SRC and AG_SRC are in registers; +         * RB_FLDS, A_SRC, and the two HALF values need to go on the +         * stack (and the ful SRC value is already there) */ +        ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET] +        mov     WK0, #0x00FF0000 +        orr     WK0, WK0, #0xFF        /* RB_FLDS (0x00FF00FF) */ +        mov     WK1, #0x80             /* HALF default value */ +        mov     WK2, SCRATCH, lsr #24  /* A_SRC */ +        orr     WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */ +        push    {WK0-WK3} + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16 +        uxtb16  SRC, SCRATCH +        uxtb16  STRIDE_S, SCRATCH, ror #8 + +        /* Set GE[3:0] to 0101 so SEL instructions do what we want */ +        uadd8   SCRATCH, WK3, WK3 + +        .unreq  WK0 +        .unreq  WK1 +        .unreq  WK2 +        .unreq  WK3 +        WK0     .req    Y +        WK1     .req    STRIDE_D +        RB_SRC  .req    SRC +        AG_SRC  .req    STRIDE_S +        WK2     .req    STRIDE_M +        RB_FLDS .req    r8       /* the reloaded constants have to be at consecutive registers starting at an even one */ +        A_SRC   .req    r8 +        HALF    .req    r9 +        WK3     .req    r10 +        WK4     .req    r11 +        WK5     .req    SCRATCH +        WK6     .req    ORIG_W + +        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W +.endm + +.macro over_n_8888_8888_ca_cleanup +        add     sp, sp, #16 + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16 + +        .unreq  WK0 +        .unreq  WK1 +        .unreq  RB_SRC +        .unreq  AG_SRC +        .unreq  WK2 +        .unreq  RB_FLDS +        .unreq  A_SRC +        .unreq  HALF +        .unreq  WK3 +        .unreq  WK4 +        .unreq  WK5 +        .unreq  WK6 +        WK0     .req    r8 +        WK1     .req    r9 +        WK2     .req    r10 +        WK3     .req    r11 +.endm + +.macro over_n_8888_8888_ca_1pixel_head +        pixld   , 4, 6, MASK, 0 +        pixld   , 4, 0, DST, 0 +.endm + +.macro over_n_8888_8888_ca_1pixel_tail +        ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8] +        uxtb16  WK1, WK6                 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */ +        teq     WK6, WK6, asr #32        /* Zc if transparent, ZC if opaque */ +        bne     20f +        bcc     40f +        /* Mask is fully opaque (all channels) */ +        ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */ +        eors    A_SRC, A_SRC, #0xFF +        bne     10f +        /* Source is also opaque - same as src_8888_8888 */ +        mov     WK0, WK6 +        b       30f +10:     /* Same as over_8888_8888 */ +        mul_8888_8 WK0, A_SRC, WK5, HALF +        uqadd8  WK0, WK0, WK6 +        b       30f +20:     /* No simplifications possible - do it the hard way */ +        uxtb16  WK2, WK6, ror #8         /* ag_mask */ +        mla     WK3, WK1, A_SRC, HALF    /* rb_mul; 2 cycles */ +        mla     WK4, WK2, A_SRC, HALF    /* ag_mul; 2 cycles */ +        ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET] +        uxtb16  WK5, WK0                 /* rb_dest */ +        uxtab16 WK3, WK3, WK3, ror #8 +        uxtb16  WK6, WK0, ror #8         /* ag_dest */ +        uxtab16 WK4, WK4, WK4, ror #8 +        smlatt  WK0, RB_SRC, WK1, HALF   /* red1 */ +        smlabb  WK1, RB_SRC, WK1, HALF   /* blue1 */ +        bic     WK3, RB_FLDS, WK3, lsr #8 +        bic     WK4, RB_FLDS, WK4, lsr #8 +        pkhbt   WK1, WK1, WK0, lsl #16   /* rb1 */ +        smlatt  WK0, WK5, WK3, HALF      /* red2 
*/
+        smlabb  WK3, WK5, WK3, HALF      /* blue2 */
+        uxtab16 WK1, WK1, WK1, ror #8
+        smlatt  WK5, AG_SRC, WK2, HALF   /* alpha1 */
+        pkhbt   WK3, WK3, WK0, lsl #16   /* rb2 */
+        smlabb  WK0, AG_SRC, WK2, HALF   /* green1 */
+        smlatt  WK2, WK6, WK4, HALF      /* alpha2 */
+        smlabb  WK4, WK6, WK4, HALF      /* green2 */
+        pkhbt   WK0, WK0, WK5, lsl #16   /* ag1 */
+        uxtab16 WK3, WK3, WK3, ror #8
+        pkhbt   WK4, WK4, WK2, lsl #16   /* ag2 */
+        uxtab16 WK0, WK0, WK0, ror #8
+        uxtab16 WK4, WK4, WK4, ror #8
+        mov     WK1, WK1, ror #8
+        mov     WK3, WK3, ror #8
+        sel     WK2, WK1, WK0            /* recombine source*mask */
+        sel     WK1, WK3, WK4            /* recombine dest*(1-source_alpha*mask) */
+        uqadd8  WK0, WK1, WK2            /* followed by 1 stall */
+30:     /* The destination buffer is already in the L1 cache, so
+         * there's little point in amalgamating writes */
+        pixst   , 4, 0, DST
+40:
+.endm
+
+.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .rept (numbytes / 4) - 1
+        over_n_8888_8888_ca_1pixel_head
+        over_n_8888_8888_ca_1pixel_tail
+ .endr
+        over_n_8888_8888_ca_1pixel_head
+.endm
+
+.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
+        over_n_8888_8888_ca_1pixel_tail
+.endm
+
+pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
+        ldr     ip, [sp]
+        cmp     ip, #-1
+        beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
+        /* else drop through... */
+ .endfunc
+generate_composite_function \
+    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    over_n_8888_8888_ca_init, \
+    nop_macro, /* newline */ \
+    over_n_8888_8888_ca_cleanup, \
+    over_n_8888_8888_ca_process_head, \
+    over_n_8888_8888_ca_process_tail
+
+/******************************************************************************/
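+
+/* IN_REVERSE keeps the destination, scaled by the source alpha:
+ *
+ *     result = dst * alpha(src) / 255    (per channel)
+ *
+ * Only the alpha byte of each source pixel is needed, which is why the
+ * implementation below offsets SRC by 3 and loads single bytes, and why it
+ * can shortcut runs of source pixels that are all fully transparent (the
+ * destination becomes 0) or all fully opaque (the destination is unchanged).
+ */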
+
+.macro in_reverse_8888_8888_init
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        /* Offset the source pointer: we only need the alpha bytes */
+        add     SRC, SRC, #3
+        line_saved_regs  ORIG_W
+.endm
+
+.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
+        ldrb    ORIG_W, [SRC], #4
+ .if numbytes >= 8
+        ldrb    WK&reg1, [SRC], #4
+  .if numbytes == 16
+        ldrb    WK&reg2, [SRC], #4
+        ldrb    WK&reg3, [SRC], #4
+  .endif
+ .endif
+        add     DST, DST, #numbytes
+.endm
+
+.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        in_reverse_8888_8888_head  numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+.endm
+
+.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
+ .if is_only != 1
+        movs    s, ORIG_W
+  .if offset != 0
+        ldrb    ORIG_W, [SRC, #offset]
+  .endif
+        beq     01f
+        teq     STRIDE_M, #0xFF
+        beq     02f
+ .endif
+        uxtb16  SCRATCH, d                 /* rb_dest */
+        uxtb16  d, d, ror #8               /* ag_dest */
+        mla     SCRATCH, SCRATCH, s, MASK
+        mla     d, d, s, MASK
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 d, d, d, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     d, SCRATCH, d
+        b       02f
+ .if offset == 0
+48:     /* Last mov d,#0 of the set - used as part of shortcut for
+         * source values all 0 */
+ .endif
+01:     mov     d, #0
+02:
+.endm
+
+.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
+ .if numbytes == 4
+        teq     ORIG_W, ORIG_W, asr #32
+        ldrne   WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+        teq     ORIG_W, WK&reg1
+        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
+        ldmnedb DST, {WK&reg1-WK&reg2}
+ .else
+        teq     ORIG_W, WK&reg1
+        teqeq   ORIG_W, WK&reg2
+        teqeq   ORIG_W, WK&reg3
+        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
+        ldmnedb DST, {WK&reg1-WK&reg4}
+ .endif
+        cmnne   DST, #0   /* clear C if NE */
+        bcs     49f       /* no writes to dest if source all -1 */
+        beq     48f       /* set dest to all 0 if source all 0 */
+ .if numbytes == 4
+        in_reverse_8888_8888_1pixel  ORIG_W, WK&reg1, 0, 1
+        str     WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -4, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, 0, 0
+        stmdb   DST, {WK&reg1-WK&reg2}
+ .else
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -12, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, -8, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg3, -4, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg4, 0, 0
+        stmdb   DST, {WK&reg1-WK&reg4}
+ .endif
+49:
+.endm
+
+.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
+        in_reverse_8888_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+.endm
+
+generate_composite_function \
+    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST, \
+    2, /* prefetch distance */ \
+    in_reverse_8888_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    in_reverse_8888_8888_process_head, \
+    in_reverse_8888_8888_process_tail
+
+/******************************************************************************/
+
+.macro over_n_8888_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Hold multiplier for destination in STRIDE_M */
+        mov     STRIDE_M, #255
+        sub     STRIDE_M, STRIDE_M, SRC, lsr #24
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+.endm
+
+.macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   , numbytes, firstreg, DST, 0
+.endm
+
+.macro over_n_8888_1pixel dst
+        mul_8888_8  WK&dst, STRIDE_M, SCRATCH, MASK
+        uqadd8  WK&dst, WK&dst, SRC
+.endm
+
+.macro over_n_8888_process_tail  cond, numbytes, firstreg
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+        over_n_8888_1pixel %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_asm_armv6, 0, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE, \
+    2, /* prefetch distance */ \
+    
over_n_8888_init, \ +    nop_macro, /* newline */ \ +    nop_macro, /* cleanup */ \ +    over_n_8888_process_head, \ +    over_n_8888_process_tail + +/******************************************************************************/ diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h b/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h new file mode 100644 index 0000000..da153c3 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm-simd-asm.h @@ -0,0 +1,966 @@ +/* + * Copyright © 2012 Raspberry Pi Foundation + * Copyright © 2012 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  The copyright holders make no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author:  Ben Avison (bavison@riscosopen.org) + * + */ + +/* + * Because the alignment of pixel data to cachelines, and even the number of + * cachelines per row can vary from row to row, and because of the need to + * preload each scanline once and only once, this prefetch strategy treats + * each row of pixels independently. When a pixel row is long enough, there + * are three distinct phases of prefetch: + * * an inner loop section, where each time a cacheline of data is + *    processed, another cacheline is preloaded (the exact distance ahead is + *    determined empirically using profiling results from lowlevel-blt-bench) + * * a leading section, where enough cachelines are preloaded to ensure no + *    cachelines escape being preloaded when the inner loop starts + * * a trailing section, where a limited number (0 or more) of cachelines + *    are preloaded to deal with data (if any) that hangs off the end of the + *    last iteration of the inner loop, plus any trailing bytes that were not + *    enough to make up one whole iteration of the inner loop + *  + * There are (in general) three distinct code paths, selected between + * depending upon how long the pixel row is. If it is long enough that there + * is at least one iteration of the inner loop (as described above) then + * this is described as the "wide" case. If it is shorter than that, but + * there are still enough bytes output that there is at least one 16-byte- + * long, 16-byte-aligned write to the destination (the optimum type of + * write), then this is the "medium" case. If it is not even this long, then + * this is the "narrow" case, and there is no attempt to align writes to + * 16-byte boundaries. 
In the "medium" and "narrow" cases, all the + * cachelines containing data from the pixel row are prefetched up-front. + */ + +/* + * Determine whether we put the arguments on the stack for debugging. + */ +#undef DEBUG_PARAMS + +/* + * Bit flags for 'generate_composite_function' macro which are used + * to tune generated functions behavior. + */ +.set FLAG_DST_WRITEONLY,         0 +.set FLAG_DST_READWRITE,         1 +.set FLAG_COND_EXEC,             0 +.set FLAG_BRANCH_OVER,           2 +.set FLAG_PROCESS_PRESERVES_PSR, 0 +.set FLAG_PROCESS_CORRUPTS_PSR,  4 +.set FLAG_PROCESS_DOESNT_STORE,  0 +.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */ +.set FLAG_NO_SPILL_LINE_VARS,        0 +.set FLAG_SPILL_LINE_VARS_WIDE,      16 +.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32 +.set FLAG_SPILL_LINE_VARS,           48 +.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0 +.set FLAG_PROCESS_PRESERVES_SCRATCH, 64 +.set FLAG_PROCESS_PRESERVES_WK0,     0 +.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */ +.set FLAG_PRELOAD_DST,               0 +.set FLAG_NO_PRELOAD_DST,            256 + +/* + * Number of bytes by which to adjust preload offset of destination + * buffer (allows preload instruction to be moved before the load(s)) + */ +.set DST_PRELOAD_BIAS, 0 + +/* + * Offset into stack where mask and source pointer/stride can be accessed. + */ +#ifdef DEBUG_PARAMS +.set ARGS_STACK_OFFSET,        (9*4+9*4) +#else +.set ARGS_STACK_OFFSET,        (9*4) +#endif + +/* + * Offset into stack where space allocated during init macro can be accessed. + */ +.set LOCALS_STACK_OFFSET,     0 + +/* + * Constants for selecting preferable prefetch type. + */ +.set PREFETCH_TYPE_NONE,       0 +.set PREFETCH_TYPE_STANDARD,   1 + +/* + * Definitions of macros for load/store of pixel data. 
+ */
+
+.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
+ .if numbytes == 16
+  .if unaligned == 1
+        op&r&cond    WK&reg0, [base], #4
+        op&r&cond    WK&reg1, [base], #4
+        op&r&cond    WK&reg2, [base], #4
+        op&r&cond    WK&reg3, [base], #4
+  .else
+        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+  .endif
+ .elseif numbytes == 8
+  .if unaligned == 1
+        op&r&cond    WK&reg0, [base], #4
+        op&r&cond    WK&reg1, [base], #4
+  .else
+        op&m&cond&ia base!, {WK&reg0,WK&reg1}
+  .endif
+ .elseif numbytes == 4
+        op&r&cond    WK&reg0, [base], #4
+ .elseif numbytes == 2
+        op&r&cond&h  WK&reg0, [base], #2
+ .elseif numbytes == 1
+        op&r&cond&b  WK&reg0, [base], #1
+ .else
+  .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
+ .if numbytes == 16
+        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+ .elseif numbytes == 8
+        stm&cond&db base, {WK&reg0,WK&reg1}
+ .elseif numbytes == 4
+        str&cond    WK&reg0, [base, #-4]
+ .elseif numbytes == 2
+        str&cond&h  WK&reg0, [base, #-2]
+ .elseif numbytes == 1
+        str&cond&b  WK&reg0, [base, #-1]
+ .else
+  .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixld cond, numbytes, firstreg, base, unaligned
+        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+.endm
+
+.macro pixst cond, numbytes, firstreg, base
+ .if (flags) & FLAG_DST_READWRITE
+        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .else
+        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .endif
+.endm
+
+.macro PF a, x:vararg
+ .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
+        a x
+ .endif
+.endm
+
+
+.macro preload_leading_step1  bpp, ptr, base
+/* If the destination is already 16-byte aligned, then we need to preload
+ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
+ * are no gaps when the inner loop starts.
+ */
+ .if bpp > 0
+        PF  bic,    ptr, base, #31
+  .set OFFSET, 0
+  .rept prefetch_distance+1
+        PF  pld,    [ptr, #OFFSET]
+   .set OFFSET, OFFSET+32
+  .endr
+ .endif
+.endm
+
+.macro preload_leading_step2  bpp, bpp_shift, ptr, base
+/* However, if the destination is not 16-byte aligned, we may need to
+ * preload more cache lines than that. The question we need to ask is:
+ * are the bytes corresponding to the leading pixels more than the amount
+ * by which the source pointer will be rounded down for preloading, and if
+ * so, by how many cache lines? Effectively, we want to calculate
+ *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
+ *     inner_loop_offset = (src+leading_bytes)&31
+ *     extra_needed = leading_bytes - inner_loop_offset
+ * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
+ * possible when there are 4 src bytes for every 1 dst byte).
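+ * As a rough worked example (illustrative numbers only): preloading an
+ * r5g6b5 source for an a8r8g8b8 destination (src_bpp = 16, dst_bpp = 32),
+ * with dst % 16 == 4 and src % 32 == 30, gives
+ *     leading_bytes     = ((-dst)&15)*16/32 = 12*16/32 = 6
+ *     inner_loop_offset = (30+6)&31         = 4
+ *     extra_needed      = 6 - 4             = 2
+ * which falls in the "<= 32" band, so one extra cache line is preloaded
+ * before the inner loop is entered.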
+ */ + .if bpp > 0 +  .ifc base,DST +        /* The test can be simplified further when preloading the destination */ +        PF  tst,    base, #16 +        PF  beq,    61f +  .else +   .if bpp/dst_w_bpp == 4 +        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift +        PF  and,    SCRATCH, SCRATCH, #31 +        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift +        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */ +        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */ +        PF  bcs,    61f +        PF  bpl,    60f +        PF  pld,    [ptr, #32*(prefetch_distance+2)] +   .else +        PF  mov,    SCRATCH, base, lsl #32-5 +        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift +        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift +        PF  bls,    61f +   .endif +  .endif +60:     PF  pld,    [ptr, #32*(prefetch_distance+1)] +61: + .endif +.endm + +#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) +.macro preload_middle   bpp, base, scratch_holds_offset + .if bpp > 0 +        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ +  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) +   .if scratch_holds_offset +        PF  pld,    [base, SCRATCH] +   .else +        PF  bic,    SCRATCH, base, #31 +        PF  pld,    [SCRATCH, #32*prefetch_distance] +   .endif +  .endif + .endif +.endm + +.macro preload_trailing  bpp, bpp_shift, base + .if bpp > 0 +  .if bpp*pix_per_block > 256 +        /* Calculations are more complex if more than one fetch per block */ +        PF  and,    WK1, base, #31 +        PF  add,    WK1, WK1, WK0, lsl #bpp_shift +        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) +        PF  bic,    SCRATCH, base, #31 +80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)] +        PF  add,    SCRATCH, SCRATCH, #32 +        PF  subs,   WK1, WK1, #32 +        PF  bhi,    80b +  .else +        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ +        PF  mov,    SCRATCH, base, lsl #32-5 +        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift +        PF  adceqs, SCRATCH, SCRATCH, #0 +        /* The instruction above has two effects: ensures Z is only +         * set if C was clear (so Z indicates that both shifted quantities +         * were 0), and clears C if Z was set (so C indicates that the sum +         * of the shifted quantities was greater and not equal to 32) */ +        PF  beq,    82f +        PF  bic,    SCRATCH, base, #31 +        PF  bcc,    81f +        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)] +81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)] +82: +  .endif + .endif +.endm + + +.macro preload_line    narrow_case, bpp, bpp_shift, base +/* "narrow_case" - just means that the macro was invoked from the "narrow" + *    code path rather than the "medium" one - because in the narrow case, + *    the row of pixels is known to output no more than 30 bytes, then + *    (assuming the source pixels are no wider than the the destination + *    pixels) they cannot possibly straddle more than 2 32-byte cachelines, + *    meaning there's no need for a loop. 
+ * "bpp" - number of bits per pixel in the channel (source, mask or + *    destination) that's being preloaded, or 0 if this channel is not used + *    for reading + * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) + * "base" - base address register of channel to preload (SRC, MASK or DST) + */ + .if bpp > 0 +  .if narrow_case && (bpp <= dst_w_bpp) +        /* In these cases, each line for each channel is in either 1 or 2 cache lines */ +        PF  bic,    WK0, base, #31 +        PF  pld,    [WK0] +        PF  add,    WK1, base, X, LSL #bpp_shift +        PF  sub,    WK1, WK1, #1 +        PF  bic,    WK1, WK1, #31 +        PF  cmp,    WK1, WK0 +        PF  beq,    90f +        PF  pld,    [WK1] +90: +  .else +        PF  bic,    WK0, base, #31 +        PF  pld,    [WK0] +        PF  add,    WK1, base, X, lsl #bpp_shift +        PF  sub,    WK1, WK1, #1 +        PF  bic,    WK1, WK1, #31 +        PF  cmp,    WK1, WK0 +        PF  beq,    92f +91:     PF  add,    WK0, WK0, #32 +        PF  cmp,    WK0, WK1 +        PF  pld,    [WK0] +        PF  bne,    91b +92: +  .endif + .endif +.endm + + +.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx +        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 + .if decrementx +        sub&cond X, X, #8*numbytes/dst_w_bpp + .endif +        process_tail  cond, numbytes, firstreg + .if !((flags) & FLAG_PROCESS_DOES_STORE) +        pixst   cond, numbytes, firstreg, DST + .endif +.endm + +.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + .if (flags) & FLAG_BRANCH_OVER +  .ifc cond,mi +        bpl     100f +  .endif +  .ifc cond,cs +        bcc     100f +  .endif +  .ifc cond,ne +        beq     100f +  .endif +        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx +100: + .else +        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + .endif +.endm + +.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx + .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) +        /* Can't interleave reads and writes */ +        test +        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx +  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR +        test +  .endif +        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx + .else +        /* Can interleave reads and writes for better scheduling */ +        test +        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 +        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 +  .if decrementx +        sub&cond1 X, X, #8*numbytes1/dst_w_bpp +        sub&cond2 X, X, #8*numbytes2/dst_w_bpp +  .endif +        process_tail  cond1, numbytes1, firstreg1 +        process_tail  cond2, numbytes2, firstreg2 +        pixst   cond1, numbytes1, firstreg1, DST +        pixst   cond2, numbytes2, firstreg2, DST + .endif +.endm + + +.macro test_bits_1_0_ptr + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 +        movs    SCRATCH, X, lsl #32-1  /* C,N = 
bits 1,0 of DST */ + .else +        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */ + .endif +.endm + +.macro test_bits_3_2_ptr + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 +        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */ + .else +        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */ + .endif +.endm + +.macro leading_15bytes  process_head, process_tail +        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */ + .set DECREMENT_X, 1 + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 +  .set DECREMENT_X, 0 +        sub     X, X, WK0, lsr #dst_bpp_shift +        str     X, [sp, #LINE_SAVED_REG_COUNT*4] +        mov     X, WK0 + .endif +        /* Use unaligned loads in all cases for simplicity */ + .if dst_w_bpp == 8 +        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X + .elseif dst_w_bpp == 16 +        test_bits_1_0_ptr +        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X + .endif +        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 +        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4] + .endif +.endm + +.macro test_bits_3_2_pix +        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3 +.endm + +.macro test_bits_1_0_pix + .if dst_w_bpp == 8 +        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1 + .else +        movs    SCRATCH, X, lsr #1 + .endif +.endm + +.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask +        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 + .if dst_w_bpp == 16 +        test_bits_1_0_pix +        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 + .elseif dst_w_bpp == 8 +        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 + .endif +.endm + + +.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment +110: + .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ + .rept pix_per_block*dst_w_bpp/128 +        process_head  , 16, 0, unaligned_src, unaligned_mask, 1 +  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) +        preload_middle  src_bpp, SRC, 1 +  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) +        preload_middle  mask_bpp, MASK, 1 +  .else +        preload_middle  src_bpp, SRC, 0 +        preload_middle  mask_bpp, MASK, 0 +  .endif +  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) +        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that +         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset +         * preloads for, to achieve staggered prefetches for multiple channels, because there are +         * always two STMs per prefetch, so there is always an opposite STM on which to put the +         * preload. 
Note, no need to BIC the base register here */ +        PF  pld,    [DST, #32*prefetch_distance - dst_alignment] +  .endif +        process_tail  , 16, 0 +  .if !((flags) & FLAG_PROCESS_DOES_STORE) +        pixst   , 16, 0, DST +  .endif +  .set SUBBLOCK, SUBBLOCK+1 + .endr +        subs    X, X, #pix_per_block +        bhs     110b +.endm + +.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask +        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ + .if dst_r_bpp > 0 +        tst     DST, #16 +        bne     111f +        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS +        b       112f +111: + .endif +        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS +112: +        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ + .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) +        PF  and,    WK0, X, #pix_per_block-1 + .endif +        preload_trailing  src_bpp, src_bpp_shift, SRC +        preload_trailing  mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 +        preload_trailing  dst_r_bpp, dst_bpp_shift, DST + .endif +        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp +        /* The remainder of the line is handled identically to the medium case */ +        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask +.endm + +.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask +120: +        process_head  , 16, 0, unaligned_src, unaligned_mask, 0 +        process_tail  , 16, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) +        pixst   , 16, 0, DST + .endif +        subs    X, X, #128/dst_w_bpp +        bhs     120b +        /* Trailing pixels */ +        tst     X, #128/dst_w_bpp - 1 +        beq     exit_label +        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask +.endm + +.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask +        tst     X, #16*8/dst_w_bpp +        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 +        /* Trailing pixels */ +        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ +        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask +.endm + +.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label + /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ + .if mask_bpp == 8 || mask_bpp == 16 +        tst     MASK, #3 +        bne     141f + .endif +  .if src_bpp == 8 || src_bpp == 16 +        tst     SRC, #3 +        bne     140f +  .endif +        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0 +  .if src_bpp == 8 || src_bpp == 16 +        b       exit_label +140: +        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0 +  .endif + .if mask_bpp == 8 || mask_bpp == 16 +        b       exit_label +141: +  .if src_bpp == 8 || src_bpp == 16 +        tst     SRC, #3 +        bne     142f +  
.endif +        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1 +  .if src_bpp == 8 || src_bpp == 16 +        b       exit_label +142: +        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1 +  .endif + .endif +.endm + + +.macro end_of_line      restore_x, vars_spilled, loop_label, last_one + .if vars_spilled +        /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */ +        /* This is ldmia sp,{} */ +        .word   0xE89D0000 | LINE_SAVED_REGS + .endif +        subs    Y, Y, #1 + .if vars_spilled +  .if (LINE_SAVED_REGS) & (1<<1) +        str     Y, [sp] +  .endif + .endif +        add     DST, DST, STRIDE_D + .if src_bpp > 0 +        add     SRC, SRC, STRIDE_S + .endif + .if mask_bpp > 0 +        add     MASK, MASK, STRIDE_M + .endif + .if restore_x +        mov     X, ORIG_W + .endif +        bhs     loop_label + .ifc "last_one","" +  .if vars_spilled +        b       197f +  .else +        b       198f +  .endif + .else +  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) +        b       198f +  .endif + .endif +.endm + + +.macro generate_composite_function fname, \ +                                   src_bpp_, \ +                                   mask_bpp_, \ +                                   dst_w_bpp_, \ +                                   flags_, \ +                                   prefetch_distance_, \ +                                   init, \ +                                   newline, \ +                                   cleanup, \ +                                   process_head, \ +                                   process_tail, \ +                                   process_inner_loop + +    pixman_asm_function fname + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set flags, flags_ + .set prefetch_distance, prefetch_distance_ + +/* + * Select prefetch type for this function. 
+ */ + .if prefetch_distance == 0 +  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + .else +  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD + .endif + + .if src_bpp == 32 +  .set src_bpp_shift, 2 + .elseif src_bpp == 24 +  .set src_bpp_shift, 0 + .elseif src_bpp == 16 +  .set src_bpp_shift, 1 + .elseif src_bpp == 8 +  .set src_bpp_shift, 0 + .elseif src_bpp == 0 +  .set src_bpp_shift, -1 + .else +  .error "requested src bpp (src_bpp) is not supported" + .endif + + .if mask_bpp == 32 +  .set mask_bpp_shift, 2 + .elseif mask_bpp == 24 +  .set mask_bpp_shift, 0 + .elseif mask_bpp == 8 +  .set mask_bpp_shift, 0 + .elseif mask_bpp == 0 +  .set mask_bpp_shift, -1 + .else +  .error "requested mask bpp (mask_bpp) is not supported" + .endif + + .if dst_w_bpp == 32 +  .set dst_bpp_shift, 2 + .elseif dst_w_bpp == 24 +  .set dst_bpp_shift, 0 + .elseif dst_w_bpp == 16 +  .set dst_bpp_shift, 1 + .elseif dst_w_bpp == 8 +  .set dst_bpp_shift, 0 + .else +  .error "requested dst bpp (dst_w_bpp) is not supported" + .endif + + .if (((flags) & FLAG_DST_READWRITE) != 0) +  .set dst_r_bpp, dst_w_bpp + .else +  .set dst_r_bpp, 0 + .endif + + .set pix_per_block, 16*8/dst_w_bpp + .if src_bpp != 0 +  .if 32*8/src_bpp > pix_per_block +   .set pix_per_block, 32*8/src_bpp +  .endif + .endif + .if mask_bpp != 0 +  .if 32*8/mask_bpp > pix_per_block +   .set pix_per_block, 32*8/mask_bpp +  .endif + .endif + .if dst_r_bpp != 0 +  .if 32*8/dst_r_bpp > pix_per_block +   .set pix_per_block, 32*8/dst_r_bpp +  .endif + .endif + +/* The standard entry conditions set up by pixman-arm-common.h are: + * r0 = width (pixels) + * r1 = height (rows) + * r2 = pointer to top-left pixel of destination + * r3 = destination stride (pixels) + * [sp] = source pixel value, or pointer to top-left pixel of source + * [sp,#4] = 0 or source stride (pixels) + * The following arguments are unused for non-mask operations + * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask + * [sp,#12] = 0 or mask stride (pixels) + */ + +/* + * Assign symbolic names to registers + */ +    X           .req    r0  /* pixels to go on this line */ +    Y           .req    r1  /* lines to go */ +    DST         .req    r2  /* destination pixel pointer */ +    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */ +    SRC         .req    r4  /* source pixel pointer */ +    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */ +    MASK        .req    r6  /* mask pixel pointer (if applicable) */ +    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */ +    WK0         .req    r8  /* pixel data registers */ +    WK1         .req    r9 +    WK2         .req    r10 +    WK3         .req    r11 +    SCRATCH     .req    r12 +    ORIG_W      .req    r14 /* width (pixels) */ + +        push    {r4-r11, lr}        /* save all registers */ + +        subs    Y, Y, #1 +        blo     199f + +#ifdef DEBUG_PARAMS +        sub     sp, sp, #9*4 +#endif + + .if src_bpp > 0 +        ldr     SRC, [sp, #ARGS_STACK_OFFSET] +        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4] + .endif + .if mask_bpp > 0 +        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8] +        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12] + .endif +         +#ifdef DEBUG_PARAMS +        add     Y, Y, #1 +        stmia   sp, {r0-r7,pc} +        sub     Y, Y, #1 +#endif + +        init + + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 +        /* Reserve a word in which to store X during leading pixels */ +        sub     sp, sp, #4 +  .set ARGS_STACK_OFFSET, 
ARGS_STACK_OFFSET+4 +  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 + .endif +         +        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */ +        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift + .if src_bpp > 0 +        lsl     STRIDE_S, #src_bpp_shift +        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift + .endif + .if mask_bpp > 0 +        lsl     STRIDE_M, #mask_bpp_shift +        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift + .endif +  +        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */ +        cmp     X, #2*16*8/dst_w_bpp - 1 +        blo     170f + .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */ +        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */ +        cmp     X, #(prefetch_distance+3)*pix_per_block - 1 +        blo     160f + +        /* Wide case */ +        /* Adjust X so that the decrement instruction can also test for +         * inner loop termination. We want it to stop when there are +         * (prefetch_distance+1) complete blocks to go. */ +        sub     X, X, #(prefetch_distance+2)*pix_per_block +        mov     ORIG_W, X +  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE +        /* This is stmdb sp!,{} */ +        .word   0xE92D0000 | LINE_SAVED_REGS +   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 +   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 +  .endif +151:    /* New line */ +        newline +        preload_leading_step1  src_bpp, WK1, SRC +        preload_leading_step1  mask_bpp, WK2, MASK +  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 +        preload_leading_step1  dst_r_bpp, WK3, DST +  .endif +         +        ands    WK0, DST, #15 +        beq     154f +        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */ + +        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC +        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK +  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 +        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST +  .endif + +        leading_15bytes  process_head, process_tail +         +154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ +  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) +        and     SCRATCH, SRC, #31 +        rsb     SCRATCH, SCRATCH, #32*prefetch_distance +  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) +        and     SCRATCH, MASK, #31 +        rsb     SCRATCH, SCRATCH, #32*prefetch_distance +  .endif +  .ifc "process_inner_loop","" +        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f +  .else +        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f +  .endif + +157:    /* Check for another line */ +        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b +  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE +   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 +   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 +  .endif + .endif + + .ltorg + +160:    /* Medium case */ +        mov     ORIG_W, X + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE +        /* This is stmdb sp!,{} */ +        .word   
0xE92D0000 | LINE_SAVED_REGS +  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 +  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .endif +161:    /* New line */ +        newline +        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */ +        preload_line 0, mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 +        preload_line 0, dst_r_bpp, dst_bpp_shift, DST + .endif +         +        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */ +        ands    WK0, DST, #15 +        beq     164f +        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */ +         +        leading_15bytes  process_head, process_tail +         +164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */ +        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f +         +167:    /* Check for another line */ +        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b + + .ltorg + +170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ + .if dst_w_bpp < 32 +        mov     ORIG_W, X + .endif + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE +        /* This is stmdb sp!,{} */ +        .word   0xE92D0000 | LINE_SAVED_REGS + .endif +171:    /* New line */ +        newline +        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */ +        preload_line 1, mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 +        preload_line 1, dst_r_bpp, dst_bpp_shift, DST + .endif +         + .if dst_w_bpp == 8 +        tst     DST, #3 +        beq     174f +172:    subs    X, X, #1 +        blo     177f +        process_head  , 1, 0, 1, 1, 0 +        process_tail  , 1, 0 +  .if !((flags) & FLAG_PROCESS_DOES_STORE) +        pixst   , 1, 0, DST +  .endif +        tst     DST, #3 +        bne     172b + .elseif dst_w_bpp == 16 +        tst     DST, #2 +        beq     174f +        subs    X, X, #1 +        blo     177f +        process_head  , 2, 0, 1, 1, 0 +        process_tail  , 2, 0 +  .if !((flags) & FLAG_PROCESS_DOES_STORE) +        pixst   , 2, 0, DST +  .endif + .endif + +174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ +        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f + +177:    /* Check for another line */ +        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE +  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 +  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .endif + +197: + .if (flags) & FLAG_SPILL_LINE_VARS +        add     sp, sp, #LINE_SAVED_REG_COUNT*4 + .endif +198: + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 +  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 +  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 +        add     sp, sp, #4 + .endif + +        cleanup + +#ifdef DEBUG_PARAMS +        add     sp, sp, #9*4 /* junk the debug copy of arguments */ +#endif +199: +        pop     {r4-r11, pc}  /* exit */ + + .ltorg + +    .unreq  X +    .unreq  Y +    .unreq  DST +    .unreq  STRIDE_D +    .unreq  SRC +    .unreq  STRIDE_S +    .unreq  MASK +    .unreq  STRIDE_M +    .unreq  WK0 +    .unreq  WK1 +    .unreq  WK2 +    .unreq  WK3 +    .unreq  SCRATCH +    .unreq  ORIG_W +    
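As an aside, the wide/medium/narrow dispatch encoded by the cmp/blo pairs above can be restated in C. This is only an illustrative sketch: the function and parameter names are invented, and only the two thresholds are taken from the assembly.

/* Sketch of the line-width classification done by the code above.
 * Names are invented; the thresholds mirror the cmp X, #.../blo pairs. */
static int
classify_line_width (int x, int dst_w_bpp, int pix_per_block,
                     int prefetch_distance)
{
    /* not wide enough for one aligned 16-byte block write: narrow path (170:) */
    if (x < 2 * 16 * 8 / dst_w_bpp - 1)
        return 0;
    /* cannot keep (prefetch_distance+2) 32-byte blocks ahead on every
     * prefetch channel: medium path (160:).  For fills (no src/mask/dst
     * reads) the wide and medium cases are handled the same way. */
    if (x < (prefetch_distance + 3) * pix_per_block - 1)
        return 1;
    /* otherwise: wide path (151:) */
    return 2;
}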
.endfunc +.endm + +.macro line_saved_regs  x:vararg + .set LINE_SAVED_REGS, 0 + .set LINE_SAVED_REG_COUNT, 0 + .irp SAVED_REG,x +  .ifc "SAVED_REG","Y" +   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) +   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 +  .endif +  .ifc "SAVED_REG","STRIDE_D" +   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) +   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 +  .endif +  .ifc "SAVED_REG","STRIDE_S" +   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5) +   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 +  .endif +  .ifc "SAVED_REG","STRIDE_M" +   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7) +   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 +  .endif +  .ifc "SAVED_REG","ORIG_W" +   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14) +   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 +  .endif + .endr +.endm + +.macro nop_macro x:vararg +.endm diff --git a/libs/pixman-0.40.0/pixman/pixman-arm-simd.c b/libs/pixman-0.40.0/pixman/pixman-arm-simd.c new file mode 100644 index 0000000..f0d1454 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm-simd.c @@ -0,0 +1,291 @@ +/* + * Copyright © 2008 Mozilla Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Mozilla Corporation not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Mozilla Corporation makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + * Author:  Jeff Muizelaar (jeff@infidigm.net) + * + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "pixman-private.h" +#include "pixman-arm-common.h" +#include "pixman-inlines.h" + +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888, +		                   uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888, +                                   uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_0565, +                                   uint16_t, 1, uint16_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8, +                                   uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888, +                                   uint16_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_0565, +                                   uint32_t, 1, uint16_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8, +                                   uint8_t, 1, uint8_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, +                                   uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888, +                                   uint32_t, 1, uint32_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, armv6, over_n_8888, +                                 uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888, +                                 uint32_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888, +                                     uint32_t, 1, uint32_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888, +                                      uint8_t, 1, uint32_t, 1) + +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8888_8888_ca, +                                      uint32_t, 1, uint32_t, 1) + +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC, +                                        uint16_t, uint16_t) +PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC, +                                        uint32_t, uint32_t) + +void +pixman_composite_src_n_8888_asm_armv6 (int32_t   w, +                                       int32_t   h, +                                       uint32_t *dst, +                                       int32_t   dst_stride, +                                       uint32_t  src); + +void +pixman_composite_src_n_0565_asm_armv6 (int32_t   w, +                                       int32_t   h, +                                       uint16_t *dst, +                                       int32_t   dst_stride, +                                       uint16_t  src); + +void +pixman_composite_src_n_8_asm_armv6 (int32_t   w, +                                    int32_t   h, +                                    uint8_t  *dst, +                                    int32_t   dst_stride, +                                    uint8_t  src); + +static pixman_bool_t +arm_simd_fill (pixman_implementation_t *imp, +               uint32_t *               bits, +               int                      stride, /* in 32-bit words */ +               int                      bpp, +               int                      x, +               int                      y, +               int                      width, +               int                      height, +               uint32_t                 _xor) +{ +    /* stride is always multiple of 32bit units in 
pixman */ +    uint32_t byte_stride = stride * sizeof(uint32_t); + +    switch (bpp) +    { +    case 8: +	pixman_composite_src_n_8_asm_armv6 ( +		width, +		height, +		(uint8_t *)(((char *) bits) + y * byte_stride + x), +		byte_stride, +		_xor & 0xff); +	return TRUE; +    case 16: +	pixman_composite_src_n_0565_asm_armv6 ( +		width, +		height, +		(uint16_t *)(((char *) bits) + y * byte_stride + x * 2), +		byte_stride / 2, +		_xor & 0xffff); +	return TRUE; +    case 32: +	pixman_composite_src_n_8888_asm_armv6 ( +		width, +		height, +		(uint32_t *)(((char *) bits) + y * byte_stride + x * 4), +		byte_stride / 4, +		_xor); +	return TRUE; +    default: +	return FALSE; +    } +} + +static pixman_bool_t +arm_simd_blt (pixman_implementation_t *imp, +              uint32_t *               src_bits, +              uint32_t *               dst_bits, +              int                      src_stride, /* in 32-bit words */ +              int                      dst_stride, /* in 32-bit words */ +              int                      src_bpp, +              int                      dst_bpp, +              int                      src_x, +              int                      src_y, +              int                      dest_x, +              int                      dest_y, +              int                      width, +              int                      height) +{ +    if (src_bpp != dst_bpp) +	return FALSE; + +    switch (src_bpp) +    { +    case 8: +        pixman_composite_src_8_8_asm_armv6 ( +                width, height, +                (uint8_t *)(((char *) dst_bits) + +                dest_y * dst_stride * 4 + dest_x * 1), dst_stride * 4, +                (uint8_t *)(((char *) src_bits) + +                src_y * src_stride * 4 + src_x * 1), src_stride * 4); +        return TRUE; +    case 16: +	pixman_composite_src_0565_0565_asm_armv6 ( +		width, height, +		(uint16_t *)(((char *) dst_bits) + +		dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2, +		(uint16_t *)(((char *) src_bits) + +		src_y * src_stride * 4 + src_x * 2), src_stride * 2); +	return TRUE; +    case 32: +	pixman_composite_src_8888_8888_asm_armv6 ( +		width, height, +		(uint32_t *)(((char *) dst_bits) + +		dest_y * dst_stride * 4 + dest_x * 4), dst_stride, +		(uint32_t *)(((char *) src_bits) + +		src_y * src_stride * 4 + src_x * 4), src_stride); +	return TRUE; +    default: +	return FALSE; +    } +} + +static const pixman_fast_path_t arm_simd_fast_paths[] = +{ +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, armv6_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, armv6_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888), + +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, armv6_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, armv6_composite_src_x888_8888), + +    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a1r5g5b5, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, a1b5g5r5, armv6_composite_src_0565_0565), 
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, x1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, a4r4g4b4, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, a4b4g4r4, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, x4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, x4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565), + +    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, armv6_composite_src_8_8), +    PIXMAN_STD_FAST_PATH (SRC, r3g3b2, null, r3g3b2, armv6_composite_src_8_8), +    PIXMAN_STD_FAST_PATH (SRC, b2g3r3, null, b2g3r3, armv6_composite_src_8_8), +    PIXMAN_STD_FAST_PATH (SRC, a2r2g2b2, null, a2r2g2b2, armv6_composite_src_8_8), +    PIXMAN_STD_FAST_PATH (SRC, a2b2g2r2, null, a2b2g2r2, armv6_composite_src_8_8), +    PIXMAN_STD_FAST_PATH (SRC, c8, null, c8, armv6_composite_src_8_8), +    PIXMAN_STD_FAST_PATH (SRC, g8, null, g8, armv6_composite_src_8_8), +    PIXMAN_STD_FAST_PATH (SRC, x4a4, null, x4a4, armv6_composite_src_8_8), +    PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8), +    PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8), + +    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, armv6_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, armv6_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888), + +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, armv6_composite_src_x888_0565), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, armv6_composite_src_x888_0565), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, armv6_composite_src_x888_0565), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, armv6_composite_src_x888_0565), + +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888), + +    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, armv6_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, armv6_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8b8g8r8, armv6_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8b8g8r8, armv6_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, 
armv6_composite_over_reverse_n_8888), +    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, armv6_composite_over_reverse_n_8888), + +    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8), + +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888), + +    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, a8r8g8b8, armv6_composite_in_reverse_8888_8888), +    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, x8r8g8b8, armv6_composite_in_reverse_8888_8888), +    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, a8b8g8r8, armv6_composite_in_reverse_8888_8888), +    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, x8b8g8r8, armv6_composite_in_reverse_8888_8888), + +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca), + +    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565), +    SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565), + +    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888), + +    { PIXMAN_OP_NONE }, +}; + +pixman_implementation_t * +_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback) +{ +    pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths); + +    imp->blt = arm_simd_blt; +    imp->fill = arm_simd_fill; + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-arm.c b/libs/pixman-0.40.0/pixman/pixman-arm.c new file mode 100644 index 0000000..4a2ae85 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-arm.c @@ -0,0 +1,250 @@ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  SuSE makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. 
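A usage note on the table above: _pixman_implementation_create_arm_simd only layers these fast paths (plus the blt and fill hooks) on top of a fallback implementation; the entries are matched by pixman's generic dispatcher at composite time. Below is a minimal sketch of a call that the SRC a8r8g8b8 -> a8r8g8b8 entry is meant to catch, using only the public pixman API. Sizes and pixel values are arbitrary, and whether the ARMv6 path actually runs depends on build options and runtime CPU detection.

#include <stdint.h>
#include <pixman.h>

int main (void)
{
    static uint32_t src_bits[16 * 16];
    static uint32_t dst_bits[16 * 16];

    pixman_image_t *src = pixman_image_create_bits (PIXMAN_a8r8g8b8, 16, 16,
                                                    src_bits, 16 * 4);
    pixman_image_t *dst = pixman_image_create_bits (PIXMAN_a8r8g8b8, 16, 16,
                                                    dst_bits, 16 * 4);

    /* A plain SRC copy, no mask, no transform: the shape matched by the
     * armv6_composite_src_8888_8888 fast-path entry on an ARMv6 build. */
    pixman_image_composite32 (PIXMAN_OP_SRC, src, NULL, dst,
                              0, 0, 0, 0, 0, 0, 16, 16);

    pixman_image_unref (src);
    pixman_image_unref (dst);
    return 0;
}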
+ * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "pixman-private.h" + +typedef enum +{ +    ARM_V7		= (1 << 0), +    ARM_V6		= (1 << 1), +    ARM_VFP		= (1 << 2), +    ARM_NEON		= (1 << 3), +    ARM_IWMMXT		= (1 << 4) +} arm_cpu_features_t; + +#if defined(USE_ARM_SIMD) || defined(USE_ARM_NEON) || defined(USE_ARM_IWMMXT) + +#if defined(_MSC_VER) + +/* Needed for EXCEPTION_ILLEGAL_INSTRUCTION */ +#include <windows.h> + +extern int pixman_msvc_try_arm_neon_op (); +extern int pixman_msvc_try_arm_simd_op (); + +static arm_cpu_features_t +detect_cpu_features (void) +{ +    arm_cpu_features_t features = 0; + +    __try +    { +	pixman_msvc_try_arm_simd_op (); +	features |= ARM_V6; +    } +    __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) +    { +    } + +    __try +    { +	pixman_msvc_try_arm_neon_op (); +	features |= ARM_NEON; +    } +    __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) +    { +    } + +    return features; +} + +#elif defined(__APPLE__) && defined(TARGET_OS_IPHONE) /* iOS */ + +#include "TargetConditionals.h" + +static arm_cpu_features_t +detect_cpu_features (void) +{ +    arm_cpu_features_t features = 0; + +    features |= ARM_V6; + +    /* Detection of ARM NEON on iOS is fairly simple because iOS binaries +     * contain separate executable images for each processor architecture. +     * So all we have to do is detect the armv7 architecture build. The +     * operating system automatically runs the armv7 binary for armv7 devices +     * and the armv6 binary for armv6 devices. +     */ +#if defined(__ARM_NEON__) +    features |= ARM_NEON; +#endif + +    return features; +} + +#elif defined(__ANDROID__) || defined(ANDROID) /* Android */ + +#include <cpu-features.h> + +static arm_cpu_features_t +detect_cpu_features (void) +{ +    arm_cpu_features_t features = 0; +    AndroidCpuFamily cpu_family; +    uint64_t cpu_features; + +    cpu_family = android_getCpuFamily(); +    cpu_features = android_getCpuFeatures(); + +    if (cpu_family == ANDROID_CPU_FAMILY_ARM) +    { +	if (cpu_features & ANDROID_CPU_ARM_FEATURE_ARMv7) +	    features |= ARM_V7; + +	if (cpu_features & ANDROID_CPU_ARM_FEATURE_VFPv3) +	    features |= ARM_VFP; + +	if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) +	    features |= ARM_NEON; +    } + +    return features; +} + +#elif defined (__linux__) /* linux ELF */ + +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <string.h> +#include <elf.h> + +static arm_cpu_features_t +detect_cpu_features (void) +{ +    arm_cpu_features_t features = 0; +    Elf32_auxv_t aux; +    int fd; + +    fd = open ("/proc/self/auxv", O_RDONLY); +    if (fd >= 0) +    { +	while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) +	{ +	    if (aux.a_type == AT_HWCAP) +	    { +		uint32_t hwcap = aux.a_un.a_val; + +		/* hardcode these values to avoid depending on specific +		 * versions of the hwcap header, e.g. 
HWCAP_NEON +		 */ +		if ((hwcap & 64) != 0) +		    features |= ARM_VFP; +		if ((hwcap & 512) != 0) +		    features |= ARM_IWMMXT; +		/* this flag is only present on kernel 2.6.29 */ +		if ((hwcap & 4096) != 0) +		    features |= ARM_NEON; +	    } +	    else if (aux.a_type == AT_PLATFORM) +	    { +		const char *plat = (const char*) aux.a_un.a_val; + +		if (strncmp (plat, "v7l", 3) == 0) +		    features |= (ARM_V7 | ARM_V6); +		else if (strncmp (plat, "v6l", 3) == 0) +		    features |= ARM_V6; +	    } +	} +	close (fd); +    } + +    return features; +} + +#elif defined (_3DS) /* 3DS homebrew (devkitARM) */ + +static arm_cpu_features_t +detect_cpu_features (void) +{ +    arm_cpu_features_t features = 0; + +    features |= ARM_V6; + +    return features; +} + +#elif defined (PSP2) || defined (__SWITCH__) +/* Vita (VitaSDK) or Switch (devkitA64) homebrew */ + +static arm_cpu_features_t +detect_cpu_features (void) +{ +    arm_cpu_features_t features = 0; + +    features |= ARM_NEON; + +    return features; +} + +#else /* Unknown */ + +static arm_cpu_features_t +detect_cpu_features (void) +{ +    return 0; +} + +#endif /* Linux elf */ + +static pixman_bool_t +have_feature (arm_cpu_features_t feature) +{ +    static pixman_bool_t initialized; +    static arm_cpu_features_t features; + +    if (!initialized) +    { +	features = detect_cpu_features(); +	initialized = TRUE; +    } + +    return (features & feature) == feature; +} + +#endif /* USE_ARM_SIMD || USE_ARM_NEON || USE_ARM_IWMMXT */ + +pixman_implementation_t * +_pixman_arm_get_implementations (pixman_implementation_t *imp) +{ +#ifdef USE_ARM_SIMD +    if (!_pixman_disabled ("arm-simd") && have_feature (ARM_V6)) +	imp = _pixman_implementation_create_arm_simd (imp); +#endif + +#ifdef USE_ARM_IWMMXT +    if (!_pixman_disabled ("arm-iwmmxt") && have_feature (ARM_IWMMXT)) +	imp = _pixman_implementation_create_mmx (imp); +#endif + +#ifdef USE_ARM_NEON +    if (!_pixman_disabled ("arm-neon") && have_feature (ARM_NEON)) +	imp = _pixman_implementation_create_arm_neon (imp); +#endif + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-bits-image.c b/libs/pixman-0.40.0/pixman/pixman-bits-image.c new file mode 100644 index 0000000..4cfabe3 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-bits-image.c @@ -0,0 +1,1379 @@ +/* + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + *             2005 Lars Knoll & Zack Rusin, Trolltech + *             2008 Aaron Plattner, NVIDIA Corporation + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007, 2009 Red Hat, Inc. + * Copyright © 2008 André Tupinambá <andrelrt@gmail.com> + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. 
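An aside on the hardcoded hwcap values above: 64, 512 and 4096 are the ARM Linux HWCAP_VFP, HWCAP_IWMMXT and HWCAP_NEON bits, written out as literals so the code does not depend on a particular kernel header. On glibc 2.16 and later the same AT_HWCAP word is available through getauxval(); the following is a hedged alternative sketch, not what pixman itself does, with the constants defined locally rather than taken from any particular header.

#include <sys/auxv.h>

/* ARM Linux hwcap bits, defined locally for illustration; they match the
 * hardcoded 64/512/4096 values used by detect_cpu_features () above. */
#define ILLUSTRATIVE_HWCAP_VFP    (1u << 6)   /* 64 */
#define ILLUSTRATIVE_HWCAP_IWMMXT (1u << 9)   /* 512 */
#define ILLUSTRATIVE_HWCAP_NEON   (1u << 12)  /* 4096 */

static int
cpu_has_neon_getauxval (void)
{
    /* getauxval () reads the same AT_HWCAP entry that the code above
     * extracts from /proc/self/auxv by hand. */
    return (getauxval (AT_HWCAP) & ILLUSTRATIVE_HWCAP_NEON) != 0;
}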
+ * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-inlines.h" +#include "dither/blue-noise-64x64.h" + +/* Fetch functions */ + +static force_inline void +fetch_pixel_no_alpha_32 (bits_image_t *image, +			 int x, int y, pixman_bool_t check_bounds, +			 void *out) +{ +    uint32_t *ret = out; + +    if (check_bounds && +	(x < 0 || x >= image->width || y < 0 || y >= image->height)) +	*ret = 0; +    else +	*ret = image->fetch_pixel_32 (image, x, y); +} + +static force_inline void +fetch_pixel_no_alpha_float (bits_image_t *image, +			    int x, int y, pixman_bool_t check_bounds, +			    void *out) +{ +    argb_t *ret = out; + +    if (check_bounds && +	(x < 0 || x >= image->width || y < 0 || y >= image->height)) +	ret->a = ret->r = ret->g = ret->b = 0.f; +    else +	*ret = image->fetch_pixel_float (image, x, y); +} + +typedef void (* get_pixel_t) (bits_image_t *image, +			      int x, int y, pixman_bool_t check_bounds, void *out); + +static force_inline void +bits_image_fetch_pixel_nearest (bits_image_t   *image, +				pixman_fixed_t  x, +				pixman_fixed_t  y, +				get_pixel_t	get_pixel, +				void	       *out) +{ +    int x0 = pixman_fixed_to_int (x - pixman_fixed_e); +    int y0 = pixman_fixed_to_int (y - pixman_fixed_e); + +    if (image->common.repeat != PIXMAN_REPEAT_NONE) +    { +	repeat (image->common.repeat, &x0, image->width); +	repeat (image->common.repeat, &y0, image->height); + +	get_pixel (image, x0, y0, FALSE, out); +    } +    else +    { +	get_pixel (image, x0, y0, TRUE, out); +    } +} + +static force_inline void +bits_image_fetch_pixel_bilinear_32 (bits_image_t   *image, +				    pixman_fixed_t  x, +				    pixman_fixed_t  y, +				    get_pixel_t	    get_pixel, +				    void	   *out) +{ +    pixman_repeat_t repeat_mode = image->common.repeat; +    int width = image->width; +    int height = image->height; +    int x1, y1, x2, y2; +    uint32_t tl, tr, bl, br; +    int32_t distx, disty; +    uint32_t *ret = out; + +    x1 = x - pixman_fixed_1 / 2; +    y1 = y - pixman_fixed_1 / 2; + +    distx = pixman_fixed_to_bilinear_weight (x1); +    disty = pixman_fixed_to_bilinear_weight (y1); + +    x1 = pixman_fixed_to_int (x1); +    y1 = pixman_fixed_to_int (y1); +    x2 = x1 + 1; +    y2 = y1 + 1; + +    if (repeat_mode != PIXMAN_REPEAT_NONE) +    { +	repeat (repeat_mode, &x1, width); +	repeat (repeat_mode, &y1, height); +	repeat (repeat_mode, &x2, width); +	repeat (repeat_mode, &y2, height); + +	get_pixel (image, x1, y1, FALSE, &tl); +	get_pixel (image, x2, y1, FALSE, &tr); +	get_pixel (image, x1, y2, FALSE, &bl); +	get_pixel (image, x2, y2, FALSE, &br); +    } +    else +    { +	get_pixel (image, x1, y1, TRUE, &tl); +	get_pixel (image, x2, y1, TRUE, &tr); +	get_pixel (image, x1, y2, TRUE, &bl); +	get_pixel (image, x2, y2, TRUE, &br); +    } + +    *ret = bilinear_interpolation (tl, tr, bl, br, distx, disty); +} + +static force_inline void 
+bits_image_fetch_pixel_bilinear_float (bits_image_t   *image, +				       pixman_fixed_t  x, +				       pixman_fixed_t  y, +				       get_pixel_t     get_pixel, +				       void	      *out) +{ +    pixman_repeat_t repeat_mode = image->common.repeat; +    int width = image->width; +    int height = image->height; +    int x1, y1, x2, y2; +    argb_t tl, tr, bl, br; +    float distx, disty; +    argb_t *ret = out; + +    x1 = x - pixman_fixed_1 / 2; +    y1 = y - pixman_fixed_1 / 2; + +    distx = ((float)pixman_fixed_fraction(x1)) / 65536.f; +    disty = ((float)pixman_fixed_fraction(y1)) / 65536.f; + +    x1 = pixman_fixed_to_int (x1); +    y1 = pixman_fixed_to_int (y1); +    x2 = x1 + 1; +    y2 = y1 + 1; + +    if (repeat_mode != PIXMAN_REPEAT_NONE) +    { +	repeat (repeat_mode, &x1, width); +	repeat (repeat_mode, &y1, height); +	repeat (repeat_mode, &x2, width); +	repeat (repeat_mode, &y2, height); + +	get_pixel (image, x1, y1, FALSE, &tl); +	get_pixel (image, x2, y1, FALSE, &tr); +	get_pixel (image, x1, y2, FALSE, &bl); +	get_pixel (image, x2, y2, FALSE, &br); +    } +    else +    { +	get_pixel (image, x1, y1, TRUE, &tl); +	get_pixel (image, x2, y1, TRUE, &tr); +	get_pixel (image, x1, y2, TRUE, &bl); +	get_pixel (image, x2, y2, TRUE, &br); +    } + +    *ret = bilinear_interpolation_float (tl, tr, bl, br, distx, disty); +} + +static force_inline void accum_32(unsigned int *satot, unsigned int *srtot, +				  unsigned int *sgtot, unsigned int *sbtot, +				  const void *p, pixman_fixed_t f) +{ +    uint32_t pixel = *(uint32_t *)p; + +    *srtot += (int)RED_8 (pixel) * f; +    *sgtot += (int)GREEN_8 (pixel) * f; +    *sbtot += (int)BLUE_8 (pixel) * f; +    *satot += (int)ALPHA_8 (pixel) * f; +} + +static force_inline void reduce_32(unsigned int satot, unsigned int srtot, +				   unsigned int sgtot, unsigned int sbtot, +                                   void *p) +{ +    uint32_t *ret = p; + +    satot = (satot + 0x8000) >> 16; +    srtot = (srtot + 0x8000) >> 16; +    sgtot = (sgtot + 0x8000) >> 16; +    sbtot = (sbtot + 0x8000) >> 16; + +    satot = CLIP (satot, 0, 0xff); +    srtot = CLIP (srtot, 0, 0xff); +    sgtot = CLIP (sgtot, 0, 0xff); +    sbtot = CLIP (sbtot, 0, 0xff); + +    *ret = ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot)); +} + +static force_inline void accum_float(unsigned int *satot, unsigned int *srtot, +				     unsigned int *sgtot, unsigned int *sbtot, +				     const void *p, pixman_fixed_t f) +{ +    const argb_t *pixel = p; + +    *satot += pixel->a * f; +    *srtot += pixel->r * f; +    *sgtot += pixel->g * f; +    *sbtot += pixel->b * f; +} + +static force_inline void reduce_float(unsigned int satot, unsigned int srtot, +				      unsigned int sgtot, unsigned int sbtot, +				      void *p) +{ +    argb_t *ret = p; + +    ret->a = CLIP (satot / 65536.f, 0.f, 1.f); +    ret->r = CLIP (srtot / 65536.f, 0.f, 1.f); +    ret->g = CLIP (sgtot / 65536.f, 0.f, 1.f); +    ret->b = CLIP (sbtot / 65536.f, 0.f, 1.f); +} + +typedef void (* accumulate_pixel_t) (unsigned int *satot, unsigned int *srtot, +				     unsigned int *sgtot, unsigned int *sbtot, +				     const void *pixel, pixman_fixed_t f); + +typedef void (* reduce_pixel_t) (unsigned int satot, unsigned int srtot, +				 unsigned int sgtot, unsigned int sbtot, +                                 void *out); + +static force_inline void +bits_image_fetch_pixel_convolution (bits_image_t   *image, +				    pixman_fixed_t  x, +				    pixman_fixed_t  y, +				    get_pixel_t     get_pixel, +				    void	      
*out, +				    accumulate_pixel_t accum, +				    reduce_pixel_t reduce) +{ +    pixman_fixed_t *params = image->common.filter_params; +    int x_off = (params[0] - pixman_fixed_1) >> 1; +    int y_off = (params[1] - pixman_fixed_1) >> 1; +    int32_t cwidth = pixman_fixed_to_int (params[0]); +    int32_t cheight = pixman_fixed_to_int (params[1]); +    int32_t i, j, x1, x2, y1, y2; +    pixman_repeat_t repeat_mode = image->common.repeat; +    int width = image->width; +    int height = image->height; +    unsigned int srtot, sgtot, sbtot, satot; + +    params += 2; + +    x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); +    y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); +    x2 = x1 + cwidth; +    y2 = y1 + cheight; + +    srtot = sgtot = sbtot = satot = 0; + +    for (i = y1; i < y2; ++i) +    { +	for (j = x1; j < x2; ++j) +	{ +	    int rx = j; +	    int ry = i; + +	    pixman_fixed_t f = *params; + +	    if (f) +	    { +		/* Must be big enough to hold a argb_t */ +		argb_t pixel; + +		if (repeat_mode != PIXMAN_REPEAT_NONE) +		{ +		    repeat (repeat_mode, &rx, width); +		    repeat (repeat_mode, &ry, height); + +		    get_pixel (image, rx, ry, FALSE, &pixel); +		} +		else +		{ +		    get_pixel (image, rx, ry, TRUE, &pixel); +		} + +		accum (&satot, &srtot, &sgtot, &sbtot, &pixel, f); +	    } + +	    params++; +	} +    } + +    reduce (satot, srtot, sgtot, sbtot, out); +} + +static void +bits_image_fetch_pixel_separable_convolution (bits_image_t  *image, +					      pixman_fixed_t x, +					      pixman_fixed_t y, +					      get_pixel_t    get_pixel, +					      void	    *out, +					      accumulate_pixel_t accum, +					      reduce_pixel_t     reduce) +{ +    pixman_fixed_t *params = image->common.filter_params; +    pixman_repeat_t repeat_mode = image->common.repeat; +    int width = image->width; +    int height = image->height; +    int cwidth = pixman_fixed_to_int (params[0]); +    int cheight = pixman_fixed_to_int (params[1]); +    int x_phase_bits = pixman_fixed_to_int (params[2]); +    int y_phase_bits = pixman_fixed_to_int (params[3]); +    int x_phase_shift = 16 - x_phase_bits; +    int y_phase_shift = 16 - y_phase_bits; +    int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1; +    int y_off = ((cheight << 16) - pixman_fixed_1) >> 1; +    pixman_fixed_t *y_params; +    unsigned int srtot, sgtot, sbtot, satot; +    int32_t x1, x2, y1, y2; +    int32_t px, py; +    int i, j; + +    /* Round x and y to the middle of the closest phase before continuing. This +     * ensures that the convolution matrix is aligned right, since it was +     * positioned relative to a particular phase (and not relative to whatever +     * exact fraction we happen to get here). 
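+     *
+     * Worked example (values chosen here only for illustration): with
+     * x_phase_bits = 4, x_phase_shift is 12, so an incoming x = 0x1234A
+     * snaps to ((0x1234A >> 12) << 12) + (1 << 11) = 0x12800, the centre
+     * of its 1/16-of-a-pixel phase.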
+     */ +    x = ((x >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1); +    y = ((y >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1); + +    px = (x & 0xffff) >> x_phase_shift; +    py = (y & 0xffff) >> y_phase_shift; + +    y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight; + +    x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); +    y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); +    x2 = x1 + cwidth; +    y2 = y1 + cheight; + +    srtot = sgtot = sbtot = satot = 0; + +    for (i = y1; i < y2; ++i) +    { +        pixman_fixed_48_16_t fy = *y_params++; +        pixman_fixed_t *x_params = params + 4 + px * cwidth; + +        if (fy) +        { +            for (j = x1; j < x2; ++j) +            { +                pixman_fixed_t fx = *x_params++; +		int rx = j; +		int ry = i; + +                if (fx) +                { +                    /* Must be big enough to hold a argb_t */ +                    argb_t pixel; +                    pixman_fixed_t f; + +                    if (repeat_mode != PIXMAN_REPEAT_NONE) +                    { +                        repeat (repeat_mode, &rx, width); +                        repeat (repeat_mode, &ry, height); + +                        get_pixel (image, rx, ry, FALSE, &pixel); +                    } +                    else +                    { +                        get_pixel (image, rx, ry, TRUE, &pixel); +		    } + +                    f = (fy * fx + 0x8000) >> 16; + +		    accum(&satot, &srtot, &sgtot, &sbtot, &pixel, f); +                } +            } +	} +    } + + +    reduce(satot, srtot, sgtot, sbtot, out); +} + +static force_inline void +bits_image_fetch_pixel_filtered (bits_image_t  *image, +				 pixman_bool_t  wide, +				 pixman_fixed_t x, +				 pixman_fixed_t y, +				 get_pixel_t    get_pixel, +				 void          *out) +{ +    switch (image->common.filter) +    { +    case PIXMAN_FILTER_NEAREST: +    case PIXMAN_FILTER_FAST: +	bits_image_fetch_pixel_nearest (image, x, y, get_pixel, out); +	break; + +    case PIXMAN_FILTER_BILINEAR: +    case PIXMAN_FILTER_GOOD: +    case PIXMAN_FILTER_BEST: +	if (wide) +	    bits_image_fetch_pixel_bilinear_float (image, x, y, get_pixel, out); +	else +	    bits_image_fetch_pixel_bilinear_32 (image, x, y, get_pixel, out); +	break; + +    case PIXMAN_FILTER_CONVOLUTION: +	if (wide) +	{ +	    bits_image_fetch_pixel_convolution (image, x, y, +						get_pixel, out, +						accum_float, +						reduce_float); +	} +	else +	{ +	    bits_image_fetch_pixel_convolution (image, x, y, +						get_pixel, out, +						accum_32, reduce_32); +	} +	break; + +    case PIXMAN_FILTER_SEPARABLE_CONVOLUTION: +	if (wide) +	{ +	    bits_image_fetch_pixel_separable_convolution (image, x, y, +							  get_pixel, out, +							  accum_float, +							  reduce_float); +	} +	else +	{ +	    bits_image_fetch_pixel_separable_convolution (image, x, y, +							  get_pixel, out, +							  accum_32, reduce_32); +	} +        break; + +    default: +	assert (0); +        break; +    } +} + +static uint32_t * +__bits_image_fetch_affine_no_alpha (pixman_iter_t *  iter, +				    pixman_bool_t    wide, +				    const uint32_t * mask) +{ +    pixman_image_t *image  = iter->image; +    int             offset = iter->x; +    int             line   = iter->y++; +    int             width  = iter->width; +    uint32_t *      buffer = iter->buffer; + +    pixman_fixed_t x, y; +    pixman_fixed_t ux, uy; +    pixman_vector_t v; +    int i; +    get_pixel_t get_pixel = +	wide ? 
fetch_pixel_no_alpha_float : fetch_pixel_no_alpha_32; + +    /* reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (image->common.transform) +    { +	if (!pixman_transform_point_3d (image->common.transform, &v)) +	    return iter->buffer; + +	ux = image->common.transform->matrix[0][0]; +	uy = image->common.transform->matrix[1][0]; +    } +    else +    { +	ux = pixman_fixed_1; +	uy = 0; +    } + +    x = v.vector[0]; +    y = v.vector[1]; + +    for (i = 0; i < width; ++i) +    { +	if (!mask || mask[i]) +	{ +	    bits_image_fetch_pixel_filtered ( +		&image->bits, wide, x, y, get_pixel, buffer); +	} + +	x += ux; +	y += uy; +	buffer += wide ? 4 : 1; +    } + +    return iter->buffer; +} + +static uint32_t * +bits_image_fetch_affine_no_alpha_32 (pixman_iter_t  *iter, +				     const uint32_t *mask) +{ +    return __bits_image_fetch_affine_no_alpha(iter, FALSE, mask); +} + +static uint32_t * +bits_image_fetch_affine_no_alpha_float (pixman_iter_t  *iter, +					const uint32_t *mask) +{ +    return __bits_image_fetch_affine_no_alpha(iter, TRUE, mask); +} + +/* General fetcher */ +static force_inline void +fetch_pixel_general_32 (bits_image_t *image, +			int x, int y, pixman_bool_t check_bounds, +			void *out) +{ +    uint32_t pixel, *ret = out; + +    if (check_bounds && +	(x < 0 || x >= image->width || y < 0 || y >= image->height)) +    { +	*ret = 0; +	return; +    } + +    pixel = image->fetch_pixel_32 (image, x, y); + +    if (image->common.alpha_map) +    { +	uint32_t pixel_a; + +	x -= image->common.alpha_origin_x; +	y -= image->common.alpha_origin_y; + +	if (x < 0 || x >= image->common.alpha_map->width || +	    y < 0 || y >= image->common.alpha_map->height) +	{ +	    pixel_a = 0; +	} +	else +	{ +	    pixel_a = image->common.alpha_map->fetch_pixel_32 ( +		image->common.alpha_map, x, y); + +	    pixel_a = ALPHA_8 (pixel_a); +	} + +	pixel &= 0x00ffffff; +	pixel |= (pixel_a << 24); +    } + +    *ret = pixel; +} + +static force_inline void +fetch_pixel_general_float (bits_image_t *image, +			int x, int y, pixman_bool_t check_bounds, +			void *out) +{ +    argb_t *ret = out; + +    if (check_bounds && +	(x < 0 || x >= image->width || y < 0 || y >= image->height)) +    { +	ret->a = ret->r = ret->g = ret->b = 0; +	return; +    } + +    *ret = image->fetch_pixel_float (image, x, y); + +    if (image->common.alpha_map) +    { +	x -= image->common.alpha_origin_x; +	y -= image->common.alpha_origin_y; + +	if (x < 0 || x >= image->common.alpha_map->width || +	    y < 0 || y >= image->common.alpha_map->height) +	{ +	    ret->a = 0.f; +	} +	else +	{ +	    argb_t alpha; + +	    alpha = image->common.alpha_map->fetch_pixel_float ( +		    image->common.alpha_map, x, y); + +	    ret->a = alpha.a; +	} +    } +} + +static uint32_t * +__bits_image_fetch_general (pixman_iter_t  *iter, +			    pixman_bool_t wide, +			    const uint32_t *mask) +{ +    pixman_image_t *image  = iter->image; +    int             offset = iter->x; +    int             line   = iter->y++; +    int             width  = iter->width; +    uint32_t *      buffer = iter->buffer; +    get_pixel_t     get_pixel = +	wide ? 
fetch_pixel_general_float : fetch_pixel_general_32; + +    pixman_fixed_t x, y, w; +    pixman_fixed_t ux, uy, uw; +    pixman_vector_t v; +    int i; + +    /* reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (image->common.transform) +    { +	if (!pixman_transform_point_3d (image->common.transform, &v)) +	    return buffer; + +	ux = image->common.transform->matrix[0][0]; +	uy = image->common.transform->matrix[1][0]; +	uw = image->common.transform->matrix[2][0]; +    } +    else +    { +	ux = pixman_fixed_1; +	uy = 0; +	uw = 0; +    } + +    x = v.vector[0]; +    y = v.vector[1]; +    w = v.vector[2]; + +    for (i = 0; i < width; ++i) +    { +	pixman_fixed_t x0, y0; + +	if (!mask || mask[i]) +	{ +	    if (w != 0) +	    { +		x0 = ((uint64_t)x << 16) / w; +		y0 = ((uint64_t)y << 16) / w; +	    } +	    else +	    { +		x0 = 0; +		y0 = 0; +	    } + +	    bits_image_fetch_pixel_filtered ( +		&image->bits, wide, x0, y0, get_pixel, buffer); +	} + +	x += ux; +	y += uy; +	w += uw; +	buffer += wide ? 4 : 1; +    } + +    return iter->buffer; +} + +static uint32_t * +bits_image_fetch_general_32 (pixman_iter_t  *iter, +			     const uint32_t *mask) +{ +    return __bits_image_fetch_general(iter, FALSE, mask); +} + +static uint32_t * +bits_image_fetch_general_float (pixman_iter_t  *iter, +				const uint32_t *mask) +{ +    return __bits_image_fetch_general(iter, TRUE, mask); +} + +static void +replicate_pixel_32 (bits_image_t *   bits, +		    int              x, +		    int              y, +		    int              width, +		    uint32_t *       buffer) +{ +    uint32_t color; +    uint32_t *end; + +    color = bits->fetch_pixel_32 (bits, x, y); + +    end = buffer + width; +    while (buffer < end) +	*(buffer++) = color; +} + +static void +replicate_pixel_float (bits_image_t *   bits, +		       int              x, +		       int              y, +		       int              width, +		       uint32_t *       b) +{ +    argb_t color; +    argb_t *buffer = (argb_t *)b; +    argb_t *end; + +    color = bits->fetch_pixel_float (bits, x, y); + +    end = buffer + width; +    while (buffer < end) +	*(buffer++) = color; +} + +static void +bits_image_fetch_untransformed_repeat_none (bits_image_t *image, +                                            pixman_bool_t wide, +                                            int           x, +                                            int           y, +                                            int           width, +                                            uint32_t *    buffer) +{ +    uint32_t w; + +    if (y < 0 || y >= image->height) +    { +	memset (buffer, 0, width * (wide? sizeof (argb_t) : 4)); +	return; +    } + +    if (x < 0) +    { +	w = MIN (width, -x); + +	memset (buffer, 0, w * (wide ? sizeof (argb_t) : 4)); + +	width -= w; +	buffer += w * (wide? 4 : 1); +	x += w; +    } + +    if (x < image->width) +    { +	w = MIN (width, image->width - x); + +	if (wide) +	    image->fetch_scanline_float (image, x, y, w, buffer, NULL); +	else +	    image->fetch_scanline_32 (image, x, y, w, buffer, NULL); + +	width -= w; +	buffer += w * (wide? 4 : 1); +	x += w; +    } + +    memset (buffer, 0, width * (wide ? 
sizeof (argb_t) : 4)); +} + +static void +bits_image_fetch_untransformed_repeat_normal (bits_image_t *image, +                                              pixman_bool_t wide, +                                              int           x, +                                              int           y, +                                              int           width, +                                              uint32_t *    buffer) +{ +    uint32_t w; + +    while (y < 0) +	y += image->height; + +    while (y >= image->height) +	y -= image->height; + +    if (image->width == 1) +    { +	if (wide) +	    replicate_pixel_float (image, 0, y, width, buffer); +	else +	    replicate_pixel_32 (image, 0, y, width, buffer); + +	return; +    } + +    while (width) +    { +	while (x < 0) +	    x += image->width; +	while (x >= image->width) +	    x -= image->width; + +	w = MIN (width, image->width - x); + +	if (wide) +	    image->fetch_scanline_float (image, x, y, w, buffer, NULL); +	else +	    image->fetch_scanline_32 (image, x, y, w, buffer, NULL); + +	buffer += w * (wide? 4 : 1); +	x += w; +	width -= w; +    } +} + +static uint32_t * +bits_image_fetch_untransformed_32 (pixman_iter_t * iter, +				   const uint32_t *mask) +{ +    pixman_image_t *image  = iter->image; +    int             x      = iter->x; +    int             y      = iter->y; +    int             width  = iter->width; +    uint32_t *      buffer = iter->buffer; + +    if (image->common.repeat == PIXMAN_REPEAT_NONE) +    { +	bits_image_fetch_untransformed_repeat_none ( +	    &image->bits, FALSE, x, y, width, buffer); +    } +    else +    { +	bits_image_fetch_untransformed_repeat_normal ( +	    &image->bits, FALSE, x, y, width, buffer); +    } + +    iter->y++; +    return buffer; +} + +static uint32_t * +bits_image_fetch_untransformed_float (pixman_iter_t * iter, +				      const uint32_t *mask) +{ +    pixman_image_t *image  = iter->image; +    int             x      = iter->x; +    int             y      = iter->y; +    int             width  = iter->width; +    uint32_t *      buffer = iter->buffer; + +    if (image->common.repeat == PIXMAN_REPEAT_NONE) +    { +	bits_image_fetch_untransformed_repeat_none ( +	    &image->bits, TRUE, x, y, width, buffer); +    } +    else +    { +	bits_image_fetch_untransformed_repeat_normal ( +	    &image->bits, TRUE, x, y, width, buffer); +    } + +    iter->y++; +    return buffer; +} + +typedef struct +{ +    pixman_format_code_t	format; +    uint32_t			flags; +    pixman_iter_get_scanline_t	get_scanline_32; +    pixman_iter_get_scanline_t  get_scanline_float; +} fetcher_info_t; + +static const fetcher_info_t fetcher_info[] = +{ +    { PIXMAN_any, +      (FAST_PATH_NO_ALPHA_MAP			| +       FAST_PATH_ID_TRANSFORM			| +       FAST_PATH_NO_CONVOLUTION_FILTER		| +       FAST_PATH_NO_PAD_REPEAT			| +       FAST_PATH_NO_REFLECT_REPEAT), +      bits_image_fetch_untransformed_32, +      bits_image_fetch_untransformed_float +    }, + +    /* Affine, no alpha */ +    { PIXMAN_any, +      (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM), +      bits_image_fetch_affine_no_alpha_32, +      bits_image_fetch_affine_no_alpha_float, +    }, + +    /* General */ +    { PIXMAN_any, +      0, +      bits_image_fetch_general_32, +      bits_image_fetch_general_float, +    }, + +    { PIXMAN_null }, +}; + +static void +bits_image_property_changed (pixman_image_t *image) +{ +    _pixman_bits_image_setup_accessors (&image->bits); +} + +void +_pixman_bits_image_src_iter_init 
(pixman_image_t *image, pixman_iter_t *iter) +{ +    pixman_format_code_t format = image->common.extended_format_code; +    uint32_t flags = image->common.flags; +    const fetcher_info_t *info; + +    for (info = fetcher_info; info->format != PIXMAN_null; ++info) +    { +	if ((info->format == format || info->format == PIXMAN_any)	&& +	    (info->flags & flags) == info->flags) +	{ +	    if (iter->iter_flags & ITER_NARROW) +	    { +		iter->get_scanline = info->get_scanline_32; +	    } +	    else +	    { +		iter->get_scanline = info->get_scanline_float; +	    } +	    return; +	} +    } + +    /* Just in case we somehow didn't find a scanline function */ +    iter->get_scanline = _pixman_iter_get_scanline_noop; +} + +static uint32_t * +dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) +{ +    pixman_image_t *image  = iter->image; +    int             x      = iter->x; +    int             y      = iter->y; +    int             width  = iter->width; +    uint32_t *	    buffer = iter->buffer; + +    image->bits.fetch_scanline_32 (&image->bits, x, y, width, buffer, mask); +    if (image->common.alpha_map) +    { +	uint32_t *alpha; + +	if ((alpha = malloc (width * sizeof (uint32_t)))) +	{ +	    int i; + +	    x -= image->common.alpha_origin_x; +	    y -= image->common.alpha_origin_y; + +	    image->common.alpha_map->fetch_scanline_32 ( +		image->common.alpha_map, x, y, width, alpha, mask); + +	    for (i = 0; i < width; ++i) +	    { +		buffer[i] &= ~0xff000000; +		buffer[i] |= (alpha[i] & 0xff000000); +	    } + +	    free (alpha); +	} +    } + +    return iter->buffer; +} + +static uint32_t * +dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) +{ +    bits_image_t *  image  = &iter->image->bits; +    int             x      = iter->x; +    int             y      = iter->y; +    int             width  = iter->width; +    argb_t *	    buffer = (argb_t *)iter->buffer; + +    image->fetch_scanline_float ( +	image, x, y, width, (uint32_t *)buffer, mask); +    if (image->common.alpha_map) +    { +	argb_t *alpha; + +	if ((alpha = malloc (width * sizeof (argb_t)))) +	{ +	    int i; + +	    x -= image->common.alpha_origin_x; +	    y -= image->common.alpha_origin_y; + +	    image->common.alpha_map->fetch_scanline_float ( +		image->common.alpha_map, x, y, width, (uint32_t *)alpha, mask); + +	    for (i = 0; i < width; ++i) +		buffer[i].a = alpha[i].a; + +	    free (alpha); +	} +    } + +    return iter->buffer; +} + +static void +dest_write_back_narrow (pixman_iter_t *iter) +{ +    bits_image_t *  image  = &iter->image->bits; +    int             x      = iter->x; +    int             y      = iter->y; +    int             width  = iter->width; +    const uint32_t *buffer = iter->buffer; + +    image->store_scanline_32 (image, x, y, width, buffer); + +    if (image->common.alpha_map) +    { +	x -= image->common.alpha_origin_x; +	y -= image->common.alpha_origin_y; + +	image->common.alpha_map->store_scanline_32 ( +	    image->common.alpha_map, x, y, width, buffer); +    } + +    iter->y++; +} + +static const float +dither_factor_blue_noise_64 (int x, int y) +{ +    float m = dither_blue_noise_64x64[((y & 0x3f) << 6) | (x & 0x3f)]; +    return m * (1. / 4096.f) + (1. / 8192.f); +} + +static const float +dither_factor_bayer_8 (int x, int y) +{ +    uint32_t m; + +    y ^= x; + +    /* Compute reverse(interleave(xor(x mod n, y mod n), x mod n)) +     * Here n = 8 and `mod n` is the bottom 3 bits. 
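+     *
+     * Worked example (illustrative values): for x = 3, y = 5, the xor above
+     * leaves y = 6; the bit gathering below then gives m = 0b011110 = 30,
+     * so the returned factor is 30 * (1 / 64.0f) + 1.0f / 128.0f = 0.4765625.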
+     */ +    m = ((y & 0x1) << 5) | ((x & 0x1) << 4) | +	((y & 0x2) << 2) | ((x & 0x2) << 1) | +	((y & 0x4) >> 1) | ((x & 0x4) >> 2); + +    /* m is in range [0, 63].  We scale it to [0, 63.0f/64.0f], then +     * shift it to to [1.0f/128.0f, 127.0f/128.0f] so that 0 < d < 1. +     * This ensures exact values are not changed by dithering. +     */ +    return (float)(m) * (1 / 64.0f) + (1.0f / 128.0f); +} + +typedef float (* dither_factor_t)(int x, int y); + +static force_inline float +dither_apply_channel (float f, float d, float s) +{ +    /* float_to_unorm splits the [0, 1] segment in (1 << n_bits) +     * subsections of equal length; however unorm_to_float does not +     * map to the center of those sections.  In fact, pixel value u is +     * mapped to: +     * +     *       u              u              u               1 +     * -------------- = ---------- + -------------- * ---------- +     *  2^n_bits - 1     2^n_bits     2^n_bits - 1     2^n_bits +     * +     * Hence if f = u / (2^n_bits - 1) is exactly representable on a +     * n_bits palette, all the numbers between +     * +     *     u +     * ----------  =  f - f * 2^n_bits = f + (0 - f) * 2^n_bits +     *  2^n_bits +     * +     *  and +     * +     *    u + 1 +     * ---------- = f - (f - 1) * 2^n_bits = f + (1 - f) * 2^n_bits +     *  2^n_bits +     * +     * are also mapped back to u. +     * +     * Hence the following calculation ensures that we add as much +     * noise as possible without perturbing values which are exactly +     * representable in the target colorspace.  Note that this corresponds to +     * mixing the original color with noise with a ratio of `1 / 2^n_bits`. +     */ +    return f + (d - f) * s; +} + +static force_inline float +dither_compute_scale (int n_bits) +{ +    // No dithering for wide formats +    if (n_bits == 0 || n_bits >= 32) +	return 0.f; + +    return 1.f / (float)(1 << n_bits); +} + +static const uint32_t * +dither_apply_ordered (pixman_iter_t *iter, dither_factor_t factor) +{ +    bits_image_t        *image  = &iter->image->bits; +    int                  x      = iter->x + image->dither_offset_x; +    int                  y      = iter->y + image->dither_offset_y; +    int                  width  = iter->width; +    argb_t              *buffer = (argb_t *)iter->buffer; + +    pixman_format_code_t format = image->format; +    int                  a_size = PIXMAN_FORMAT_A (format); +    int                  r_size = PIXMAN_FORMAT_R (format); +    int                  g_size = PIXMAN_FORMAT_G (format); +    int                  b_size = PIXMAN_FORMAT_B (format); + +    float a_scale = dither_compute_scale (a_size); +    float r_scale = dither_compute_scale (r_size); +    float g_scale = dither_compute_scale (g_size); +    float b_scale = dither_compute_scale (b_size); + +    int   i; +    float d; + +    for (i = 0; i < width; ++i) +    { +	d = factor (x + i, y); + +	buffer->a = dither_apply_channel (buffer->a, d, a_scale); +	buffer->r = dither_apply_channel (buffer->r, d, r_scale); +	buffer->g = dither_apply_channel (buffer->g, d, g_scale); +	buffer->b = dither_apply_channel (buffer->b, d, b_scale); + +	buffer++; +    } + +    return iter->buffer; +} + +static void +dest_write_back_wide (pixman_iter_t *iter) +{ +    bits_image_t *  image  = &iter->image->bits; +    int             x      = iter->x; +    int             y      = iter->y; +    int             width  = iter->width; +    const uint32_t *buffer = iter->buffer; + +    switch (image->dither) +    { +    case 
PIXMAN_DITHER_NONE: +	break; + +    case PIXMAN_DITHER_GOOD: +    case PIXMAN_DITHER_BEST: +    case PIXMAN_DITHER_ORDERED_BLUE_NOISE_64: +	buffer = dither_apply_ordered (iter, dither_factor_blue_noise_64); +	break; + +    case PIXMAN_DITHER_FAST: +    case PIXMAN_DITHER_ORDERED_BAYER_8: +	buffer = dither_apply_ordered (iter, dither_factor_bayer_8); +	break; +    } + +    image->store_scanline_float (image, x, y, width, buffer); + +    if (image->common.alpha_map) +    { +	x -= image->common.alpha_origin_x; +	y -= image->common.alpha_origin_y; + +	image->common.alpha_map->store_scanline_float ( +	    image->common.alpha_map, x, y, width, buffer); +    } + +    iter->y++; +} + +void +_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ +    if (iter->iter_flags & ITER_NARROW) +    { +	if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) == +	    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) +	{ +	    iter->get_scanline = _pixman_iter_get_scanline_noop; +	} +	else +	{ +	    iter->get_scanline = dest_get_scanline_narrow; +	} +	 +	iter->write_back = dest_write_back_narrow; +    } +    else +    { +	iter->get_scanline = dest_get_scanline_wide; +	iter->write_back = dest_write_back_wide; +    } +} + +static uint32_t * +create_bits (pixman_format_code_t format, +             int                  width, +             int                  height, +             int *		  rowstride_bytes, +	     pixman_bool_t	  clear) +{ +    int stride; +    size_t buf_size; +    int bpp; + +    /* what follows is a long-winded way, avoiding any possibility of integer +     * overflows, of saying: +     * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t); +     */ + +    bpp = PIXMAN_FORMAT_BPP (format); +    if (_pixman_multiply_overflows_int (width, bpp)) +	return NULL; + +    stride = width * bpp; +    if (_pixman_addition_overflows_int (stride, 0x1f)) +	return NULL; + +    stride += 0x1f; +    stride >>= 5; + +    stride *= sizeof (uint32_t); + +    if (_pixman_multiply_overflows_size (height, stride)) +	return NULL; + +    buf_size = (size_t)height * stride; + +    if (rowstride_bytes) +	*rowstride_bytes = stride; + +    if (clear) +	return calloc (buf_size, 1); +    else +	return malloc (buf_size); +} + +pixman_bool_t +_pixman_bits_image_init (pixman_image_t *     image, +                         pixman_format_code_t format, +                         int                  width, +                         int                  height, +                         uint32_t *           bits, +                         int                  rowstride, +			 pixman_bool_t	      clear) +{ +    uint32_t *free_me = NULL; + +    if (PIXMAN_FORMAT_BPP (format) == 128) +	return_val_if_fail(!(rowstride % 4), FALSE); + +    if (!bits && width && height) +    { +	int rowstride_bytes; + +	free_me = bits = create_bits (format, width, height, &rowstride_bytes, clear); + +	if (!bits) +	    return FALSE; + +	rowstride = rowstride_bytes / (int) sizeof (uint32_t); +    } + +    _pixman_image_init (image); + +    image->type = BITS; +    image->bits.format = format; +    image->bits.width = width; +    image->bits.height = height; +    image->bits.bits = bits; +    image->bits.free_me = free_me; +    image->bits.dither = PIXMAN_DITHER_NONE; +    image->bits.dither_offset_x = 0; +    image->bits.dither_offset_y = 0; +    image->bits.read_func = NULL; +    image->bits.write_func = NULL; +    image->bits.rowstride = rowstride; +    image->bits.indexed = NULL; + +    image->common.property_changed = 
bits_image_property_changed; + +    _pixman_image_reset_clip_region (image); + +    return TRUE; +} + +static pixman_image_t * +create_bits_image_internal (pixman_format_code_t format, +			    int                  width, +			    int                  height, +			    uint32_t *           bits, +			    int                  rowstride_bytes, +			    pixman_bool_t	 clear) +{ +    pixman_image_t *image; + +    /* must be a whole number of uint32_t's +     */ +    return_val_if_fail ( +	bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL); + +    return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL); + +    image = _pixman_image_allocate (); + +    if (!image) +	return NULL; + +    if (!_pixman_bits_image_init (image, format, width, height, bits, +				  rowstride_bytes / (int) sizeof (uint32_t), +				  clear)) +    { +	free (image); +	return NULL; +    } + +    return image; +} + +/* If bits is NULL, a buffer will be allocated and initialized to 0 */ +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_bits (pixman_format_code_t format, +                          int                  width, +                          int                  height, +                          uint32_t *           bits, +                          int                  rowstride_bytes) +{ +    return create_bits_image_internal ( +	format, width, height, bits, rowstride_bytes, TRUE); +} + + +/* If bits is NULL, a buffer will be allocated and _not_ initialized */ +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_bits_no_clear (pixman_format_code_t format, +				   int                  width, +				   int                  height, +				   uint32_t *           bits, +				   int                  rowstride_bytes) +{ +    return create_bits_image_internal ( +	format, width, height, bits, rowstride_bytes, FALSE); +} diff --git a/libs/pixman-0.40.0/pixman/pixman-combine-float.c b/libs/pixman-0.40.0/pixman/pixman-combine-float.c new file mode 100644 index 0000000..f5145bc --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-combine-float.c @@ -0,0 +1,1158 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2010, 2012 Soren Sandmann Pedersen + * Copyright © 2010, 2012 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Author: Soren Sandmann Pedersen (sandmann@cs.au.dk) + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <math.h> +#include <string.h> +#include <float.h> + +#include "pixman-private.h" + +/* Workaround for http://gcc.gnu.org/PR54965 */ +/* GCC 4.6 has problems with force_inline, so just use normal inline instead */ +#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 6) +#undef force_inline +#define force_inline __inline__ +#endif + +typedef float (* combine_channel_t) (float sa, float s, float da, float d); + +static force_inline void +combine_inner (pixman_bool_t component, +	       float *dest, const float *src, const float *mask, int n_pixels, +	       combine_channel_t combine_a, combine_channel_t combine_c) +{ +    int i; + +    if (!mask) +    { +	for (i = 0; i < 4 * n_pixels; i += 4) +	{ +	    float sa = src[i + 0]; +	    float sr = src[i + 1]; +	    float sg = src[i + 2]; +	    float sb = src[i + 3]; +	     +	    float da = dest[i + 0]; +	    float dr = dest[i + 1]; +	    float dg = dest[i + 2]; +	    float db = dest[i + 3];					 +	     +	    dest[i + 0] = combine_a (sa, sa, da, da); +	    dest[i + 1] = combine_c (sa, sr, da, dr); +	    dest[i + 2] = combine_c (sa, sg, da, dg); +	    dest[i + 3] = combine_c (sa, sb, da, db); +	} +    } +    else +    { +	for (i = 0; i < 4 * n_pixels; i += 4) +	{ +	    float sa, sr, sg, sb; +	    float ma, mr, mg, mb; +	    float da, dr, dg, db; +	     +	    sa = src[i + 0]; +	    sr = src[i + 1]; +	    sg = src[i + 2]; +	    sb = src[i + 3]; +	     +	    if (component) +	    { +		ma = mask[i + 0]; +		mr = mask[i + 1]; +		mg = mask[i + 2]; +		mb = mask[i + 3]; + +		sr *= mr; +		sg *= mg; +		sb *= mb; + +		ma *= sa; +		mr *= sa; +		mg *= sa; +		mb *= sa; +		 +		sa = ma; +	    } +	    else +	    { +		ma = mask[i + 0]; + +		sa *= ma; +		sr *= ma; +		sg *= ma; +		sb *= ma; + +		ma = mr = mg = mb = sa; +	    } +	     +	    da = dest[i + 0]; +	    dr = dest[i + 1]; +	    dg = dest[i + 2]; +	    db = dest[i + 3]; +	     +	    dest[i + 0] = combine_a (ma, sa, da, da); +	    dest[i + 1] = combine_c (mr, sr, da, dr); +	    dest[i + 2] = combine_c (mg, sg, da, dg); +	    dest[i + 3] = combine_c (mb, sb, da, db); +	} +    } +} + +#define MAKE_COMBINER(name, component, combine_a, combine_c)		\ +    static void								\ +    combine_ ## name ## _float (pixman_implementation_t *imp,		\ +				pixman_op_t              op,		\ +				float                   *dest,		\ +				const float             *src,		\ +				const float             *mask,		\ +				int		         n_pixels)	\ +    {									\ +	combine_inner (component, dest, src, mask, n_pixels,		\ +		       combine_a, combine_c);				\ +    } + +#define MAKE_COMBINERS(name, combine_a, combine_c)			\ +    MAKE_COMBINER(name ## _ca, TRUE, combine_a, combine_c)		\ +    MAKE_COMBINER(name ## _u, FALSE, combine_a, combine_c) + + +/* + * Porter/Duff operators + */ +typedef enum +{ +    ZERO, +    ONE, +    SRC_ALPHA, +    DEST_ALPHA, +    INV_SA, +    INV_DA, +    SA_OVER_DA, +    DA_OVER_SA, +    INV_SA_OVER_DA, +    INV_DA_OVER_SA, +    ONE_MINUS_SA_OVER_DA, +    ONE_MINUS_DA_OVER_SA, +    ONE_MINUS_INV_DA_OVER_SA, +    ONE_MINUS_INV_SA_OVER_DA +} combine_factor_t; + +#define CLAMP(f)					\ +    (((f) < 0)? 0 : (((f) > 1.0) ? 
1.0 : (f))) + +static force_inline float +get_factor (combine_factor_t factor, float sa, float da) +{ +    float f = -1; + +    switch (factor) +    { +    case ZERO: +	f = 0.0f; +	break; + +    case ONE: +	f = 1.0f; +	break; + +    case SRC_ALPHA: +	f = sa; +	break; + +    case DEST_ALPHA: +	f = da; +	break; + +    case INV_SA: +	f = 1 - sa; +	break; + +    case INV_DA: +	f = 1 - da; +	break; + +    case SA_OVER_DA: +	if (FLOAT_IS_ZERO (da)) +	    f = 1.0f; +	else +	    f = CLAMP (sa / da); +	break; + +    case DA_OVER_SA: +	if (FLOAT_IS_ZERO (sa)) +	    f = 1.0f; +	else +	    f = CLAMP (da / sa); +	break; + +    case INV_SA_OVER_DA: +	if (FLOAT_IS_ZERO (da)) +	    f = 1.0f; +	else +	    f = CLAMP ((1.0f - sa) / da); +	break; + +    case INV_DA_OVER_SA: +	if (FLOAT_IS_ZERO (sa)) +	    f = 1.0f; +	else +	    f = CLAMP ((1.0f - da) / sa); +	break; + +    case ONE_MINUS_SA_OVER_DA: +	if (FLOAT_IS_ZERO (da)) +	    f = 0.0f; +	else +	    f = CLAMP (1.0f - sa / da); +	break; + +    case ONE_MINUS_DA_OVER_SA: +	if (FLOAT_IS_ZERO (sa)) +	    f = 0.0f; +	else +	    f = CLAMP (1.0f - da / sa); +	break; + +    case ONE_MINUS_INV_DA_OVER_SA: +	if (FLOAT_IS_ZERO (sa)) +	    f = 0.0f; +	else +	    f = CLAMP (1.0f - (1.0f - da) / sa); +	break; + +    case ONE_MINUS_INV_SA_OVER_DA: +	if (FLOAT_IS_ZERO (da)) +	    f = 0.0f; +	else +	    f = CLAMP (1.0f - (1.0f - sa) / da); +	break; +    } + +    return f; +} + +#define MAKE_PD_COMBINERS(name, a, b)					\ +    static float force_inline						\ +    pd_combine_ ## name (float sa, float s, float da, float d)		\ +    {									\ +	const float fa = get_factor (a, sa, da);			\ +	const float fb = get_factor (b, sa, da);			\ +									\ +	return MIN (1.0f, s * fa + d * fb);				\ +    }									\ +    									\ +    MAKE_COMBINERS(name, pd_combine_ ## name, pd_combine_ ## name) + +MAKE_PD_COMBINERS (clear,			ZERO,				ZERO) +MAKE_PD_COMBINERS (src,				ONE,				ZERO) +MAKE_PD_COMBINERS (dst,				ZERO,				ONE) +MAKE_PD_COMBINERS (over,			ONE,				INV_SA) +MAKE_PD_COMBINERS (over_reverse,		INV_DA,				ONE) +MAKE_PD_COMBINERS (in,				DEST_ALPHA,			ZERO) +MAKE_PD_COMBINERS (in_reverse,			ZERO,				SRC_ALPHA) +MAKE_PD_COMBINERS (out,				INV_DA,				ZERO) +MAKE_PD_COMBINERS (out_reverse,			ZERO,				INV_SA) +MAKE_PD_COMBINERS (atop,			DEST_ALPHA,			INV_SA) +MAKE_PD_COMBINERS (atop_reverse,		INV_DA,				SRC_ALPHA) +MAKE_PD_COMBINERS (xor,				INV_DA,				INV_SA) +MAKE_PD_COMBINERS (add,				ONE,				ONE) + +MAKE_PD_COMBINERS (saturate,			INV_DA_OVER_SA,			ONE) + +MAKE_PD_COMBINERS (disjoint_clear,		ZERO,				ZERO) +MAKE_PD_COMBINERS (disjoint_src,		ONE,				ZERO) +MAKE_PD_COMBINERS (disjoint_dst,		ZERO,				ONE) +MAKE_PD_COMBINERS (disjoint_over,		ONE,				INV_SA_OVER_DA) +MAKE_PD_COMBINERS (disjoint_over_reverse,	INV_DA_OVER_SA,			ONE) +MAKE_PD_COMBINERS (disjoint_in,			ONE_MINUS_INV_DA_OVER_SA,	ZERO) +MAKE_PD_COMBINERS (disjoint_in_reverse,		ZERO,				ONE_MINUS_INV_SA_OVER_DA) +MAKE_PD_COMBINERS (disjoint_out,		INV_DA_OVER_SA,			ZERO) +MAKE_PD_COMBINERS (disjoint_out_reverse,	ZERO,				INV_SA_OVER_DA) +MAKE_PD_COMBINERS (disjoint_atop,		ONE_MINUS_INV_DA_OVER_SA,	INV_SA_OVER_DA) +MAKE_PD_COMBINERS (disjoint_atop_reverse,	INV_DA_OVER_SA,			ONE_MINUS_INV_SA_OVER_DA) +MAKE_PD_COMBINERS (disjoint_xor,		INV_DA_OVER_SA,			INV_SA_OVER_DA) + +MAKE_PD_COMBINERS (conjoint_clear,		ZERO,				ZERO) +MAKE_PD_COMBINERS (conjoint_src,		ONE,				ZERO) +MAKE_PD_COMBINERS (conjoint_dst,		ZERO,				ONE) +MAKE_PD_COMBINERS (conjoint_over,		ONE,				ONE_MINUS_SA_OVER_DA) +MAKE_PD_COMBINERS (conjoint_over_reverse,	
ONE_MINUS_DA_OVER_SA,		ONE) +MAKE_PD_COMBINERS (conjoint_in,			DA_OVER_SA,			ZERO) +MAKE_PD_COMBINERS (conjoint_in_reverse,		ZERO,				SA_OVER_DA) +MAKE_PD_COMBINERS (conjoint_out,		ONE_MINUS_DA_OVER_SA,		ZERO) +MAKE_PD_COMBINERS (conjoint_out_reverse,	ZERO,				ONE_MINUS_SA_OVER_DA) +MAKE_PD_COMBINERS (conjoint_atop,		DA_OVER_SA,			ONE_MINUS_SA_OVER_DA) +MAKE_PD_COMBINERS (conjoint_atop_reverse,	ONE_MINUS_DA_OVER_SA,		SA_OVER_DA) +MAKE_PD_COMBINERS (conjoint_xor,		ONE_MINUS_DA_OVER_SA,		ONE_MINUS_SA_OVER_DA) + +/* + * PDF blend modes: + * + * The following blend modes have been taken from the PDF ISO 32000 + * specification, which at this point in time is available from + * + *     http://www.adobe.com/devnet/pdf/pdf_reference.html + * + * The specific documents of interest are the PDF spec itself: + * + *     http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf + * + * chapters 11.3.5 and 11.3.6 and a later supplement for Adobe Acrobat + * 9.1 and Reader 9.1: + * + *     http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000_1.pdf + * + * that clarifies the specifications for blend modes ColorDodge and + * ColorBurn. + * + * The formula for computing the final pixel color given in 11.3.6 is: + * + *     αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs) + * + * with B() is the blend function. When B(Cb, Cs) = Cs, this formula + * reduces to the regular OVER operator. + * + * Cs and Cb are not premultiplied, so in our implementation we instead + * use: + * + *     cr = (1 – αs) × cb  +  (1 – αb) × cs  +  αb × αs × B (cb/αb, cs/αs) + * + * where cr, cs, and cb are premultiplied colors, and where the + * + *     αb × αs × B(cb/αb, cs/αs) + * + * part is first arithmetically simplified under the assumption that αb + * and αs are not 0, and then updated to produce a meaningful result when + * they are. 
+ * + * For all the blend mode operators, the alpha channel is given by + * + *     αr = αs + αb + αb × αs + */ + +#define MAKE_SEPARABLE_PDF_COMBINERS(name)				\ +    static force_inline float						\ +    combine_ ## name ## _a (float sa, float s, float da, float d)	\ +    {									\ +	return da + sa - da * sa;					\ +    }									\ +    									\ +    static force_inline float						\ +    combine_ ## name ## _c (float sa, float s, float da, float d)	\ +    {									\ +	float f = (1 - sa) * d + (1 - da) * s;				\ +									\ +	return f + blend_ ## name (sa, s, da, d);			\ +    }									\ +    									\ +    MAKE_COMBINERS (name, combine_ ## name ## _a, combine_ ## name ## _c) + +/* + * Multiply + * + *      ad * as * B(d / ad, s / as) + *    = ad * as * d/ad * s/as + *    = d * s + * + */ +static force_inline float +blend_multiply (float sa, float s, float da, float d) +{ +    return d * s; +} + +/* + * Screen + * + *      ad * as * B(d/ad, s/as) + *    = ad * as * (d/ad + s/as - s/as * d/ad) + *    = ad * s + as * d - s * d + */ +static force_inline float +blend_screen (float sa, float s, float da, float d) +{ +    return d * sa + s * da - s * d; +} + +/* + * Overlay + * + *     ad * as * B(d/ad, s/as) + *   = ad * as * Hardlight (s, d) + *   = if (d / ad < 0.5) + *         as * ad * Multiply (s/as, 2 * d/ad) + *     else + *         as * ad * Screen (s/as, 2 * d / ad - 1) + *   = if (d < 0.5 * ad) + *         as * ad * s/as * 2 * d /ad + *     else + *         as * ad * (s/as + 2 * d / ad - 1 - s / as * (2 * d / ad - 1)) + *   = if (2 * d < ad) + *         2 * s * d + *     else + *         ad * s + 2 * as * d - as * ad - ad * s * (2 * d / ad - 1) + *   = if (2 * d < ad) + *         2 * s * d + *     else + *         as * ad - 2 * (ad - d) * (as - s) + */ +static force_inline float +blend_overlay (float sa, float s, float da, float d) +{ +    if (2 * d < da) +	return 2 * s * d; +    else +	return sa * da - 2 * (da - d) * (sa - s); +} + +/* + * Darken + * + *     ad * as * B(d/ad, s/as) + *   = ad * as * MIN(d/ad, s/as) + *   = MIN (as * d, ad * s) + */ +static force_inline float +blend_darken (float sa, float s, float da, float d) +{ +    s = s * da; +    d = d * sa; + +    if (s > d) +	return d; +    else +	return s; +} + +/* + * Lighten + * + *     ad * as * B(d/ad, s/as) + *   = ad * as * MAX(d/ad, s/as) + *   = MAX (as * d, ad * s) + */ +static force_inline float +blend_lighten (float sa, float s, float da, float d) +{ +    s = s * da; +    d = d * sa; + +    if (s > d) +	return s; +    else +	return d; +} + +/* + * Color dodge + * + *     ad * as * B(d/ad, s/as) + *   = if d/ad = 0 + *         ad * as * 0 + *     else if (d/ad >= (1 - s/as) + *         ad * as * 1 + *     else + *         ad * as * ((d/ad) / (1 - s/as)) + *   = if d = 0 + *         0 + *     elif as * d >= ad * (as - s) + *         ad * as + *     else + *         as * (as * d / (as - s)) + * + */ +static force_inline float +blend_color_dodge (float sa, float s, float da, float d) +{ +    if (FLOAT_IS_ZERO (d)) +	return 0.0f; +    else if (d * sa >= sa * da - s * da) +	return sa * da; +    else if (FLOAT_IS_ZERO (sa - s)) +	return sa * da; +    else +	return sa * sa * d / (sa - s); +} + +/* + * Color burn + * + * We modify the first clause "if d = 1" to "if d >= 1" since with + * premultiplied colors d > 1 can actually happen. 
+ * + *     ad * as * B(d/ad, s/as) + *   = if d/ad >= 1 + *         ad * as * 1 + *     elif (1 - d/ad) >= s/as + *         ad * as * 0 + *     else + *         ad * as * (1 - ((1 - d/ad) / (s/as))) + *   = if d >= ad + *         ad * as + *     elif as * ad - as * d >= ad * s + *         0 + *     else + *         ad * as  - as * as * (ad - d) / s + */ +static force_inline float +blend_color_burn (float sa, float s, float da, float d) +{ +    if (d >= da) +	return sa * da; +    else if (sa * (da - d) >= s * da) +	return 0.0f; +    else if (FLOAT_IS_ZERO (s)) +	return 0.0f; +    else +	return sa * (da - sa * (da - d) / s); +} + +/* + * Hard light + * + *     ad * as * B(d/ad, s/as) + *   = if (s/as <= 0.5) + *         ad * as * Multiply (d/ad, 2 * s/as) + *     else + *         ad * as * Screen (d/ad, 2 * s/as - 1) + *   = if 2 * s <= as + *         ad * as * d/ad * 2 * s / as + *     else + *         ad * as * (d/ad + (2 * s/as - 1) + d/ad * (2 * s/as - 1)) + *   = if 2 * s <= as + *         2 * s * d + *     else + *         as * ad - 2 * (ad - d) * (as - s) + */ +static force_inline float +blend_hard_light (float sa, float s, float da, float d) +{ +    if (2 * s < sa) +	return 2 * s * d; +    else +	return sa * da - 2 * (da - d) * (sa - s); +} + +/* + * Soft light + * + *     ad * as * B(d/ad, s/as) + *   = if (s/as <= 0.5) + *         ad * as * (d/ad - (1 - 2 * s/as) * d/ad * (1 - d/ad)) + *     else if (d/ad <= 0.25) + *         ad * as * (d/ad + (2 * s/as - 1) * ((((16 * d/ad - 12) * d/ad + 4) * d/ad) - d/ad)) + *     else + *         ad * as * (d/ad + (2 * s/as - 1) * sqrt (d/ad)) + *   = if (2 * s <= as) + *         d * as - d * (ad - d) * (as - 2 * s) / ad; + *     else if (4 * d <= ad) + *         (2 * s - as) * d * ((16 * d / ad - 12) * d / ad + 3); + *     else + *         d * as + (sqrt (d * ad) - d) * (2 * s - as); + */ +static force_inline float +blend_soft_light (float sa, float s, float da, float d) +{ +    if (2 * s <= sa) +    { +	if (FLOAT_IS_ZERO (da)) +	    return d * sa; +	else +	    return d * sa - d * (da - d) * (sa - 2 * s) / da; +    } +    else +    { +	if (FLOAT_IS_ZERO (da)) +	{ +	    return d * sa; +	} +	else +	{ +	    if (4 * d <= da) +		return d * sa + (2 * s - sa) * d * ((16 * d / da - 12) * d / da + 3); +	    else +		return d * sa + (sqrtf (d * da) - d) * (2 * s - sa); +	} +    } +} + +/* + * Difference + * + *     ad * as * B(s/as, d/ad) + *   = ad * as * abs (s/as - d/ad) + *   = if (s/as <= d/ad) + *         ad * as * (d/ad - s/as) + *     else + *         ad * as * (s/as - d/ad) + *   = if (ad * s <= as * d) + *        as * d - ad * s + *     else + *        ad * s - as * d + */ +static force_inline float +blend_difference (float sa, float s, float da, float d) +{ +    float dsa = d * sa; +    float sda = s * da; + +    if (sda < dsa) +	return dsa - sda; +    else +	return sda - dsa; +} + +/* + * Exclusion + * + *     ad * as * B(s/as, d/ad) + *   = ad * as * (d/ad + s/as - 2 * d/ad * s/as) + *   = as * d + ad * s - 2 * s * d + */ +static force_inline float +blend_exclusion (float sa, float s, float da, float d) +{ +    return s * da + d * sa - 2 * d * s; +} + +MAKE_SEPARABLE_PDF_COMBINERS (multiply) +MAKE_SEPARABLE_PDF_COMBINERS (screen) +MAKE_SEPARABLE_PDF_COMBINERS (overlay) +MAKE_SEPARABLE_PDF_COMBINERS (darken) +MAKE_SEPARABLE_PDF_COMBINERS (lighten) +MAKE_SEPARABLE_PDF_COMBINERS (color_dodge) +MAKE_SEPARABLE_PDF_COMBINERS (color_burn) +MAKE_SEPARABLE_PDF_COMBINERS (hard_light) +MAKE_SEPARABLE_PDF_COMBINERS (soft_light) 
+MAKE_SEPARABLE_PDF_COMBINERS (difference) +MAKE_SEPARABLE_PDF_COMBINERS (exclusion) + +/* + * PDF nonseperable blend modes are implemented using the following functions + * to operate in Hsl space, with Cmax, Cmid, Cmin referring to the max, mid + * and min value of the red, green and blue components. + * + * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue + * + * clip_color (C): + *     l = LUM (C) + *     min = Cmin + *     max = Cmax + *     if n < 0.0 + *         C = l + (((C – l) × l) ⁄ (l – min)) + *     if x > 1.0 + *         C = l + (((C – l) × (1 – l) ) ⁄ (max – l)) + *     return C + * + * set_lum (C, l): + *     d = l – LUM (C) + *     C += d + *     return clip_color (C) + * + * SAT (C) = CH_MAX (C) - CH_MIN (C) + * + * set_sat (C, s): + *     if Cmax > Cmin + *         Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) ) + *         Cmax = s + *     else + *         Cmid = Cmax = 0.0 + *         Cmin = 0.0 + *     return C + */ + +/* For premultiplied colors, we need to know what happens when C is + * multiplied by a real number. LUM and SAT are linear: + * + *     LUM (r × C) = r × LUM (C)	SAT (r * C) = r * SAT (C) + * + * If we extend clip_color with an extra argument a and change + * + *     if x >= 1.0 + * + * into + * + *     if x >= a + * + * then clip_color is also linear: + * + *     r * clip_color (C, a) = clip_color (r * C, r * a); + * + * for positive r. + * + * Similarly, we can extend set_lum with an extra argument that is just passed + * on to clip_color: + * + *       r * set_lum (C, l, a) + * + *     = r × clip_color (C + l - LUM (C), a) + * + *     = clip_color (r * C + r × l - r * LUM (C), r * a) + * + *     = set_lum (r * C, r * l, r * a) + * + * Finally, set_sat: + * + *       r * set_sat (C, s) = set_sat (x * C, r * s) + * + * The above holds for all non-zero x, because the x'es in the fraction for + * C_mid cancel out. Specifically, it holds for x = r: + * + *       r * set_sat (C, s) = set_sat (r * C, r * s) + * + */ +typedef struct +{ +    float	r; +    float	g; +    float	b; +} rgb_t; + +static force_inline float +minf (float a, float b) +{ +    return a < b? a : b; +} + +static force_inline float +maxf (float a, float b) +{ +    return a > b? 
a : b; +} + +static force_inline float +channel_min (const rgb_t *c) +{ +    return minf (minf (c->r, c->g), c->b); +} + +static force_inline float +channel_max (const rgb_t *c) +{ +    return maxf (maxf (c->r, c->g), c->b); +} + +static force_inline float +get_lum (const rgb_t *c) +{ +    return c->r * 0.3f + c->g * 0.59f + c->b * 0.11f; +} + +static force_inline float +get_sat (const rgb_t *c) +{ +    return channel_max (c) - channel_min (c); +} + +static void +clip_color (rgb_t *color, float a) +{ +    float l = get_lum (color); +    float n = channel_min (color); +    float x = channel_max (color); +    float t; + +    if (n < 0.0f) +    { +	t = l - n; +	if (FLOAT_IS_ZERO (t)) +	{ +	    color->r = 0.0f; +	    color->g = 0.0f; +	    color->b = 0.0f; +	} +	else +	{ +	    color->r = l + (((color->r - l) * l) / t); +	    color->g = l + (((color->g - l) * l) / t); +	    color->b = l + (((color->b - l) * l) / t); +	} +    } +    if (x > a) +    { +	t = x - l; +	if (FLOAT_IS_ZERO (t)) +	{ +	    color->r = a; +	    color->g = a; +	    color->b = a; +	} +	else +	{ +	    color->r = l + (((color->r - l) * (a - l) / t)); +	    color->g = l + (((color->g - l) * (a - l) / t)); +	    color->b = l + (((color->b - l) * (a - l) / t)); +	} +    } +} + +static void +set_lum (rgb_t *color, float sa, float l) +{ +    float d = l - get_lum (color); + +    color->r = color->r + d; +    color->g = color->g + d; +    color->b = color->b + d; + +    clip_color (color, sa); +} + +static void +set_sat (rgb_t *src, float sat) +{ +    float *max, *mid, *min; +    float t; + +    if (src->r > src->g) +    { +	if (src->r > src->b) +	{ +	    max = &(src->r); + +	    if (src->g > src->b) +	    { +		mid = &(src->g); +		min = &(src->b); +	    } +	    else +	    { +		mid = &(src->b); +		min = &(src->g); +	    } +	} +	else +	{ +	    max = &(src->b); +	    mid = &(src->r); +	    min = &(src->g); +	} +    } +    else +    { +	if (src->r > src->b) +	{ +	    max = &(src->g); +	    mid = &(src->r); +	    min = &(src->b); +	} +	else +	{ +	    min = &(src->r); + +	    if (src->g > src->b) +	    { +		max = &(src->g); +		mid = &(src->b); +	    } +	    else +	    { +		max = &(src->b); +		mid = &(src->g); +	    } +	} +    } + +    t = *max - *min; + +    if (FLOAT_IS_ZERO (t)) +    { +	*mid = *max = 0.0f; +    } +    else +    { +	*mid = ((*mid - *min) * sat) / t; +	*max = sat; +    } + +    *min = 0.0f; +} + +/* Hue: + * + *       as * ad * B(s/as, d/as) + *     = as * ad * set_lum (set_sat (s/as, SAT (d/ad)), LUM (d/ad), 1) + *     = set_lum (set_sat (ad * s, as * SAT (d)), as * LUM (d), as * ad) + * + */ +static force_inline void +blend_hsl_hue (rgb_t *res, +	       const rgb_t *dest, float da, +	       const rgb_t *src, float sa) +{ +    res->r = src->r * da; +    res->g = src->g * da; +    res->b = src->b * da; + +    set_sat (res, get_sat (dest) * sa); +    set_lum (res, sa * da, get_lum (dest) * sa); +} + +/*  + * Saturation + * + *     as * ad * B(s/as, d/ad) + *   = as * ad * set_lum (set_sat (d/ad, SAT (s/as)), LUM (d/ad), 1) + *   = set_lum (as * ad * set_sat (d/ad, SAT (s/as)), + *                                       as * LUM (d), as * ad) + *   = set_lum (set_sat (as * d, ad * SAT (s), as * LUM (d), as * ad)) + */ +static force_inline void +blend_hsl_saturation (rgb_t *res, +		      const rgb_t *dest, float da, +		      const rgb_t *src, float sa) +{ +    res->r = dest->r * sa; +    res->g = dest->g * sa; +    res->b = dest->b * sa; + +    set_sat (res, get_sat (src) * da); +    set_lum (res, sa * da, get_lum (dest) * 
sa); +} + +/*  + * Color + * + *     as * ad * B(s/as, d/as) + *   = as * ad * set_lum (s/as, LUM (d/ad), 1) + *   = set_lum (s * ad, as * LUM (d), as * ad) + */ +static force_inline void +blend_hsl_color (rgb_t *res, +		 const rgb_t *dest, float da, +		 const rgb_t *src, float sa) +{ +    res->r = src->r * da; +    res->g = src->g * da; +    res->b = src->b * da; + +    set_lum (res, sa * da, get_lum (dest) * sa); +} + +/* + * Luminosity + * + *     as * ad * B(s/as, d/ad) + *   = as * ad * set_lum (d/ad, LUM (s/as), 1) + *   = set_lum (as * d, ad * LUM (s), as * ad) + */ +static force_inline void +blend_hsl_luminosity (rgb_t *res, +		      const rgb_t *dest, float da, +		      const rgb_t *src, float sa) +{ +    res->r = dest->r * sa; +    res->g = dest->g * sa; +    res->b = dest->b * sa; + +    set_lum (res, sa * da, get_lum (src) * da); +} + +#define MAKE_NON_SEPARABLE_PDF_COMBINERS(name)				\ +    static void								\ +    combine_ ## name ## _u_float (pixman_implementation_t *imp,		\ +				  pixman_op_t              op,		\ +				  float                   *dest,	\ +				  const float             *src,		\ +				  const float             *mask,	\ +				  int		           n_pixels)	\ +    {									\ +    	int i;								\ +									\ +	for (i = 0; i < 4 * n_pixels; i += 4)				\ +	{								\ +	    float sa, da;						\ +	    rgb_t sc, dc, rc;						\ +									\ +	    sa = src[i + 0];						\ +	    sc.r = src[i + 1];						\ +	    sc.g = src[i + 2];						\ +	    sc.b = src[i + 3];						\ +									\ +	    da = dest[i + 0];						\ +	    dc.r = dest[i + 1];						\ +	    dc.g = dest[i + 2];						\ +	    dc.b = dest[i + 3];						\ +									\ +	    if (mask)							\ +	    {								\ +		float ma = mask[i + 0];					\ +									\ +		/* Component alpha is not supported for HSL modes */	\ +		sa *= ma;						\ +		sc.r *= ma;						\ +		sc.g *= ma;						\ +		sc.g *= ma;						\ +	    }								\ +									\ +	    blend_ ## name (&rc, &dc, da, &sc, sa);			\ +									\ +	    dest[i + 0] = sa + da - sa * da;				\ +	    dest[i + 1] = (1 - sa) * dc.r + (1 - da) * sc.r + rc.r;	\ +	    dest[i + 2] = (1 - sa) * dc.g + (1 - da) * sc.g + rc.g;	\ +	    dest[i + 3] = (1 - sa) * dc.b + (1 - da) * sc.b + rc.b;	\ +	}								\ +    } + +MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_hue) +MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_saturation) +MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_color) +MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_luminosity) + +void +_pixman_setup_combiner_functions_float (pixman_implementation_t *imp) +{ +    /* Unified alpha */ +    imp->combine_float[PIXMAN_OP_CLEAR] = combine_clear_u_float; +    imp->combine_float[PIXMAN_OP_SRC] = combine_src_u_float; +    imp->combine_float[PIXMAN_OP_DST] = combine_dst_u_float; +    imp->combine_float[PIXMAN_OP_OVER] = combine_over_u_float; +    imp->combine_float[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u_float; +    imp->combine_float[PIXMAN_OP_IN] = combine_in_u_float; +    imp->combine_float[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u_float; +    imp->combine_float[PIXMAN_OP_OUT] = combine_out_u_float; +    imp->combine_float[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u_float; +    imp->combine_float[PIXMAN_OP_ATOP] = combine_atop_u_float; +    imp->combine_float[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u_float; +    imp->combine_float[PIXMAN_OP_XOR] = combine_xor_u_float; +    imp->combine_float[PIXMAN_OP_ADD] = combine_add_u_float; +    imp->combine_float[PIXMAN_OP_SATURATE] = combine_saturate_u_float; + +    /* Disjoint, unified */ +    
imp->combine_float[PIXMAN_OP_DISJOINT_CLEAR] = combine_disjoint_clear_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_SRC] = combine_disjoint_src_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_DST] = combine_disjoint_dst_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_disjoint_over_reverse_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u_float; +    imp->combine_float[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u_float; + +    /* Conjoint, unified */ +    imp->combine_float[PIXMAN_OP_CONJOINT_CLEAR] = combine_conjoint_clear_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_SRC] = combine_conjoint_src_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_DST] = combine_conjoint_dst_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_u_float; +    imp->combine_float[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u_float; + +    /* PDF operators, unified */ +    imp->combine_float[PIXMAN_OP_MULTIPLY] = combine_multiply_u_float; +    imp->combine_float[PIXMAN_OP_SCREEN] = combine_screen_u_float; +    imp->combine_float[PIXMAN_OP_OVERLAY] = combine_overlay_u_float; +    imp->combine_float[PIXMAN_OP_DARKEN] = combine_darken_u_float; +    imp->combine_float[PIXMAN_OP_LIGHTEN] = combine_lighten_u_float; +    imp->combine_float[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u_float; +    imp->combine_float[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u_float; +    imp->combine_float[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u_float; +    imp->combine_float[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u_float; +    imp->combine_float[PIXMAN_OP_DIFFERENCE] = combine_difference_u_float; +    imp->combine_float[PIXMAN_OP_EXCLUSION] = combine_exclusion_u_float; + +    imp->combine_float[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u_float; +    imp->combine_float[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u_float; +    imp->combine_float[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u_float; +    imp->combine_float[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u_float; + +    /* Component alpha combiners */ +    imp->combine_float_ca[PIXMAN_OP_CLEAR] = combine_clear_ca_float; +    imp->combine_float_ca[PIXMAN_OP_SRC] = combine_src_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DST] = combine_dst_ca_float; +    imp->combine_float_ca[PIXMAN_OP_OVER] = combine_over_ca_float; + 
   imp->combine_float_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_IN] = combine_in_ca_float; +    imp->combine_float_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_OUT] = combine_out_ca_float; +    imp->combine_float_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_ATOP] = combine_atop_ca_float; +    imp->combine_float_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_XOR] = combine_xor_ca_float; +    imp->combine_float_ca[PIXMAN_OP_ADD] = combine_add_ca_float; +    imp->combine_float_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca_float; + +    /* Disjoint CA */ +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_disjoint_clear_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_SRC] = combine_disjoint_src_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_DST] = combine_disjoint_dst_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_disjoint_over_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca_float; + +    /* Conjoint CA */ +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_conjoint_clear_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_SRC] = combine_conjoint_src_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_DST] = combine_conjoint_dst_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca_float; +    imp->combine_float_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca_float; + +    /* PDF operators CA */ +    imp->combine_float_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca_float; +    imp->combine_float_ca[PIXMAN_OP_SCREEN] = combine_screen_ca_float; +    imp->combine_float_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DARKEN] = combine_darken_ca_float; +    imp->combine_float_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca_float; +    imp->combine_float_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca_float; +    imp->combine_float_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca_float; +    imp->combine_float_ca[PIXMAN_OP_HARD_LIGHT] = 
combine_hard_light_ca_float; +    imp->combine_float_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca_float; +    imp->combine_float_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca_float; +    imp->combine_float_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca_float; + +    /* It is not clear that these make sense, so make them noops for now */ +    imp->combine_float_ca[PIXMAN_OP_HSL_HUE] = combine_dst_u_float; +    imp->combine_float_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst_u_float; +    imp->combine_float_ca[PIXMAN_OP_HSL_COLOR] = combine_dst_u_float; +    imp->combine_float_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst_u_float; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-combine32.c b/libs/pixman-0.40.0/pixman/pixman-combine32.c new file mode 100644 index 0000000..4a89384 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-combine32.c @@ -0,0 +1,1189 @@ +/* + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + *             2005 Lars Knoll & Zack Rusin, Trolltech + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <math.h> +#include <string.h> + +#include "pixman-private.h" +#include "pixman-combine32.h" + +/* component alpha helper functions */ + +static void +combine_mask_ca (uint32_t *src, uint32_t *mask) +{ +    uint32_t a = *mask; + +    uint32_t x; +    uint16_t xa; + +    if (!a) +    { +	*(src) = 0; +	return; +    } + +    x = *(src); +    if (a == ~0) +    { +	x = x >> A_SHIFT; +	x |= x << G_SHIFT; +	x |= x << R_SHIFT; +	*(mask) = x; +	return; +    } + +    xa = x >> A_SHIFT; +    UN8x4_MUL_UN8x4 (x, a); +    *(src) = x; +     +    UN8x4_MUL_UN8 (a, xa); +    *(mask) = a; +} + +static void +combine_mask_value_ca (uint32_t *src, const uint32_t *mask) +{ +    uint32_t a = *mask; +    uint32_t x; + +    if (!a) +    { +	*(src) = 0; +	return; +    } + +    if (a == ~0) +	return; + +    x = *(src); +    UN8x4_MUL_UN8x4 (x, a); +    *(src) = x; +} + +static void +combine_mask_alpha_ca (const uint32_t *src, uint32_t *mask) +{ +    uint32_t a = *(mask); +    uint32_t x; + +    if (!a) +	return; + +    x = *(src) >> A_SHIFT; +    if (x == MASK) +	return; + +    if (a == ~0) +    { +	x |= x << G_SHIFT; +	x |= x << R_SHIFT; +	*(mask) = x; +	return; +    } + +    UN8x4_MUL_UN8 (a, x); +    *(mask) = a; +} + +/* + * There are two ways of handling alpha -- either as a single unified value or + * a separate value for each component, hence each macro must have two + * versions.  The unified alpha version has a 'u' at the end of the name, + * the component version has a 'ca'.  Similarly, functions which deal with + * this difference will have two versions using the same convention. + */ + +static force_inline uint32_t +combine_mask (const uint32_t *src, const uint32_t *mask, int i) +{ +    uint32_t s, m; + +    if (mask) +    { +	m = *(mask + i) >> A_SHIFT; + +	if (!m) +	    return 0; +    } + +    s = *(src + i); + +    if (mask) +	UN8x4_MUL_UN8 (s, m); + +    return s; +} + +static void +combine_clear (pixman_implementation_t *imp, +               pixman_op_t              op, +               uint32_t *               dest, +               const uint32_t *         src, +               const uint32_t *         mask, +               int                      width) +{ +    memset (dest, 0, width * sizeof (uint32_t)); +} + +static void +combine_dst (pixman_implementation_t *imp, +	     pixman_op_t	      op, +	     uint32_t *		      dest, +	     const uint32_t *	      src, +	     const uint32_t *         mask, +	     int		      width) +{ +    return; +} + +static void +combine_src_u (pixman_implementation_t *imp, +               pixman_op_t              op, +               uint32_t *               dest, +               const uint32_t *         src, +               const uint32_t *         mask, +               int                      width) +{ +    int i; + +    if (!mask) +    { +	memcpy (dest, src, width * sizeof (uint32_t)); +    } +    else +    { +	for (i = 0; i < width; ++i) +	{ +	    uint32_t s = combine_mask (src, mask, i); + +	    *(dest + i) = s; +	} +    } +} + +static void +combine_over_u (pixman_implementation_t *imp, +                pixman_op_t              op, +                uint32_t *               dest, +                const uint32_t *         src, +                const uint32_t *         mask, +                int                      width) +{ +    int i; + +    if (!mask) +    { +	for (i = 0; i < width; ++i) +	{ +	    uint32_t s = *(src + i); +	    uint32_t a = ALPHA_8 (s); +	    if (a == 0xFF) +	    { +		*(dest + i) = 
s; +	    } +	    else if (s) +	    { +		uint32_t d = *(dest + i); +		uint32_t ia = a ^ 0xFF; +		UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); +		*(dest + i) = d; +	    } +	} +    } +    else +    { +	for (i = 0; i < width; ++i) +	{ +	    uint32_t m = ALPHA_8 (*(mask + i)); +	    if (m == 0xFF) +	    { +		uint32_t s = *(src + i); +		uint32_t a = ALPHA_8 (s); +		if (a == 0xFF) +		{ +		    *(dest + i) = s; +		} +		else if (s) +		{ +		    uint32_t d = *(dest + i); +		    uint32_t ia = a ^ 0xFF; +		    UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); +		    *(dest + i) = d; +		} +	    } +	    else if (m) +	    { +		uint32_t s = *(src + i); +		if (s) +		{ +		    uint32_t d = *(dest + i); +		    UN8x4_MUL_UN8 (s, m); +		    UN8x4_MUL_UN8_ADD_UN8x4 (d, ALPHA_8 (~s), s); +		    *(dest + i) = d; +		} +	    } +	} +    } +} + +static void +combine_over_reverse_u (pixman_implementation_t *imp, +                        pixman_op_t              op, +                        uint32_t *               dest, +                        const uint32_t *         src, +                        const uint32_t *         mask, +                        int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = combine_mask (src, mask, i); +	uint32_t d = *(dest + i); +	uint32_t ia = ALPHA_8 (~*(dest + i)); +	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); +	*(dest + i) = s; +    } +} + +static void +combine_in_u (pixman_implementation_t *imp, +              pixman_op_t              op, +              uint32_t *               dest, +              const uint32_t *         src, +              const uint32_t *         mask, +              int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = combine_mask (src, mask, i); +	uint32_t a = ALPHA_8 (*(dest + i)); +	UN8x4_MUL_UN8 (s, a); +	*(dest + i) = s; +    } +} + +static void +combine_in_reverse_u (pixman_implementation_t *imp, +                      pixman_op_t              op, +                      uint32_t *               dest, +                      const uint32_t *         src, +                      const uint32_t *         mask, +                      int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = combine_mask (src, mask, i); +	uint32_t d = *(dest + i); +	uint32_t a = ALPHA_8 (s); +	UN8x4_MUL_UN8 (d, a); +	*(dest + i) = d; +    } +} + +static void +combine_out_u (pixman_implementation_t *imp, +               pixman_op_t              op, +               uint32_t *               dest, +               const uint32_t *         src, +               const uint32_t *         mask, +               int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = combine_mask (src, mask, i); +	uint32_t a = ALPHA_8 (~*(dest + i)); +	UN8x4_MUL_UN8 (s, a); +	*(dest + i) = s; +    } +} + +static void +combine_out_reverse_u (pixman_implementation_t *imp, +                       pixman_op_t              op, +                       uint32_t *               dest, +                       const uint32_t *         src, +                       const uint32_t *         mask, +                       int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = combine_mask (src, mask, i); +	uint32_t d = *(dest + i); +	uint32_t a = ALPHA_8 (~s); +	UN8x4_MUL_UN8 (d, a); +	*(dest + i) = d; +    } +} + +static void +combine_atop_u (pixman_implementation_t *imp, +                pixman_op_t              
op, +                uint32_t *               dest, +                const uint32_t *         src, +                const uint32_t *         mask, +                int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = combine_mask (src, mask, i); +	uint32_t d = *(dest + i); +	uint32_t dest_a = ALPHA_8 (d); +	uint32_t src_ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); +	*(dest + i) = s; +    } +} + +static void +combine_atop_reverse_u (pixman_implementation_t *imp, +                        pixman_op_t              op, +                        uint32_t *               dest, +                        const uint32_t *         src, +                        const uint32_t *         mask, +                        int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = combine_mask (src, mask, i); +	uint32_t d = *(dest + i); +	uint32_t src_a = ALPHA_8 (s); +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); +	*(dest + i) = s; +    } +} + +static void +combine_xor_u (pixman_implementation_t *imp, +               pixman_op_t              op, +               uint32_t *               dest, +               const uint32_t *         src, +               const uint32_t *         mask, +               int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = combine_mask (src, mask, i); +	uint32_t d = *(dest + i); +	uint32_t src_ia = ALPHA_8 (~s); +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); +	*(dest + i) = s; +    } +} + +static void +combine_add_u (pixman_implementation_t *imp, +               pixman_op_t              op, +               uint32_t *               dest, +               const uint32_t *         src, +               const uint32_t *         mask, +               int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = combine_mask (src, mask, i); +	uint32_t d = *(dest + i); +	UN8x4_ADD_UN8x4 (d, s); +	*(dest + i) = d; +    } +} + +/* + * PDF blend modes: + * + * The following blend modes have been taken from the PDF ISO 32000 + * specification, which at this point in time is available from + * + *     http://www.adobe.com/devnet/pdf/pdf_reference.html + * + * The specific documents of interest are the PDF spec itself: + * + *     http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf + * + * chapters 11.3.5 and 11.3.6 and a later supplement for Adobe Acrobat + * 9.1 and Reader 9.1: + * + *     http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000_1.pdf + * + * that clarifies the specifications for blend modes ColorDodge and + * ColorBurn. + * + * The formula for computing the final pixel color given in 11.3.6 is: + * + *     αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs) + * + * with B() is the blend function. When B(Cb, Cs) = Cs, this formula + * reduces to the regular OVER operator. 
+ * + * Cs and Cb are not premultiplied, so in our implementation we instead + * use: + * + *     cr = (1 – αs) × cb  +  (1 – αb) × cs  +  αb × αs × B (cb/αb, cs/αs) + * + * where cr, cs, and cb are premultiplied colors, and where the + * + *     αb × αs × B(cb/αb, cs/αs) + * + * part is first arithmetically simplified under the assumption that αb + * and αs are not 0, and then updated to produce a meaningful result when + * they are. + * + * For all the blend mode operators, the alpha channel is given by + * + *     αr = αs + αb + αb × αs + */ + +/* + * Multiply + * + *      ad * as * B(d / ad, s / as) + *    = ad * as * d/ad * s/as + *    = d * s + * + */ +static void +combine_multiply_u (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = combine_mask (src, mask, i); +	uint32_t d = *(dest + i); +	uint32_t ss = s; +	uint32_t src_ia = ALPHA_8 (~s); +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (ss, dest_ia, d, src_ia); +	UN8x4_MUL_UN8x4 (d, s); +	UN8x4_ADD_UN8x4 (d, ss); + +	*(dest + i) = d; +    } +} + +static void +combine_multiply_ca (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               dest, +                     const uint32_t *         src, +                     const uint32_t *         mask, +                     int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t m = *(mask + i); +	uint32_t s = *(src + i); +	uint32_t d = *(dest + i); +	uint32_t r = d; +	uint32_t dest_ia = ALPHA_8 (~d); + +	combine_mask_ca (&s, &m); + +	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (r, ~m, s, dest_ia); +	UN8x4_MUL_UN8x4 (d, s); +	UN8x4_ADD_UN8x4 (r, d); + +	*(dest + i) = r; +    } +} + +#define CLAMP(v, low, high)						\ +    do									\ +    {									\ +	if (v < (low))							\ +	    v = (low);							\ +	if (v > (high))							\ +	    v = (high);							\ +    } while (0) + +#define PDF_SEPARABLE_BLEND_MODE(name)					\ +    static void								\ +    combine_ ## name ## _u (pixman_implementation_t *imp,		\ +			    pixman_op_t              op,		\ +                            uint32_t *               dest,		\ +			    const uint32_t *         src,		\ +			    const uint32_t *         mask,		\ +			    int                      width)		\ +    {									\ +	int i;								\ +	for (i = 0; i < width; ++i)					\ +	{								\ +	    uint32_t s = combine_mask (src, mask, i);			\ +	    uint32_t d = *(dest + i);					\ +	    uint8_t sa = ALPHA_8 (s);					\ +	    uint8_t isa = ~sa;						\ +	    uint8_t da = ALPHA_8 (d);					\ +	    uint8_t ida = ~da;						\ +	    uint32_t ra, rr, rg, rb;					\ +	    								\ +	    ra = da * 0xff + sa * 0xff - sa * da;			\ +	    rr = isa * RED_8 (d) + ida * RED_8 (s);			\ +	    rg = isa * GREEN_8 (d) + ida * GREEN_8 (s);			\ +	    rb = isa * BLUE_8 (d) + ida * BLUE_8 (s);			\ +									\ +	    rr += blend_ ## name (RED_8 (d), da, RED_8 (s), sa);	\ +	    rg += blend_ ## name (GREEN_8 (d), da, GREEN_8 (s), sa);    \ +	    rb += blend_ ## name (BLUE_8 (d), da, BLUE_8 (s), sa);	\ +                                                                        \ +	    CLAMP (ra, 0, 255 * 255);				        \ +	    CLAMP (rr, 0, 255 * 255);				        \ +	    CLAMP 
(rg, 0, 255 * 255);				        \ +	    CLAMP (rb, 0, 255 * 255);				        \ +									\ +	    ra = DIV_ONE_UN8 (ra);					\ +	    rr = DIV_ONE_UN8 (rr);					\ +	    rg = DIV_ONE_UN8 (rg);					\ +	    rb = DIV_ONE_UN8 (rb);					\ +									\ +	    *(dest + i) = ra << 24 | rr << 16 | rg << 8 | rb;		\ +	}								\ +    }									\ +    									\ +    static void								\ +    combine_ ## name ## _ca (pixman_implementation_t *imp,		\ +			     pixman_op_t              op,		\ +                             uint32_t *               dest,		\ +			     const uint32_t *         src,		\ +			     const uint32_t *         mask,		\ +			     int                      width)		\ +    {									\ +	int i;								\ +	for (i = 0; i < width; ++i)					\ +	{								\ +	    uint32_t m = *(mask + i);					\ +	    uint32_t s = *(src + i);					\ +	    uint32_t d = *(dest + i);					\ +	    uint8_t da = ALPHA_8 (d);					\ +	    uint8_t ida = ~da;						\ +	    uint32_t ra, rr, rg, rb;					\ +	    uint8_t ira, iga, iba;					\ +	    								\ +	    combine_mask_ca (&s, &m);					\ +	    								\ +	    ira = ~RED_8 (m);						\ +	    iga = ~GREEN_8 (m);						\ +	    iba = ~BLUE_8 (m);						\ +									\ +	    ra = da * 0xff + ALPHA_8 (s) * 0xff - ALPHA_8 (s) * da;	\ +	    rr = ira * RED_8 (d) + ida * RED_8 (s);			\ +	    rg = iga * GREEN_8 (d) + ida * GREEN_8 (s);			\ +	    rb = iba * BLUE_8 (d) + ida * BLUE_8 (s);			\ +									\ +	    rr += blend_ ## name (RED_8 (d), da, RED_8 (s), RED_8 (m));	\ +	    rg += blend_ ## name (GREEN_8 (d), da, GREEN_8 (s), GREEN_8 (m)); \ +	    rb += blend_ ## name (BLUE_8 (d), da, BLUE_8 (s), BLUE_8 (m)); \ +									\ +	    CLAMP (ra, 0, 255 * 255);				        \ +	    CLAMP (rr, 0, 255 * 255);				        \ +	    CLAMP (rg, 0, 255 * 255);				        \ +	    CLAMP (rb, 0, 255 * 255);				        \ +									\ +	    ra = DIV_ONE_UN8 (ra);					\ +	    rr = DIV_ONE_UN8 (rr);					\ +	    rg = DIV_ONE_UN8 (rg);					\ +	    rb = DIV_ONE_UN8 (rb);					\ +									\ +	    *(dest + i) = ra << 24 | rr << 16 | rg << 8 | rb;		\ +	}								\ +    } + +/* + * Screen + * + *      ad * as * B(d/ad, s/as) + *    = ad * as * (d/ad + s/as - s/as * d/ad) + *    = ad * s + as * d - s * d + */ +static inline int32_t +blend_screen (int32_t d, int32_t ad, int32_t s, int32_t as) +{ +    return s * ad + d * as - s * d; +} + +PDF_SEPARABLE_BLEND_MODE (screen) + +/* + * Overlay + * + *     ad * as * B(d/ad, s/as) + *   = ad * as * Hardlight (s, d) + *   = if (d / ad < 0.5) + *         as * ad * Multiply (s/as, 2 * d/ad) + *     else + *         as * ad * Screen (s/as, 2 * d / ad - 1) + *   = if (d < 0.5 * ad) + *         as * ad * s/as * 2 * d /ad + *     else + *         as * ad * (s/as + 2 * d / ad - 1 - s / as * (2 * d / ad - 1)) + *   = if (2 * d < ad) + *         2 * s * d + *     else + *         ad * s + 2 * as * d - as * ad - ad * s * (2 * d / ad - 1) + *   = if (2 * d < ad) + *         2 * s * d + *     else + *         as * ad - 2 * (ad - d) * (as - s) + */ +static inline int32_t +blend_overlay (int32_t d, int32_t ad, int32_t s, int32_t as) +{ +    uint32_t r; + +    if (2 * d < ad) +	r = 2 * s * d; +    else +	r = as * ad - 2 * (ad - d) * (as - s); + +    return r; +} + +PDF_SEPARABLE_BLEND_MODE (overlay) + +/* + * Darken + * + *     ad * as * B(d/ad, s/as) + *   = ad * as * MIN(d/ad, s/as) + *   = MIN (as * d, ad * s) + */ +static inline int32_t +blend_darken (int32_t d, int32_t ad, int32_t s, int32_t as) +{ +    s = ad * s; +    d = as * d; + +    return s > d ? 
d : s; +} + +PDF_SEPARABLE_BLEND_MODE (darken) + +/* + * Lighten + * + *     ad * as * B(d/ad, s/as) + *   = ad * as * MAX(d/ad, s/as) + *   = MAX (as * d, ad * s) + */ +static inline int32_t +blend_lighten (int32_t d, int32_t ad, int32_t s, int32_t as) +{ +    s = ad * s; +    d = as * d; +     +    return s > d ? s : d; +} + +PDF_SEPARABLE_BLEND_MODE (lighten) + +/* + * Hard light + * + *     ad * as * B(d/ad, s/as) + *   = if (s/as <= 0.5) + *         ad * as * Multiply (d/ad, 2 * s/as) + *     else + *         ad * as * Screen (d/ad, 2 * s/as - 1) + *   = if 2 * s <= as + *         ad * as * d/ad * 2 * s / as + *     else + *         ad * as * (d/ad + (2 * s/as - 1) + d/ad * (2 * s/as - 1)) + *   = if 2 * s <= as + *         2 * s * d + *     else + *         as * ad - 2 * (ad - d) * (as - s) + */ +static inline int32_t +blend_hard_light (int32_t d, int32_t ad, int32_t s, int32_t as) +{ +    if (2 * s < as) +	return 2 * s * d; +    else +	return as * ad - 2 * (ad - d) * (as - s); +} + +PDF_SEPARABLE_BLEND_MODE (hard_light) + +/* + * Difference + * + *     ad * as * B(s/as, d/ad) + *   = ad * as * abs (s/as - d/ad) + *   = if (s/as <= d/ad) + *         ad * as * (d/ad - s/as) + *     else + *         ad * as * (s/as - d/ad) + *   = if (ad * s <= as * d) + *        as * d - ad * s + *     else + *        ad * s - as * d + */ +static inline int32_t +blend_difference (int32_t d, int32_t ad, int32_t s, int32_t as) +{ +    int32_t das = d * as; +    int32_t sad = s * ad; + +    if (sad < das) +	return das - sad; +    else +	return sad - das; +} + +PDF_SEPARABLE_BLEND_MODE (difference) + +/* + * Exclusion + * + *     ad * as * B(s/as, d/ad) + *   = ad * as * (d/ad + s/as - 2 * d/ad * s/as) + *   = as * d + ad * s - 2 * s * d + */ + +/* This can be made faster by writing it directly and not using + * PDF_SEPARABLE_BLEND_MODE, but that's a performance optimization */ + +static inline int32_t +blend_exclusion (int32_t d, int32_t ad, int32_t s, int32_t as) +{ +    return s * ad + d * as - 2 * d * s; +} + +PDF_SEPARABLE_BLEND_MODE (exclusion) + +#undef PDF_SEPARABLE_BLEND_MODE + +/* Component alpha combiners */ + +static void +combine_clear_ca (pixman_implementation_t *imp, +                  pixman_op_t              op, +                  uint32_t *                dest, +                  const uint32_t *          src, +                  const uint32_t *          mask, +                  int                      width) +{ +    memset (dest, 0, width * sizeof(uint32_t)); +} + +static void +combine_src_ca (pixman_implementation_t *imp, +                pixman_op_t              op, +                uint32_t *                dest, +                const uint32_t *          src, +                const uint32_t *          mask, +                int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = *(src + i); +	uint32_t m = *(mask + i); + +	combine_mask_value_ca (&s, &m); + +	*(dest + i) = s; +    } +} + +static void +combine_over_ca (pixman_implementation_t *imp, +                 pixman_op_t              op, +                 uint32_t *                dest, +                 const uint32_t *          src, +                 const uint32_t *          mask, +                 int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = *(src + i); +	uint32_t m = *(mask + i); +	uint32_t a; + +	combine_mask_ca (&s, &m); + +	a = ~m; +	if (a) +	{ +	    uint32_t d = *(dest + i); +	    UN8x4_MUL_UN8x4_ADD_UN8x4 (d, a, 
s); +	    s = d; +	} + +	*(dest + i) = s; +    } +} + +static void +combine_over_reverse_ca (pixman_implementation_t *imp, +                         pixman_op_t              op, +                         uint32_t *                dest, +                         const uint32_t *          src, +                         const uint32_t *          mask, +                         int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t d = *(dest + i); +	uint32_t a = ~d >> A_SHIFT; + +	if (a) +	{ +	    uint32_t s = *(src + i); +	    uint32_t m = *(mask + i); + +	    UN8x4_MUL_UN8x4 (s, m); +	    UN8x4_MUL_UN8_ADD_UN8x4 (s, a, d); + +	    *(dest + i) = s; +	} +    } +} + +static void +combine_in_ca (pixman_implementation_t *imp, +               pixman_op_t              op, +               uint32_t *                dest, +               const uint32_t *          src, +               const uint32_t *          mask, +               int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t d = *(dest + i); +	uint16_t a = d >> A_SHIFT; +	uint32_t s = 0; + +	if (a) +	{ +	    uint32_t m = *(mask + i); + +	    s = *(src + i); +	    combine_mask_value_ca (&s, &m); + +	    if (a != MASK) +		UN8x4_MUL_UN8 (s, a); +	} + +	*(dest + i) = s; +    } +} + +static void +combine_in_reverse_ca (pixman_implementation_t *imp, +                       pixman_op_t              op, +                       uint32_t *                dest, +                       const uint32_t *          src, +                       const uint32_t *          mask, +                       int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = *(src + i); +	uint32_t m = *(mask + i); +	uint32_t a; + +	combine_mask_alpha_ca (&s, &m); + +	a = m; +	if (a != ~0) +	{ +	    uint32_t d = 0; + +	    if (a) +	    { +		d = *(dest + i); +		UN8x4_MUL_UN8x4 (d, a); +	    } + +	    *(dest + i) = d; +	} +    } +} + +static void +combine_out_ca (pixman_implementation_t *imp, +                pixman_op_t              op, +                uint32_t *                dest, +                const uint32_t *          src, +                const uint32_t *          mask, +                int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t d = *(dest + i); +	uint16_t a = ~d >> A_SHIFT; +	uint32_t s = 0; + +	if (a) +	{ +	    uint32_t m = *(mask + i); + +	    s = *(src + i); +	    combine_mask_value_ca (&s, &m); + +	    if (a != MASK) +		UN8x4_MUL_UN8 (s, a); +	} + +	*(dest + i) = s; +    } +} + +static void +combine_out_reverse_ca (pixman_implementation_t *imp, +                        pixman_op_t              op, +                        uint32_t *                dest, +                        const uint32_t *          src, +                        const uint32_t *          mask, +                        int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = *(src + i); +	uint32_t m = *(mask + i); +	uint32_t a; + +	combine_mask_alpha_ca (&s, &m); + +	a = ~m; +	if (a != ~0) +	{ +	    uint32_t d = 0; + +	    if (a) +	    { +		d = *(dest + i); +		UN8x4_MUL_UN8x4 (d, a); +	    } + +	    *(dest + i) = d; +	} +    } +} + +static void +combine_atop_ca (pixman_implementation_t *imp, +                 pixman_op_t              op, +                 uint32_t *                dest, +                 const uint32_t *          src, +                 const 
uint32_t *          mask, +                 int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t d = *(dest + i); +	uint32_t s = *(src + i); +	uint32_t m = *(mask + i); +	uint32_t ad; +	uint16_t as = d >> A_SHIFT; + +	combine_mask_ca (&s, &m); + +	ad = ~m; + +	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ad, s, as); + +	*(dest + i) = d; +    } +} + +static void +combine_atop_reverse_ca (pixman_implementation_t *imp, +                         pixman_op_t              op, +                         uint32_t *                dest, +                         const uint32_t *          src, +                         const uint32_t *          mask, +                         int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t d = *(dest + i); +	uint32_t s = *(src + i); +	uint32_t m = *(mask + i); +	uint32_t ad; +	uint16_t as = ~d >> A_SHIFT; + +	combine_mask_ca (&s, &m); + +	ad = m; + +	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ad, s, as); + +	*(dest + i) = d; +    } +} + +static void +combine_xor_ca (pixman_implementation_t *imp, +                pixman_op_t              op, +                uint32_t *                dest, +                const uint32_t *          src, +                const uint32_t *          mask, +                int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t d = *(dest + i); +	uint32_t s = *(src + i); +	uint32_t m = *(mask + i); +	uint32_t ad; +	uint16_t as = ~d >> A_SHIFT; + +	combine_mask_ca (&s, &m); + +	ad = ~m; + +	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ad, s, as); + +	*(dest + i) = d; +    } +} + +static void +combine_add_ca (pixman_implementation_t *imp, +                pixman_op_t              op, +                uint32_t *                dest, +                const uint32_t *          src, +                const uint32_t *          mask, +                int                      width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t s = *(src + i); +	uint32_t m = *(mask + i); +	uint32_t d = *(dest + i); + +	combine_mask_value_ca (&s, &m); + +	UN8x4_ADD_UN8x4 (d, s); + +	*(dest + i) = d; +    } +} + +void +_pixman_setup_combiner_functions_32 (pixman_implementation_t *imp) +{ +    /* Unified alpha */ +    imp->combine_32[PIXMAN_OP_CLEAR] = combine_clear; +    imp->combine_32[PIXMAN_OP_SRC] = combine_src_u; +    imp->combine_32[PIXMAN_OP_DST] = combine_dst; +    imp->combine_32[PIXMAN_OP_OVER] = combine_over_u; +    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u; +    imp->combine_32[PIXMAN_OP_IN] = combine_in_u; +    imp->combine_32[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u; +    imp->combine_32[PIXMAN_OP_OUT] = combine_out_u; +    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u; +    imp->combine_32[PIXMAN_OP_ATOP] = combine_atop_u; +    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u; +    imp->combine_32[PIXMAN_OP_XOR] = combine_xor_u; +    imp->combine_32[PIXMAN_OP_ADD] = combine_add_u; + +    imp->combine_32[PIXMAN_OP_MULTIPLY] = combine_multiply_u; +    imp->combine_32[PIXMAN_OP_SCREEN] = combine_screen_u; +    imp->combine_32[PIXMAN_OP_OVERLAY] = combine_overlay_u; +    imp->combine_32[PIXMAN_OP_DARKEN] = combine_darken_u; +    imp->combine_32[PIXMAN_OP_LIGHTEN] = combine_lighten_u; +    imp->combine_32[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u; +    imp->combine_32[PIXMAN_OP_DIFFERENCE] = combine_difference_u; +    
imp->combine_32[PIXMAN_OP_EXCLUSION] = combine_exclusion_u; + +    /* Component alpha combiners */ +    imp->combine_32_ca[PIXMAN_OP_CLEAR] = combine_clear_ca; +    imp->combine_32_ca[PIXMAN_OP_SRC] = combine_src_ca; +    /* dest */ +    imp->combine_32_ca[PIXMAN_OP_OVER] = combine_over_ca; +    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_IN] = combine_in_ca; +    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_OUT] = combine_out_ca; +    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_ATOP] = combine_atop_ca; +    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_XOR] = combine_xor_ca; +    imp->combine_32_ca[PIXMAN_OP_ADD] = combine_add_ca; + +    imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca; +    imp->combine_32_ca[PIXMAN_OP_SCREEN] = combine_screen_ca; +    imp->combine_32_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca; +    imp->combine_32_ca[PIXMAN_OP_DARKEN] = combine_darken_ca; +    imp->combine_32_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca; +    imp->combine_32_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca; +    imp->combine_32_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca; +    imp->combine_32_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-combine32.h b/libs/pixman-0.40.0/pixman/pixman-combine32.h new file mode 100644 index 0000000..59bb247 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-combine32.h @@ -0,0 +1,272 @@ +#define COMPONENT_SIZE 8 +#define MASK 0xff +#define ONE_HALF 0x80 + +#define A_SHIFT 8 * 3 +#define R_SHIFT 8 * 2 +#define G_SHIFT 8 +#define A_MASK 0xff000000 +#define R_MASK 0xff0000 +#define G_MASK 0xff00 + +#define RB_MASK 0xff00ff +#define AG_MASK 0xff00ff00 +#define RB_ONE_HALF 0x800080 +#define RB_MASK_PLUS_ONE 0x1000100 + +#define ALPHA_8(x) ((x) >> A_SHIFT) +#define RED_8(x) (((x) >> R_SHIFT) & MASK) +#define GREEN_8(x) (((x) >> G_SHIFT) & MASK) +#define BLUE_8(x) ((x) & MASK) + +/* + * ARMv6 has UQADD8 instruction, which implements unsigned saturated + * addition for 8-bit values packed in 32-bit registers. It is very useful + * for UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros (which would + * otherwise need a lot of arithmetic operations to simulate this operation). + * Since most of the major ARM linux distros are built for ARMv7, we are + * much less dependent on runtime CPU detection and can get practical + * benefits from conditional compilation here for a lot of users. 
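+ *
+ * For example, a single UQADD8 adds four packed bytes with per-byte
+ * saturation:
+ *
+ *     0x40F0A010 UQADD8 0x30300505  =  0x70FFA515
+ *
+ * (the 0xF0 + 0x30 byte clamps to 0xFF instead of wrapping), which is
+ * exactly the x_c = min (x_c + y_c, 255) behavior that the portable
+ * UN8x4_ADD_UN8x4 fallback below has to emulate with masks and shifts.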
+ */ + +#if defined(USE_GCC_INLINE_ASM) && defined(__arm__) && \ +    !defined(__aarch64__) && (!defined(__thumb__) || defined(__thumb2__)) +#if defined(__ARM_ARCH_6__)   || defined(__ARM_ARCH_6J__)  || \ +    defined(__ARM_ARCH_6K__)  || defined(__ARM_ARCH_6Z__)  || \ +    defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \ +    defined(__ARM_ARCH_6M__)  || defined(__ARM_ARCH_7__)   || \ +    defined(__ARM_ARCH_7A__)  || defined(__ARM_ARCH_7R__)  || \ +    defined(__ARM_ARCH_7M__)  || defined(__ARM_ARCH_7EM__) + +static force_inline uint32_t +un8x4_add_un8x4 (uint32_t x, uint32_t y) +{ +    uint32_t t; +    asm ("uqadd8 %0, %1, %2" : "=r" (t) : "%r" (x), "r" (y)); +    return t; +} + +#define UN8x4_ADD_UN8x4(x, y) \ +    ((x) = un8x4_add_un8x4 ((x), (y))) + +#define UN8_rb_ADD_UN8_rb(x, y, t) \ +    ((t) = un8x4_add_un8x4 ((x), (y)), (x) = (t)) + +#define ADD_UN8(x, y, t) \ +    ((t) = (x), un8x4_add_un8x4 ((t), (y))) + +#endif +#endif + +/*****************************************************************************/ + +/* + * Helper macros. + */ + +#define MUL_UN8(a, b, t)						\ +    ((t) = (a) * (uint16_t)(b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT )) + +#define DIV_UN8(a, b)							\ +    (((uint16_t) (a) * MASK + ((b) / 2)) / (b)) + +#ifndef ADD_UN8 +#define ADD_UN8(x, y, t)				     \ +    ((t) = (x) + (y),					     \ +     (uint32_t) (uint8_t) ((t) | (0 - ((t) >> G_SHIFT)))) +#endif + +#define DIV_ONE_UN8(x)							\ +    (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT) + +/* + * The methods below use some tricks to be able to do two color + * components at the same time. + */ + +/* + * x_rb = (x_rb * a) / 255 + */ +#define UN8_rb_MUL_UN8(x, a, t)						\ +    do									\ +    {									\ +	t  = ((x) & RB_MASK) * (a);					\ +	t += RB_ONE_HALF;						\ +	x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\ +	x &= RB_MASK;							\ +    } while (0) + +/* + * x_rb = min (x_rb + y_rb, 255) + */ +#ifndef UN8_rb_ADD_UN8_rb +#define UN8_rb_ADD_UN8_rb(x, y, t)					\ +    do									\ +    {									\ +	t = ((x) + (y));						\ +	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);		\ +	x = (t & RB_MASK);						\ +    } while (0) +#endif + +/* + * x_rb = (x_rb * a_rb) / 255 + */ +#define UN8_rb_MUL_UN8_rb(x, a, t)					\ +    do									\ +    {									\ +	t  = (x & MASK) * (a & MASK);					\ +	t |= (x & R_MASK) * ((a >> R_SHIFT) & MASK);			\ +	t += RB_ONE_HALF;						\ +	t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\ +	x = t & RB_MASK;						\ +    } while (0) + +/* + * x_c = (x_c * a) / 255 + */ +#define UN8x4_MUL_UN8(x, a)						\ +    do									\ +    {									\ +	uint32_t r1__, r2__, t__;					\ +									\ +	r1__ = (x);							\ +	UN8_rb_MUL_UN8 (r1__, (a), t__);				\ +									\ +	r2__ = (x) >> G_SHIFT;						\ +	UN8_rb_MUL_UN8 (r2__, (a), t__);				\ +									\ +	(x) = r1__ | (r2__ << G_SHIFT);					\ +    } while (0) + +/* + * x_c = (x_c * a) / 255 + y_c + */ +#define UN8x4_MUL_UN8_ADD_UN8x4(x, a, y)				\ +    do									\ +    {									\ +	uint32_t r1__, r2__, r3__, t__;					\ +									\ +	r1__ = (x);							\ +	r2__ = (y) & RB_MASK;						\ +	UN8_rb_MUL_UN8 (r1__, (a), t__);				\ +	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\ +									\ +	r2__ = (x) >> G_SHIFT;						\ +	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\ +	UN8_rb_MUL_UN8 (r2__, (a), t__);				\ +	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\ +									\ +	(x) = r1__ | (r2__ << G_SHIFT);					\ +    } while (0) + +/* + * x_c = (x_c * a + y_c * b) / 255 + */ +#define UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8(x, a, y, b)			
\ +    do									\ +    {									\ +	uint32_t r1__, r2__, r3__, t__;					\ +									\ +	r1__ = (x);							\ +	r2__ = (y);							\ +	UN8_rb_MUL_UN8 (r1__, (a), t__);				\ +	UN8_rb_MUL_UN8 (r2__, (b), t__);				\ +	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\ +									\ +	r2__ = ((x) >> G_SHIFT);					\ +	r3__ = ((y) >> G_SHIFT);					\ +	UN8_rb_MUL_UN8 (r2__, (a), t__);				\ +	UN8_rb_MUL_UN8 (r3__, (b), t__);				\ +	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\ +									\ +	(x) = r1__ | (r2__ << G_SHIFT);					\ +    } while (0) + +/* + * x_c = (x_c * a_c) / 255 + */ +#define UN8x4_MUL_UN8x4(x, a)						\ +    do									\ +    {									\ +	uint32_t r1__, r2__, r3__, t__;					\ +									\ +	r1__ = (x);							\ +	r2__ = (a);							\ +	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\ +									\ +	r2__ = (x) >> G_SHIFT;						\ +	r3__ = (a) >> G_SHIFT;						\ +	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\ +									\ +	(x) = r1__ | (r2__ << G_SHIFT);					\ +    } while (0) + +/* + * x_c = (x_c * a_c) / 255 + y_c + */ +#define UN8x4_MUL_UN8x4_ADD_UN8x4(x, a, y)				\ +    do									\ +    {									\ +	uint32_t r1__, r2__, r3__, t__;					\ +									\ +	r1__ = (x);							\ +	r2__ = (a);							\ +	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\ +	r2__ = (y) & RB_MASK;						\ +	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\ +									\ +	r2__ = ((x) >> G_SHIFT);					\ +	r3__ = ((a) >> G_SHIFT);					\ +	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\ +	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\ +	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\ +									\ +	(x) = r1__ | (r2__ << G_SHIFT);					\ +    } while (0) + +/* + * x_c = (x_c * a_c + y_c * b) / 255 + */ +#define UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8(x, a, y, b)			\ +    do									\ +    {									\ +	uint32_t r1__, r2__, r3__, t__;					\ +									\ +	r1__ = (x);							\ +	r2__ = (a);							\ +	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\ +	r2__ = (y);							\ +	UN8_rb_MUL_UN8 (r2__, (b), t__);				\ +	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\ +									\ +	r2__ = (x) >> G_SHIFT;						\ +	r3__ = (a) >> G_SHIFT;						\ +	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\ +	r3__ = (y) >> G_SHIFT;						\ +	UN8_rb_MUL_UN8 (r3__, (b), t__);				\ +	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\ +									\ +	x = r1__ | (r2__ << G_SHIFT);					\ +    } while (0) + +/* +  x_c = min(x_c + y_c, 255) +*/ +#ifndef UN8x4_ADD_UN8x4 +#define UN8x4_ADD_UN8x4(x, y)						\ +    do									\ +    {									\ +	uint32_t r1__, r2__, r3__, t__;					\ +									\ +	r1__ = (x) & RB_MASK;						\ +	r2__ = (y) & RB_MASK;						\ +	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\ +									\ +	r2__ = ((x) >> G_SHIFT) & RB_MASK;				\ +	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\ +	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\ +									\ +	x = r1__ | (r2__ << G_SHIFT);					\ +    } while (0) +#endif diff --git a/libs/pixman-0.40.0/pixman/pixman-compiler.h b/libs/pixman-0.40.0/pixman/pixman-compiler.h new file mode 100644 index 0000000..a02aa49 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-compiler.h @@ -0,0 +1,234 @@ +/* Pixman uses some non-standard compiler features. 
This file ensures + * they exist + * + * The features are: + * + *    FUNC	     must be defined to expand to the current function + *    PIXMAN_EXPORT  should be defined to whatever is required to + *                   export functions from a shared library + *    limits	     limits for various types must be defined + *    inline         must be defined + *    force_inline   must be defined + */ +#if defined (__GNUC__) +#  define FUNC     ((const char*) (__PRETTY_FUNCTION__)) +#elif defined (__sun) || (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) +#  define FUNC     ((const char*) (__func__)) +#else +#  define FUNC     ((const char*) ("???")) +#endif + +#if defined (__GNUC__) +#  define unlikely(expr) __builtin_expect ((expr), 0) +#else +#  define unlikely(expr)  (expr) +#endif + +#if defined (__GNUC__) +#  define MAYBE_UNUSED  __attribute__((unused)) +#else +#  define MAYBE_UNUSED +#endif + +#ifndef INT16_MIN +# define INT16_MIN              (-32767-1) +#endif + +#ifndef INT16_MAX +# define INT16_MAX              (32767) +#endif + +#ifndef INT32_MIN +# define INT32_MIN              (-2147483647-1) +#endif + +#ifndef INT32_MAX +# define INT32_MAX              (2147483647) +#endif + +#ifndef UINT32_MIN +# define UINT32_MIN             (0) +#endif + +#ifndef UINT32_MAX +# define UINT32_MAX             (4294967295U) +#endif + +#ifndef INT64_MIN +# define INT64_MIN              (-9223372036854775807-1) +#endif + +#ifndef INT64_MAX +# define INT64_MAX              (9223372036854775807) +#endif + +#ifndef SIZE_MAX +# define SIZE_MAX               ((size_t)-1) +#endif + + +#ifndef M_PI +# define M_PI			3.14159265358979323846 +#endif + +#ifdef _MSC_VER +/* 'inline' is available only in C++ in MSVC */ +#   define inline __inline +#   define force_inline __forceinline +#   define noinline __declspec(noinline) +#elif defined __GNUC__ || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)) +#   define inline __inline__ +#   define force_inline __inline__ __attribute__ ((__always_inline__)) +#   define noinline __attribute__((noinline)) +#else +#   ifndef force_inline +#      define force_inline inline +#   endif +#   ifndef noinline +#      define noinline +#   endif +#endif + +/* GCC visibility */ +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32) +#   define PIXMAN_EXPORT __attribute__ ((visibility("default"))) +/* Sun Studio 8 visibility */ +#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550) +#   define PIXMAN_EXPORT __global +#elif defined (_MSC_VER) || defined(__MINGW32__) +#   define PIXMAN_EXPORT PIXMAN_API +#else +#   define PIXMAN_EXPORT +#endif + +/* member offsets */ +#define CONTAINER_OF(type, member, data)				\ +    ((type *)(((uint8_t *)data) - offsetof (type, member))) + +/* TLS */ +#if defined(PIXMAN_NO_TLS) + +#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\ +    static type name +#   define PIXMAN_GET_THREAD_LOCAL(name)				\ +    (&name) + +#elif defined(TLS) + +#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\ +    static TLS type name +#   define PIXMAN_GET_THREAD_LOCAL(name)				\ +    (&name) + +#elif defined(__MINGW32__) + +#   define _NO_W32_PSEUDO_MODIFIERS +#   include <windows.h> + +#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\ +    static volatile int tls_ ## name ## _initialized = 0;		\ +    static void *tls_ ## name ## _mutex = NULL;				\ +    static unsigned tls_ ## name ## _index;				\ +									\ +    static type *							\ +    tls_ ## name ## _alloc (void)					\ +    {									\ +        type *value = calloc (1, sizeof (type));			\ +   
     if (value)							\ +            TlsSetValue (tls_ ## name ## _index, value);		\ +        return value;							\ +    }									\ +									\ +    static force_inline type *						\ +    tls_ ## name ## _get (void)						\ +    {									\ +	type *value;							\ +	if (!tls_ ## name ## _initialized)				\ +	{								\ +	    if (!tls_ ## name ## _mutex)				\ +	    {								\ +		void *mutex = CreateMutexA (NULL, 0, NULL);		\ +		if (InterlockedCompareExchangePointer (			\ +			&tls_ ## name ## _mutex, mutex, NULL) != NULL)	\ +		{							\ +		    CloseHandle (mutex);				\ +		}							\ +	    }								\ +	    WaitForSingleObject (tls_ ## name ## _mutex, 0xFFFFFFFF);	\ +	    if (!tls_ ## name ## _initialized)				\ +	    {								\ +		tls_ ## name ## _index = TlsAlloc ();			\ +		tls_ ## name ## _initialized = 1;			\ +	    }								\ +	    ReleaseMutex (tls_ ## name ## _mutex);			\ +	}								\ +	if (tls_ ## name ## _index == 0xFFFFFFFF)			\ +	    return NULL;						\ +	value = TlsGetValue (tls_ ## name ## _index);			\ +	if (!value)							\ +	    value = tls_ ## name ## _alloc ();				\ +	return value;							\ +    } + +#   define PIXMAN_GET_THREAD_LOCAL(name)				\ +    tls_ ## name ## _get () + +#elif defined(_MSC_VER) + +#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\ +    static __declspec(thread) type name +#   define PIXMAN_GET_THREAD_LOCAL(name)				\ +    (&name) + +#elif defined(HAVE_PTHREADS) + +#include <pthread.h> + +#  define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\ +    static pthread_once_t tls_ ## name ## _once_control = PTHREAD_ONCE_INIT; \ +    static pthread_key_t tls_ ## name ## _key;				\ +									\ +    static void								\ +    tls_ ## name ## _destroy_value (void *value)			\ +    {									\ +	free (value);							\ +    }									\ +									\ +    static void								\ +    tls_ ## name ## _make_key (void)					\ +    {									\ +	pthread_key_create (&tls_ ## name ## _key,			\ +			    tls_ ## name ## _destroy_value);		\ +    }									\ +									\ +    static type *							\ +    tls_ ## name ## _alloc (void)					\ +    {									\ +	type *value = calloc (1, sizeof (type));			\ +	if (value)							\ +	    pthread_setspecific (tls_ ## name ## _key, value);		\ +	return value;							\ +    }									\ +									\ +    static force_inline type *						\ +    tls_ ## name ## _get (void)						\ +    {									\ +	type *value = NULL;						\ +	if (pthread_once (&tls_ ## name ## _once_control,		\ +			  tls_ ## name ## _make_key) == 0)		\ +	{								\ +	    value = pthread_getspecific (tls_ ## name ## _key);		\ +	    if (!value)							\ +		value = tls_ ## name ## _alloc ();			\ +	}								\ +	return value;							\ +    } + +#   define PIXMAN_GET_THREAD_LOCAL(name)				\ +    tls_ ## name ## _get () + +#else + +#    error "Unknown thread local support for this system. Pixman will not work with multiple threads. Define PIXMAN_NO_TLS to acknowledge and accept this limitation and compile pixman without thread-safety support." + +#endif diff --git a/libs/pixman-0.40.0/pixman/pixman-conical-gradient.c b/libs/pixman-0.40.0/pixman/pixman-conical-gradient.c new file mode 100644 index 0000000..a39e20c --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-conical-gradient.c @@ -0,0 +1,220 @@ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. 
+ *             2005 Lars Knoll & Zack Rusin, Trolltech + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdlib.h> +#include <math.h> +#include "pixman-private.h" + +static force_inline double +coordinates_to_parameter (double x, double y, double angle) +{ +    double t; + +    t = atan2 (y, x) + angle; + +    while (t < 0) +	t += 2 * M_PI; + +    while (t >= 2 * M_PI) +	t -= 2 * M_PI; + +    return 1 - t * (1 / (2 * M_PI)); /* Scale t to [0, 1] and +				      * make rotation CCW +				      */ +} + +static uint32_t * +conical_get_scanline (pixman_iter_t                 *iter, +		      const uint32_t                *mask, +		      int                            Bpp, +		      pixman_gradient_walker_write_t write_pixel) +{ +    pixman_image_t *image = iter->image; +    int x = iter->x; +    int y = iter->y; +    int width = iter->width; +    uint32_t *buffer = iter->buffer; + +    gradient_t *gradient = (gradient_t *)image; +    conical_gradient_t *conical = (conical_gradient_t *)image; +    uint32_t       *end = buffer + width * (Bpp / 4); +    pixman_gradient_walker_t walker; +    pixman_bool_t affine = TRUE; +    double cx = 1.; +    double cy = 0.; +    double cz = 0.; +    double rx = x + 0.5; +    double ry = y + 0.5; +    double rz = 1.; + +    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); + +    if (image->common.transform) +    { +	pixman_vector_t v; + +	/* reference point is the center of the pixel */ +	v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; +	v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; +	v.vector[2] = pixman_fixed_1; + +	if (!pixman_transform_point_3d (image->common.transform, &v)) +	    return iter->buffer; + +	cx = image->common.transform->matrix[0][0] / 65536.; +	cy = image->common.transform->matrix[1][0] / 65536.; +	cz = image->common.transform->matrix[2][0] / 65536.; + +	rx = v.vector[0] / 65536.; +	ry = v.vector[1] / 65536.; +	rz = v.vector[2] / 65536.; + +	affine = +	    image->common.transform->matrix[2][0] == 0 && +	    v.vector[2] == pixman_fixed_1; +    } + +    if (affine) +    { +	rx -= conical->center.x / 65536.; +	ry -= conical->center.y / 65536.; + +	while (buffer < end) +	{ +	    if (!mask || *mask++) +	    { +		double t = coordinates_to_parameter (rx, ry, conical->angle); + +		write_pixel (&walker, +			     
(pixman_fixed_48_16_t)pixman_double_to_fixed (t), +			     buffer); +	    } + +	    buffer += (Bpp / 4); + +	    rx += cx; +	    ry += cy; +	} +    } +    else +    { +	while (buffer < end) +	{ +	    double x, y; + +	    if (!mask || *mask++) +	    { +		double t; + +		if (rz != 0) +		{ +		    x = rx / rz; +		    y = ry / rz; +		} +		else +		{ +		    x = y = 0.; +		} + +		x -= conical->center.x / 65536.; +		y -= conical->center.y / 65536.; + +		t = coordinates_to_parameter (x, y, conical->angle); + +		write_pixel (&walker, +			     (pixman_fixed_48_16_t)pixman_double_to_fixed (t), +			     buffer); +	    } + +	    buffer += (Bpp / 4); + +	    rx += cx; +	    ry += cy; +	    rz += cz; +	} +    } + +    iter->y++; +    return iter->buffer; +} + +static uint32_t * +conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) +{ +    return conical_get_scanline (iter, mask, 4, +				 _pixman_gradient_walker_write_narrow); +} + +static uint32_t * +conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) +{ +    return conical_get_scanline (iter, NULL, 16, +				 _pixman_gradient_walker_write_wide); +} + +void +_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ +    if (iter->iter_flags & ITER_NARROW) +	iter->get_scanline = conical_get_scanline_narrow; +    else +	iter->get_scanline = conical_get_scanline_wide; +} + +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_conical_gradient (const pixman_point_fixed_t *  center, +                                      pixman_fixed_t                angle, +                                      const pixman_gradient_stop_t *stops, +                                      int                           n_stops) +{ +    pixman_image_t *image = _pixman_image_allocate (); +    conical_gradient_t *conical; + +    if (!image) +	return NULL; + +    conical = &image->conical; + +    if (!_pixman_init_gradient (&conical->common, stops, n_stops)) +    { +	free (image); +	return NULL; +    } + +    angle = MOD (angle, pixman_int_to_fixed (360)); + +    image->type = CONICAL; + +    conical->center = *center; +    conical->angle = (pixman_fixed_to_double (angle) / 180.0) * M_PI; + +    return image; +} + diff --git a/libs/pixman-0.40.0/pixman/pixman-edge-accessors.c b/libs/pixman-0.40.0/pixman/pixman-edge-accessors.c new file mode 100644 index 0000000..ea3a31e --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-edge-accessors.c @@ -0,0 +1,4 @@ + +#define PIXMAN_FB_ACCESSORS + +#include "pixman-edge.c" diff --git a/libs/pixman-0.40.0/pixman/pixman-edge-imp.h b/libs/pixman-0.40.0/pixman/pixman-edge-imp.h new file mode 100644 index 0000000..a4698ed --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-edge-imp.h @@ -0,0 +1,182 @@ +/* + * Copyright © 2004 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. 
+ * + * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef rasterize_span +#endif + +static void +RASTERIZE_EDGES (pixman_image_t  *image, +		pixman_edge_t	*l, +		pixman_edge_t	*r, +		pixman_fixed_t		t, +		pixman_fixed_t		b) +{ +    pixman_fixed_t  y = t; +    uint32_t  *line; +    uint32_t *buf = (image)->bits.bits; +    int stride = (image)->bits.rowstride; +    int width = (image)->bits.width; + +    line = buf + pixman_fixed_to_int (y) * stride; + +    for (;;) +    { +	pixman_fixed_t	lx; +	pixman_fixed_t      rx; +	int	lxi; +	int rxi; + +	lx = l->x; +	rx = r->x; +#if N_BITS == 1 +	/* For the non-antialiased case, round the coordinates up, in effect +	 * sampling just slightly to the left of the pixel. This is so that +	 * when the sample point lies exactly on the line, we round towards +	 * north-west. +	 * +	 * (The AA case does a similar  adjustment in RENDER_SAMPLES_X) +	 */ +	lx += X_FRAC_FIRST(1) - pixman_fixed_e; +	rx += X_FRAC_FIRST(1) - pixman_fixed_e; +#endif +	/* clip X */ +	if (lx < 0) +	    lx = 0; +	if (pixman_fixed_to_int (rx) >= width) +#if N_BITS == 1 +	    rx = pixman_int_to_fixed (width); +#else +	    /* Use the last pixel of the scanline, covered 100%. +	     * We can't use the first pixel following the scanline, +	     * because accessing it could result in a buffer overrun. +	     */ +	    rx = pixman_int_to_fixed (width) - 1; +#endif + +	/* Skip empty (or backwards) sections */ +	if (rx > lx) +	{ + +	    /* Find pixel bounds for span */ +	    lxi = pixman_fixed_to_int (lx); +	    rxi = pixman_fixed_to_int (rx); + +#if N_BITS == 1 +	    { + +#define LEFT_MASK(x)							\ +		(((x) & 0x1f) ?						\ +		 SCREEN_SHIFT_RIGHT (0xffffffff, (x) & 0x1f) : 0) +#define RIGHT_MASK(x)							\ +		(((32 - (x)) & 0x1f) ?					
\ +		 SCREEN_SHIFT_LEFT (0xffffffff, (32 - (x)) & 0x1f) : 0) +		 +#define MASK_BITS(x,w,l,n,r) {						\ +		    n = (w);						\ +		    r = RIGHT_MASK ((x) + n);				\ +		    l = LEFT_MASK (x);					\ +		    if (l) {						\ +			n -= 32 - ((x) & 0x1f);				\ +			if (n < 0) {					\ +			    n = 0;					\ +			    l &= r;					\ +			    r = 0;					\ +			}						\ +		    }							\ +		    n >>= 5;						\ +		} +		 +		uint32_t  *a = line; +		uint32_t  startmask; +		uint32_t  endmask; +		int	    nmiddle; +		int	    width = rxi - lxi; +		int	    x = lxi; +		 +		a += x >> 5; +		x &= 0x1f; +		 +		MASK_BITS (x, width, startmask, nmiddle, endmask); + +		if (startmask) { +		    WRITE(image, a, READ(image, a) | startmask); +		    a++; +		} +		while (nmiddle--) +		    WRITE(image, a++, 0xffffffff); +		if (endmask) +		    WRITE(image, a, READ(image, a) | endmask); +	    } +#else +	    { +		DEFINE_ALPHA(line,lxi); +		int	    lxs; +		int     rxs; + +		/* Sample coverage for edge pixels */ +		lxs = RENDER_SAMPLES_X (lx, N_BITS); +		rxs = RENDER_SAMPLES_X (rx, N_BITS); + +		/* Add coverage across row */ +		if (lxi == rxi) +		{ +		    ADD_ALPHA (rxs - lxs); +		} +		else +		{ +		    int	xi; + +		    ADD_ALPHA (N_X_FRAC(N_BITS) - lxs); +		    STEP_ALPHA; +		    for (xi = lxi + 1; xi < rxi; xi++) +		    { +			ADD_ALPHA (N_X_FRAC(N_BITS)); +			STEP_ALPHA; +		    } +		    ADD_ALPHA (rxs); +		} +	    } +#endif +	} + +	if (y == b) +	    break; + +#if N_BITS > 1 +	if (pixman_fixed_frac (y) != Y_FRAC_LAST(N_BITS)) +	{ +	    RENDER_EDGE_STEP_SMALL (l); +	    RENDER_EDGE_STEP_SMALL (r); +	    y += STEP_Y_SMALL(N_BITS); +	} +	else +#endif +	{ +	    RENDER_EDGE_STEP_BIG (l); +	    RENDER_EDGE_STEP_BIG (r); +	    y += STEP_Y_BIG(N_BITS); +	    line += stride; +	} +    } +} + +#undef rasterize_span diff --git a/libs/pixman-0.40.0/pixman/pixman-edge.c b/libs/pixman-0.40.0/pixman/pixman-edge.c new file mode 100644 index 0000000..ad6dfc4 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-edge.c @@ -0,0 +1,385 @@ +/* + * Copyright © 2004 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <string.h> + +#include "pixman-private.h" +#include "pixman-accessor.h" + +/* + * Step across a small sample grid gap + */ +#define RENDER_EDGE_STEP_SMALL(edge)					\ +    {									\ +	edge->x += edge->stepx_small;					\ +	edge->e += edge->dx_small;					\ +	if (edge->e > 0)						\ +	{								\ +	    edge->e -= edge->dy;					\ +	    edge->x += edge->signdx;					\ +	}								\ +    } + +/* + * Step across a large sample grid gap + */ +#define RENDER_EDGE_STEP_BIG(edge)					\ +    {									\ +	edge->x += edge->stepx_big;					\ +	edge->e += edge->dx_big;					\ +	if (edge->e > 0)						\ +	{								\ +	    edge->e -= edge->dy;					\ +	    edge->x += edge->signdx;					\ +	}								\ +    } + +#ifdef PIXMAN_FB_ACCESSORS +#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_accessors +#else +#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_no_accessors +#endif + +/* + * 4 bit alpha + */ + +#define N_BITS  4 +#define RASTERIZE_EDGES rasterize_edges_4 + +#ifndef WORDS_BIGENDIAN +#define SHIFT_4(o)      ((o) << 2) +#else +#define SHIFT_4(o)      ((1 - (o)) << 2) +#endif + +#define GET_4(x, o)      (((x) >> SHIFT_4 (o)) & 0xf) +#define PUT_4(x, o, v)							\ +    (((x) & ~(0xf << SHIFT_4 (o))) | (((v) & 0xf) << SHIFT_4 (o))) + +#define DEFINE_ALPHA(line, x)						\ +    uint8_t   *__ap = (uint8_t *) line + ((x) >> 1);			\ +    int __ao = (x) & 1 + +#define STEP_ALPHA      ((__ap += __ao), (__ao ^= 1)) + +#define ADD_ALPHA(a)							\ +    {									\ +        uint8_t __o = READ (image, __ap);				\ +        uint8_t __a = (a) + GET_4 (__o, __ao);				\ +        WRITE (image, __ap, PUT_4 (__o, __ao, __a | (0 - ((__a) >> 4)))); \ +    } + +#include "pixman-edge-imp.h" + +#undef ADD_ALPHA +#undef STEP_ALPHA +#undef DEFINE_ALPHA +#undef RASTERIZE_EDGES +#undef N_BITS + + +/* + * 1 bit alpha + */ + +#define N_BITS 1 +#define RASTERIZE_EDGES rasterize_edges_1 + +#include "pixman-edge-imp.h" + +#undef RASTERIZE_EDGES +#undef N_BITS + +/* + * 8 bit alpha + */ + +static force_inline uint8_t +clip255 (int x) +{ +    if (x > 255) +	return 255; + +    return x; +} + +#define ADD_SATURATE_8(buf, val, length)				\ +    do									\ +    {									\ +        int i__ = (length);						\ +        uint8_t *buf__ = (buf);						\ +        int val__ = (val);						\ +									\ +        while (i__--)							\ +        {								\ +            WRITE (image, (buf__), clip255 (READ (image, (buf__)) + (val__))); \ +            (buf__)++;							\ +	}								\ +    } while (0) + +/* + * We want to detect the case where we add the same value to a long + * span of pixels.  The triangles on the end are filled in while we + * count how many sub-pixel scanlines contribute to the middle section. 
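+ * Once all N_Y_FRAC (8) sub-pixel scanlines have contributed, the
+ * middle section is known to be fully covered, so it can be written
+ * with a single MEMSET_WRAPPED of 0xff instead of a saturating add
+ * per pixel for every sub-scanline.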
+ * + *                 +--------------------------+ + *  fill_height =|   \                      / + *                     +------------------+ + *                      |================| + *                   fill_start       fill_end + */ +static void +rasterize_edges_8 (pixman_image_t *image, +                   pixman_edge_t * l, +                   pixman_edge_t * r, +                   pixman_fixed_t  t, +                   pixman_fixed_t  b) +{ +    pixman_fixed_t y = t; +    uint32_t  *line; +    int fill_start = -1, fill_end = -1; +    int fill_size = 0; +    uint32_t *buf = (image)->bits.bits; +    int stride = (image)->bits.rowstride; +    int width = (image)->bits.width; + +    line = buf + pixman_fixed_to_int (y) * stride; + +    for (;;) +    { +        uint8_t *ap = (uint8_t *) line; +        pixman_fixed_t lx, rx; +        int lxi, rxi; + +        /* clip X */ +        lx = l->x; +        if (lx < 0) +	    lx = 0; + +        rx = r->x; + +        if (pixman_fixed_to_int (rx) >= width) +	{ +	    /* Use the last pixel of the scanline, covered 100%. +	     * We can't use the first pixel following the scanline, +	     * because accessing it could result in a buffer overrun. +	     */ +	    rx = pixman_int_to_fixed (width) - 1; +	} + +        /* Skip empty (or backwards) sections */ +        if (rx > lx) +        { +            int lxs, rxs; + +            /* Find pixel bounds for span. */ +            lxi = pixman_fixed_to_int (lx); +            rxi = pixman_fixed_to_int (rx); + +            /* Sample coverage for edge pixels */ +            lxs = RENDER_SAMPLES_X (lx, 8); +            rxs = RENDER_SAMPLES_X (rx, 8); + +            /* Add coverage across row */ +            if (lxi == rxi) +            { +                WRITE (image, ap + lxi, +		       clip255 (READ (image, ap + lxi) + rxs - lxs)); +	    } +            else +            { +                WRITE (image, ap + lxi, +		       clip255 (READ (image, ap + lxi) + N_X_FRAC (8) - lxs)); + +                /* Move forward so that lxi/rxi is the pixel span */ +                lxi++; + +                /* Don't bother trying to optimize the fill unless +		 * the span is longer than 4 pixels. 
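+		 * For spans that short, the bookkeeping for the
+		 * deferred fill would presumably cost more than the
+		 * few saturating adds it avoids.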
*/ +                if (rxi - lxi > 4) +                { +                    if (fill_start < 0) +                    { +                        fill_start = lxi; +                        fill_end = rxi; +                        fill_size++; +		    } +                    else +                    { +                        if (lxi >= fill_end || rxi < fill_start) +                        { +                            /* We're beyond what we saved, just fill it */ +                            ADD_SATURATE_8 (ap + fill_start, +                                            fill_size * N_X_FRAC (8), +                                            fill_end - fill_start); +                            fill_start = lxi; +                            fill_end = rxi; +                            fill_size = 1; +			} +                        else +                        { +                            /* Update fill_start */ +                            if (lxi > fill_start) +                            { +                                ADD_SATURATE_8 (ap + fill_start, +                                                fill_size * N_X_FRAC (8), +                                                lxi - fill_start); +                                fill_start = lxi; +			    } +                            else if (lxi < fill_start) +                            { +                                ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8), +                                                fill_start - lxi); +			    } + +                            /* Update fill_end */ +                            if (rxi < fill_end) +                            { +                                ADD_SATURATE_8 (ap + rxi, +                                                fill_size * N_X_FRAC (8), +                                                fill_end - rxi); +                                fill_end = rxi; +			    } +                            else if (fill_end < rxi) +                            { +                                ADD_SATURATE_8 (ap + fill_end, +                                                N_X_FRAC (8), +                                                rxi - fill_end); +			    } +                            fill_size++; +			} +		    } +		} +                else +                { +                    ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8), rxi - lxi); +		} + +                WRITE (image, ap + rxi, clip255 (READ (image, ap + rxi) + rxs)); +	    } +	} + +        if (y == b) +        { +            /* We're done, make sure we clean up any remaining fill. 
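+             * The same flush also happens below each time the walk
+             * steps onto a new destination row; this one just drains
+             * whatever was still pending when y reached b.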
*/ +            if (fill_start != fill_end) +            { +                if (fill_size == N_Y_FRAC (8)) +                { +                    MEMSET_WRAPPED (image, ap + fill_start, +				    0xff, fill_end - fill_start); +		} +                else +                { +                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8), +                                    fill_end - fill_start); +		} +	    } +            break; +	} + +        if (pixman_fixed_frac (y) != Y_FRAC_LAST (8)) +        { +            RENDER_EDGE_STEP_SMALL (l); +            RENDER_EDGE_STEP_SMALL (r); +            y += STEP_Y_SMALL (8); +	} +        else +        { +            RENDER_EDGE_STEP_BIG (l); +            RENDER_EDGE_STEP_BIG (r); +            y += STEP_Y_BIG (8); +            if (fill_start != fill_end) +            { +                if (fill_size == N_Y_FRAC (8)) +                { +                    MEMSET_WRAPPED (image, ap + fill_start, +				    0xff, fill_end - fill_start); +		} +                else +                { +                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8), +                                    fill_end - fill_start); +		} +		 +                fill_start = fill_end = -1; +                fill_size = 0; +	    } +	     +            line += stride; +	} +    } +} + +#ifndef PIXMAN_FB_ACCESSORS +static +#endif +void +PIXMAN_RASTERIZE_EDGES (pixman_image_t *image, +                        pixman_edge_t * l, +                        pixman_edge_t * r, +                        pixman_fixed_t  t, +                        pixman_fixed_t  b) +{ +    switch (PIXMAN_FORMAT_BPP (image->bits.format)) +    { +    case 1: +	rasterize_edges_1 (image, l, r, t, b); +	break; + +    case 4: +	rasterize_edges_4 (image, l, r, t, b); +	break; + +    case 8: +	rasterize_edges_8 (image, l, r, t, b); +	break; + +    default: +        break; +    } +} + +#ifndef PIXMAN_FB_ACCESSORS + +PIXMAN_EXPORT void +pixman_rasterize_edges (pixman_image_t *image, +                        pixman_edge_t * l, +                        pixman_edge_t * r, +                        pixman_fixed_t  t, +                        pixman_fixed_t  b) +{ +    return_if_fail (image->type == BITS); +    return_if_fail (PIXMAN_FORMAT_TYPE (image->bits.format) == PIXMAN_TYPE_A); +     +    if (image->bits.read_func || image->bits.write_func) +	pixman_rasterize_edges_accessors (image, l, r, t, b); +    else +	pixman_rasterize_edges_no_accessors (image, l, r, t, b); +} + +#endif diff --git a/libs/pixman-0.40.0/pixman/pixman-fast-path.c b/libs/pixman-0.40.0/pixman/pixman-fast-path.c new file mode 100644 index 0000000..4b7a6f8 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-fast-path.c @@ -0,0 +1,3294 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  SuSE makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. 
+ * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author:  Keith Packard, SuSE, Inc. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <string.h> +#include <stdlib.h> +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-inlines.h" + +static force_inline uint32_t +fetch_24 (uint8_t *a) +{ +    if (((uintptr_t)a) & 1) +    { +#ifdef WORDS_BIGENDIAN +	return (*a << 16) | (*(uint16_t *)(a + 1)); +#else +	return *a | (*(uint16_t *)(a + 1) << 8); +#endif +    } +    else +    { +#ifdef WORDS_BIGENDIAN +	return (*(uint16_t *)a << 8) | *(a + 2); +#else +	return *(uint16_t *)a | (*(a + 2) << 16); +#endif +    } +} + +static force_inline void +store_24 (uint8_t *a, +          uint32_t v) +{ +    if (((uintptr_t)a) & 1) +    { +#ifdef WORDS_BIGENDIAN +	*a = (uint8_t) (v >> 16); +	*(uint16_t *)(a + 1) = (uint16_t) (v); +#else +	*a = (uint8_t) (v); +	*(uint16_t *)(a + 1) = (uint16_t) (v >> 8); +#endif +    } +    else +    { +#ifdef WORDS_BIGENDIAN +	*(uint16_t *)a = (uint16_t)(v >> 8); +	*(a + 2) = (uint8_t)v; +#else +	*(uint16_t *)a = (uint16_t)v; +	*(a + 2) = (uint8_t)(v >> 16); +#endif +    } +} + +static force_inline uint32_t +over (uint32_t src, +      uint32_t dest) +{ +    uint32_t a = ~src >> 24; + +    UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src); + +    return dest; +} + +static force_inline uint32_t +in (uint32_t x, +    uint8_t  y) +{ +    uint16_t a = y; + +    UN8x4_MUL_UN8 (x, a); + +    return x; +} + +/* + * Naming convention: + * + *  op_src_mask_dest + */ +static void +fast_composite_over_x888_8_8888 (pixman_implementation_t *imp, +                                 pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *src, *src_line; +    uint32_t    *dst, *dst_line; +    uint8_t     *mask, *mask_line; +    int src_stride, mask_stride, dst_stride; +    uint8_t m; +    uint32_t s, d; +    int32_t w; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	src = src_line; +	src_line += src_stride; +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; + +	w = width; +	while (w--) +	{ +	    m = *mask++; +	    if (m) +	    { +		s = *src | 0xff000000; + +		if (m == 0xff) +		{ +		    *dst = s; +		} +		else +		{ +		    d = in (s, m); +		    *dst = over (d, *dst); +		} +	    } +	    src++; +	    dst++; +	} +    } +} + +static void +fast_composite_in_n_8_8 (pixman_implementation_t *imp, +                         pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca; +    uint8_t     *dst_line, *dst; +    uint8_t     *mask_line, *mask, m; +    int dst_stride, mask_stride; +    int32_t w; +    uint16_t t; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 
1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    if (srca == 0xff) +    { +	while (height--) +	{ +	    dst = dst_line; +	    dst_line += dst_stride; +	    mask = mask_line; +	    mask_line += mask_stride; +	    w = width; + +	    while (w--) +	    { +		m = *mask++; + +		if (m == 0) +		    *dst = 0; +		else if (m != 0xff) +		    *dst = MUL_UN8 (m, *dst, t); + +		dst++; +	    } +	} +    } +    else +    { +	while (height--) +	{ +	    dst = dst_line; +	    dst_line += dst_stride; +	    mask = mask_line; +	    mask_line += mask_stride; +	    w = width; + +	    while (w--) +	    { +		m = *mask++; +		m = MUL_UN8 (m, srca, t); + +		if (m == 0) +		    *dst = 0; +		else if (m != 0xff) +		    *dst = MUL_UN8 (m, *dst, t); + +		dst++; +	    } +	} +    } +} + +static void +fast_composite_in_8_8 (pixman_implementation_t *imp, +                       pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint8_t     *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; +    uint8_t s; +    uint16_t t; + +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w--) +	{ +	    s = *src++; + +	    if (s == 0) +		*dst = 0; +	    else if (s != 0xff) +		*dst = MUL_UN8 (s, *dst, t); + +	    dst++; +	} +    } +} + +static void +fast_composite_over_n_8_8888 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca; +    uint32_t    *dst_line, *dst, d; +    uint8_t     *mask_line, *mask, m; +    int dst_stride, mask_stride; +    int32_t w; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w--) +	{ +	    m = *mask++; +	    if (m == 0xff) +	    { +		if (srca == 0xff) +		    *dst = src; +		else +		    *dst = over (src, *dst); +	    } +	    else if (m) +	    { +		d = in (src, m); +		*dst = over (d, *dst); +	    } +	    dst++; +	} +    } +} + +static void +fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, +				   pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, s; +    uint32_t    *dst_line, *dst, d; +    uint32_t    *mask_line, *mask, ma; +    int dst_stride, mask_stride; +    int32_t w; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w--) +	{ +	    ma = *mask++; + +	    if (ma) +	    { +		d = *dst; +		s = src; + +		UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d); + +		*dst = s; +	    } + +	    dst++; +	
} +    } +} + +static void +fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, +                                    pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca, s; +    uint32_t    *dst_line, *dst, d; +    uint32_t    *mask_line, *mask, ma; +    int dst_stride, mask_stride; +    int32_t w; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w--) +	{ +	    ma = *mask++; +	    if (ma == 0xffffffff) +	    { +		if (srca == 0xff) +		    *dst = src; +		else +		    *dst = over (src, *dst); +	    } +	    else if (ma) +	    { +		d = *dst; +		s = src; + +		UN8x4_MUL_UN8x4 (s, ma); +		UN8x4_MUL_UN8 (ma, srca); +		ma = ~ma; +		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + +		*dst = d; +	    } + +	    dst++; +	} +    } +} + +static void +fast_composite_over_n_8_0888 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca; +    uint8_t     *dst_line, *dst; +    uint32_t d; +    uint8_t     *mask_line, *mask, m; +    int dst_stride, mask_stride; +    int32_t w; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w--) +	{ +	    m = *mask++; +	    if (m == 0xff) +	    { +		if (srca == 0xff) +		{ +		    d = src; +		} +		else +		{ +		    d = fetch_24 (dst); +		    d = over (src, d); +		} +		store_24 (dst, d); +	    } +	    else if (m) +	    { +		d = over (in (src, m), fetch_24 (dst)); +		store_24 (dst, d); +	    } +	    dst += 3; +	} +    } +} + +static void +fast_composite_over_n_8_0565 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca; +    uint16_t    *dst_line, *dst; +    uint32_t d; +    uint8_t     *mask_line, *mask, m; +    int dst_stride, mask_stride; +    int32_t w; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w--) +	{ +	    m = *mask++; +	    if (m == 0xff) +	    { +		if (srca == 0xff) +		{ +		    d = src; +		} +		else +		{ +		    d = *dst; +		    d = over (src, convert_0565_to_0888 (d)); +		} +		*dst = convert_8888_to_0565 (d); +	    } +	    else if (m) +	    { +		d = *dst; +		d = over (in (src, m), convert_0565_to_0888 (d)); +		*dst = convert_8888_to_0565 (d); +	    } +	    dst++; +	} +    } +} + +static void 
+fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, +                                    pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t  src, srca, s; +    uint16_t  src16; +    uint16_t *dst_line, *dst; +    uint32_t  d; +    uint32_t *mask_line, *mask, ma; +    int dst_stride, mask_stride; +    int32_t w; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; +    if (src == 0) +	return; + +    src16 = convert_8888_to_0565 (src); + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w--) +	{ +	    ma = *mask++; +	    if (ma == 0xffffffff) +	    { +		if (srca == 0xff) +		{ +		    *dst = src16; +		} +		else +		{ +		    d = *dst; +		    d = over (src, convert_0565_to_0888 (d)); +		    *dst = convert_8888_to_0565 (d); +		} +	    } +	    else if (ma) +	    { +		d = *dst; +		d = convert_0565_to_0888 (d); + +		s = src; + +		UN8x4_MUL_UN8x4 (s, ma); +		UN8x4_MUL_UN8 (ma, srca); +		ma = ~ma; +		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + +		*dst = convert_8888_to_0565 (d); +	    } +	    dst++; +	} +    } +} + +static void +fast_composite_over_8888_8888 (pixman_implementation_t *imp, +                               pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src, s; +    int dst_stride, src_stride; +    uint8_t a; +    int32_t w; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w--) +	{ +	    s = *src++; +	    a = s >> 24; +	    if (a == 0xff) +		*dst = s; +	    else if (s) +		*dst = over (s, *dst); +	    dst++; +	} +    } +} + +static void +fast_composite_src_x888_8888 (pixman_implementation_t *imp, +			      pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w--) +	    *dst++ = (*src++) | 0xff000000; +    } +} + +#if 0 +static void +fast_composite_over_8888_0888 (pixman_implementation_t *imp, +			       pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint32_t d; +    uint32_t    *src_line, *src, s; +    uint8_t a; +    int dst_stride, src_stride; +    int32_t w; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w--) +	{ +	    s = *src++; +	    a = s >> 24; +	    if (a) +	    { +		if (a 
== 0xff) +		    d = s; +		else +		    d = over (s, fetch_24 (dst)); + +		store_24 (dst, d); +	    } +	    dst += 3; +	} +    } +} +#endif + +static void +fast_composite_over_8888_0565 (pixman_implementation_t *imp, +                               pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint16_t    *dst_line, *dst; +    uint32_t d; +    uint32_t    *src_line, *src, s; +    uint8_t a; +    int dst_stride, src_stride; +    int32_t w; + +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w--) +	{ +	    s = *src++; +	    a = s >> 24; +	    if (s) +	    { +		if (a == 0xff) +		{ +		    d = s; +		} +		else +		{ +		    d = *dst; +		    d = over (s, convert_0565_to_0888 (d)); +		} +		*dst = convert_8888_to_0565 (d); +	    } +	    dst++; +	} +    } +} + +static void +fast_composite_add_8_8 (pixman_implementation_t *imp, +			pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint8_t     *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; +    uint8_t s, d; +    uint16_t t; + +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w--) +	{ +	    s = *src++; +	    if (s) +	    { +		if (s != 0xff) +		{ +		    d = *dst; +		    t = d + s; +		    s = t | (0 - (t >> 8)); +		} +		*dst = s; +	    } +	    dst++; +	} +    } +} + +static void +fast_composite_add_0565_0565 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint16_t    *dst_line, *dst; +    uint32_t	d; +    uint16_t    *src_line, *src; +    uint32_t	s; +    int dst_stride, src_stride; +    int32_t w; + +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w--) +	{ +	    s = *src++; +	    if (s) +	    { +		d = *dst; +		s = convert_0565_to_8888 (s); +		if (d) +		{ +		    d = convert_0565_to_8888 (d); +		    UN8x4_ADD_UN8x4 (s, d); +		} +		*dst = convert_8888_to_0565 (s); +	    } +	    dst++; +	} +    } +} + +static void +fast_composite_add_8888_8888 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; +    uint32_t s, d; + +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w--) +	{ +	    s = *src++; +	    if (s) +	    { +		if (s != 0xffffffff) +		{ +		    d = *dst; +		    if (d) +			UN8x4_ADD_UN8x4 (s, d); +		} +	
	*dst = s; +	    } +	    dst++; +	} +    } +} + +static void +fast_composite_add_n_8_8 (pixman_implementation_t *imp, +			  pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint8_t     *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    uint32_t src; +    uint8_t sa; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); +    sa = (src >> 24); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w--) +	{ +	    uint16_t tmp; +	    uint16_t a; +	    uint32_t m, d; +	    uint32_t r; + +	    a = *mask++; +	    d = *dst; + +	    m = MUL_UN8 (sa, a, tmp); +	    r = ADD_UN8 (m, d, tmp); + +	    *dst++ = r; +	} +    } +} + +#ifdef WORDS_BIGENDIAN +#define CREATE_BITMASK(n) (0x80000000 >> (n)) +#define UPDATE_BITMASK(n) ((n) >> 1) +#else +#define CREATE_BITMASK(n) (1U << (n)) +#define UPDATE_BITMASK(n) ((n) << 1) +#endif + +#define TEST_BIT(p, n)					\ +    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31)) +#define SET_BIT(p, n)							\ +    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0); + +static void +fast_composite_add_1_1 (pixman_implementation_t *imp, +			pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t     *dst_line, *dst; +    uint32_t     *src_line, *src; +    int           dst_stride, src_stride; +    int32_t       w; + +    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t, +                           src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, 0, dest_y, uint32_t, +                           dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w--) +	{ +	    /* +	     * TODO: improve performance by processing uint32_t data instead +	     *       of individual bits +	     */ +	    if (TEST_BIT (src, src_x + w)) +		SET_BIT (dst, dest_x + w); +	} +    } +} + +static void +fast_composite_over_n_1_8888 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t     src, srca; +    uint32_t    *dst, *dst_line; +    uint32_t    *mask, *mask_line; +    int          mask_stride, dst_stride; +    uint32_t     bitcache, bitmask; +    int32_t      w; + +    if (width <= 0) +	return; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); +    srca = src >> 24; +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, +                           dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, +                           mask_stride, mask_line, 1); +    mask_line += mask_x >> 5; + +    if (srca == 0xff) +    { +	while (height--) +	{ +	    dst = dst_line; +	    dst_line += dst_stride; +	    mask = mask_line; +	    mask_line += mask_stride; +	    w = width; + +	    bitcache = *mask++; +	    bitmask = CREATE_BITMASK (mask_x & 31); + +	    while (w--) +	    { +		if (bitmask == 0) +		{ +		    bitcache = *mask++; +		    bitmask = CREATE_BITMASK (0); +		} +		if (bitcache & bitmask) +		    *dst = src; +		bitmask = UPDATE_BITMASK (bitmask); +		dst++; +	    } 
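+	    /* Row complete: in this opaque-source branch (srca == 0xff) each set
+	     * bit pulled from the a1 bitcache stored the solid src directly, so
+	     * no blending was needed for any pixel in the row. */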
+	} +    } +    else +    { +	while (height--) +	{ +	    dst = dst_line; +	    dst_line += dst_stride; +	    mask = mask_line; +	    mask_line += mask_stride; +	    w = width; + +	    bitcache = *mask++; +	    bitmask = CREATE_BITMASK (mask_x & 31); + +	    while (w--) +	    { +		if (bitmask == 0) +		{ +		    bitcache = *mask++; +		    bitmask = CREATE_BITMASK (0); +		} +		if (bitcache & bitmask) +		    *dst = over (src, *dst); +		bitmask = UPDATE_BITMASK (bitmask); +		dst++; +	    } +	} +    } +} + +static void +fast_composite_over_n_1_0565 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t     src, srca; +    uint16_t    *dst, *dst_line; +    uint32_t    *mask, *mask_line; +    int          mask_stride, dst_stride; +    uint32_t     bitcache, bitmask; +    int32_t      w; +    uint32_t     d; +    uint16_t     src565; + +    if (width <= 0) +	return; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); +    srca = src >> 24; +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, +                           dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t, +                           mask_stride, mask_line, 1); +    mask_line += mask_x >> 5; + +    if (srca == 0xff) +    { +	src565 = convert_8888_to_0565 (src); +	while (height--) +	{ +	    dst = dst_line; +	    dst_line += dst_stride; +	    mask = mask_line; +	    mask_line += mask_stride; +	    w = width; + +	    bitcache = *mask++; +	    bitmask = CREATE_BITMASK (mask_x & 31); + +	    while (w--) +	    { +		if (bitmask == 0) +		{ +		    bitcache = *mask++; +		    bitmask = CREATE_BITMASK (0); +		} +		if (bitcache & bitmask) +		    *dst = src565; +		bitmask = UPDATE_BITMASK (bitmask); +		dst++; +	    } +	} +    } +    else +    { +	while (height--) +	{ +	    dst = dst_line; +	    dst_line += dst_stride; +	    mask = mask_line; +	    mask_line += mask_stride; +	    w = width; + +	    bitcache = *mask++; +	    bitmask = CREATE_BITMASK (mask_x & 31); + +	    while (w--) +	    { +		if (bitmask == 0) +		{ +		    bitcache = *mask++; +		    bitmask = CREATE_BITMASK (0); +		} +		if (bitcache & bitmask) +		{ +		    d = over (src, convert_0565_to_0888 (*dst)); +		    *dst = convert_8888_to_0565 (d); +		} +		bitmask = UPDATE_BITMASK (bitmask); +		dst++; +	    } +	} +    } +} + +/* + * Simple bitblt + */ + +static void +fast_composite_solid_fill (pixman_implementation_t *imp, +                           pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (dest_image->bits.format == PIXMAN_a1) +    { +	src = src >> 31; +    } +    else if (dest_image->bits.format == PIXMAN_a8) +    { +	src = src >> 24; +    } +    else if (dest_image->bits.format == PIXMAN_r5g6b5 || +             dest_image->bits.format == PIXMAN_b5g6r5) +    { +	src = convert_8888_to_0565 (src); +    } + +    pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, +                 PIXMAN_FORMAT_BPP (dest_image->bits.format), +                 dest_x, dest_y, +                 width, height, +                 src); +} + +static void +fast_composite_src_memcpy (pixman_implementation_t *imp, +			   pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8; +    uint32_t n_bytes = 
width * bpp; +    int dst_stride, src_stride; +    uint8_t    *dst; +    uint8_t    *src; + +    src_stride = src_image->bits.rowstride * 4; +    dst_stride = dest_image->bits.rowstride * 4; + +    src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp; +    dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp; + +    while (height--) +    { +	memcpy (dst, src, n_bytes); + +	dst += dst_stride; +	src += src_stride; +    } +} + +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL) +FAST_NEAREST (x888_8888_cover, x888, 8888, uint32_t, uint32_t, SRC, COVER) +FAST_NEAREST (x888_8888_pad, x888, 8888, uint32_t, uint32_t, SRC, PAD) +FAST_NEAREST (x888_8888_normal, x888, 8888, uint32_t, uint32_t, SRC, NORMAL) +FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER) +FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE) +FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD) +FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL) +FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER) +FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE) +FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD) +FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL) + +#define REPEAT_MIN_WIDTH    32 + +static void +fast_composite_tiled_repeat (pixman_implementation_t *imp, +			     pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    pixman_composite_func_t func; +    pixman_format_code_t mask_format; +    uint32_t src_flags, mask_flags; +    int32_t sx, sy; +    int32_t width_remain; +    int32_t num_pixels; +    int32_t src_width; +    int32_t i, j; +    pixman_image_t extended_src_image; +    uint32_t extended_src[REPEAT_MIN_WIDTH * 2]; +    pixman_bool_t need_src_extension; +    uint32_t *src_line; +    int32_t src_stride; +    int32_t src_bpp; +    pixman_composite_info_t info2 = *info; + +    src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) | +		    FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; + +    if (mask_image) +    { +	mask_format = mask_image->common.extended_format_code; +	mask_flags = info->mask_flags; +    } +    else +    { +	mask_format = PIXMAN_null; +	mask_flags = FAST_PATH_IS_OPAQUE; +    } + +    _pixman_implementation_lookup_composite ( +	imp->toplevel, info->op, +	src_image->common.extended_format_code, src_flags, +	mask_format, mask_flags, +	dest_image->common.extended_format_code, info->dest_flags, +	&imp, &func); + +    src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format); + +    if (src_image->bits.width < REPEAT_MIN_WIDTH		&& +	(src_bpp == 32 || src_bpp == 16 || src_bpp == 8)	&& +	!src_image->bits.indexed) +    { +	sx = src_x; +	sx = MOD (sx, src_image->bits.width); +	sx += width; +	src_width = 0; + +	while (src_width < REPEAT_MIN_WIDTH && src_width <= sx) +	    
src_width += src_image->bits.width; + +	src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t); + +	/* Initialize/validate stack-allocated temporary image */ +	_pixman_bits_image_init (&extended_src_image, src_image->bits.format, +				 src_width, 1, &extended_src[0], src_stride, +				 FALSE); +	_pixman_image_validate (&extended_src_image); + +	info2.src_image = &extended_src_image; +	need_src_extension = TRUE; +    } +    else +    { +	src_width = src_image->bits.width; +	need_src_extension = FALSE; +    } + +    sx = src_x; +    sy = src_y; + +    while (--height >= 0) +    { +	sx = MOD (sx, src_width); +	sy = MOD (sy, src_image->bits.height); + +	if (need_src_extension) +	{ +	    if (src_bpp == 32) +	    { +		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1); + +		for (i = 0; i < src_width; ) +		{ +		    for (j = 0; j < src_image->bits.width; j++, i++) +			extended_src[i] = src_line[j]; +		} +	    } +	    else if (src_bpp == 16) +	    { +		uint16_t *src_line_16; + +		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride, +				       src_line_16, 1); +		src_line = (uint32_t*)src_line_16; + +		for (i = 0; i < src_width; ) +		{ +		    for (j = 0; j < src_image->bits.width; j++, i++) +			((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j]; +		} +	    } +	    else if (src_bpp == 8) +	    { +		uint8_t *src_line_8; + +		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride, +				       src_line_8, 1); +		src_line = (uint32_t*)src_line_8; + +		for (i = 0; i < src_width; ) +		{ +		    for (j = 0; j < src_image->bits.width; j++, i++) +			((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j]; +		} +	    } + +	    info2.src_y = 0; +	} +	else +	{ +	    info2.src_y = sy; +	} + +	width_remain = width; + +	while (width_remain > 0) +	{ +	    num_pixels = src_width - sx; + +	    if (num_pixels > width_remain) +		num_pixels = width_remain; + +	    info2.src_x = sx; +	    info2.width = num_pixels; +	    info2.height = 1; + +	    func (imp, &info2); + +	    width_remain -= num_pixels; +	    info2.mask_x += num_pixels; +	    info2.dest_x += num_pixels; +	    sx = 0; +	} + +	sx = src_x; +	sy++; +	info2.mask_x = info->mask_x; +	info2.mask_y++; +	info2.dest_x = info->dest_x; +	info2.dest_y++; +    } + +    if (need_src_extension) +	_pixman_image_fini (&extended_src_image); +} + +/* Use more unrolling for src_0565_0565 because it is typically CPU bound */ +static force_inline void +scaled_nearest_scanline_565_565_SRC (uint16_t *       dst, +				     const uint16_t * src, +				     int32_t          w, +				     pixman_fixed_t   vx, +				     pixman_fixed_t   unit_x, +				     pixman_fixed_t   max_vx, +				     pixman_bool_t    fully_transparent_src) +{ +    uint16_t tmp1, tmp2, tmp3, tmp4; +    while ((w -= 4) >= 0) +    { +	tmp1 = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	tmp2 = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	tmp3 = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	tmp4 = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	*dst++ = tmp1; +	*dst++ = tmp2; +	*dst++ = tmp3; +	*dst++ = tmp4; +    } +    if (w & 2) +    { +	tmp1 = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	tmp2 = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	*dst++ = tmp1; +	*dst++ = tmp2; +    } +    if (w & 1) +	*dst = *(src + pixman_fixed_to_int (vx)); +} + +FAST_NEAREST_MAINLOOP (565_565_cover_SRC, +		       scaled_nearest_scanline_565_565_SRC, +		       uint16_t, uint16_t, COVER) +FAST_NEAREST_MAINLOOP (565_565_none_SRC, +		       
scaled_nearest_scanline_565_565_SRC, +		       uint16_t, uint16_t, NONE) +FAST_NEAREST_MAINLOOP (565_565_pad_SRC, +		       scaled_nearest_scanline_565_565_SRC, +		       uint16_t, uint16_t, PAD) + +static force_inline uint32_t +fetch_nearest (pixman_repeat_t src_repeat, +	       pixman_format_code_t format, +	       uint32_t *src, int x, int src_width) +{ +    if (repeat (src_repeat, &x, src_width)) +    { +	if (format == PIXMAN_x8r8g8b8 || format == PIXMAN_x8b8g8r8) +	    return *(src + x) | 0xff000000; +	else +	    return *(src + x); +    } +    else +    { +	return 0; +    } +} + +static force_inline void +combine_over (uint32_t s, uint32_t *dst) +{ +    if (s) +    { +	uint8_t ia = 0xff - (s >> 24); + +	if (ia) +	    UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s); +	else +	    *dst = s; +    } +} + +static force_inline void +combine_src (uint32_t s, uint32_t *dst) +{ +    *dst = s; +} + +static void +fast_composite_scaled_nearest (pixman_implementation_t *imp, +			       pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t       *dst_line; +    uint32_t       *src_line; +    int             dst_stride, src_stride; +    int		    src_width, src_height; +    pixman_repeat_t src_repeat; +    pixman_fixed_t unit_x, unit_y; +    pixman_format_code_t src_format; +    pixman_vector_t v; +    pixman_fixed_t vy; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be +     * transformed from destination space to source space +     */ +    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1); + +    /* reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (!pixman_transform_point_3d (src_image->common.transform, &v)) +	return; + +    unit_x = src_image->common.transform->matrix[0][0]; +    unit_y = src_image->common.transform->matrix[1][1]; + +    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ +    v.vector[0] -= pixman_fixed_e; +    v.vector[1] -= pixman_fixed_e; + +    src_height = src_image->bits.height; +    src_width = src_image->bits.width; +    src_repeat = src_image->common.repeat; +    src_format = src_image->bits.format; + +    vy = v.vector[1]; +    while (height--) +    { +        pixman_fixed_t vx = v.vector[0]; +	int y = pixman_fixed_to_int (vy); +	uint32_t *dst = dst_line; + +	dst_line += dst_stride; + +        /* adjust the y location by a unit vector in the y direction +         * this is equivalent to transforming y+1 of the destination point to source space */ +        vy += unit_y; + +	if (!repeat (src_repeat, &y, src_height)) +	{ +	    if (op == PIXMAN_OP_SRC) +		memset (dst, 0, sizeof (*dst) * width); +	} +	else +	{ +	    int w = width; + +	    uint32_t *src = src_line + y * src_stride; + +	    while (w >= 2) +	    { +		uint32_t s1, s2; +		int x1, x2; + +		x1 = pixman_fixed_to_int (vx); +		vx += unit_x; + +		x2 = pixman_fixed_to_int (vx); +		vx += unit_x; + +		w -= 2; + +		s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width); +		s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width); + +		if (op == PIXMAN_OP_OVER) +		{ +		    combine_over (s1, dst++); +		    combine_over (s2, dst++); +		} +		else +		{ +		    combine_src (s1, dst++); +		    combine_src (s2, dst++); +		} +	    } + +	    while (w--) +	
    { +		uint32_t s; +		int x; + +		x = pixman_fixed_to_int (vx); +		vx += unit_x; + +		s = fetch_nearest (src_repeat, src_format, src, x, src_width); + +		if (op == PIXMAN_OP_OVER) +		    combine_over (s, dst++); +		else +		    combine_src (s, dst++); +	    } +	} +    } +} + +#define CACHE_LINE_SIZE 64 + +#define FAST_SIMPLE_ROTATE(suffix, pix_type)                                  \ +                                                                              \ +static void                                                                   \ +blt_rotated_90_trivial_##suffix (pix_type       *dst,                         \ +				 int             dst_stride,                  \ +				 const pix_type *src,                         \ +				 int             src_stride,                  \ +				 int             w,                           \ +				 int             h)                           \ +{                                                                             \ +    int x, y;                                                                 \ +    for (y = 0; y < h; y++)                                                   \ +    {                                                                         \ +	const pix_type *s = src + (h - y - 1);                                \ +	pix_type *d = dst + dst_stride * y;                                   \ +	for (x = 0; x < w; x++)                                               \ +	{                                                                     \ +	    *d++ = *s;                                                        \ +	    s += src_stride;                                                  \ +	}                                                                     \ +    }                                                                         \ +}                                                                             \ +                                                                              \ +static void                                                                   \ +blt_rotated_270_trivial_##suffix (pix_type       *dst,                        \ +				  int             dst_stride,                 \ +				  const pix_type *src,                        \ +				  int             src_stride,                 \ +				  int             w,                          \ +				  int             h)                          \ +{                                                                             \ +    int x, y;                                                                 \ +    for (y = 0; y < h; y++)                                                   \ +    {                                                                         \ +	const pix_type *s = src + src_stride * (w - 1) + y;                   \ +	pix_type *d = dst + dst_stride * y;                                   \ +	for (x = 0; x < w; x++)                                               \ +	{                                                                     \ +	    *d++ = *s;                                                        \ +	    s -= src_stride;                                                  \ +	}                                                                     \ +    }                                                                         \ +}                                                                             \ +                                                                              \ +static void                                                  
                 \ +blt_rotated_90_##suffix (pix_type       *dst,                                 \ +			 int             dst_stride,                          \ +			 const pix_type *src,                                 \ +			 int             src_stride,                          \ +			 int             W,                                   \ +			 int             H)                                   \ +{                                                                             \ +    int x;                                                                    \ +    int leading_pixels = 0, trailing_pixels = 0;                              \ +    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \ +                                                                              \ +    /*                                                                        \ +     * split processing into handling destination as TILE_SIZExH cache line   \ +     * aligned vertical stripes (optimistically assuming that destination     \ +     * stride is a multiple of cache line, if not - it will be just a bit     \ +     * slower)                                                                \ +     */                                                                       \ +                                                                              \ +    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \ +    {                                                                         \ +	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \ +			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \ +	if (leading_pixels > W)                                               \ +	    leading_pixels = W;                                               \ +                                                                              \ +	/* unaligned leading part NxH (where N < TILE_SIZE) */                \ +	blt_rotated_90_trivial_##suffix (                                     \ +	    dst,                                                              \ +	    dst_stride,                                                       \ +	    src,                                                              \ +	    src_stride,                                                       \ +	    leading_pixels,                                                   \ +	    H);                                                               \ +	                                                                      \ +	dst += leading_pixels;                                                \ +	src += leading_pixels * src_stride;                                   \ +	W -= leading_pixels;                                                  \ +    }                                                                         \ +                                                                              \ +    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \ +    {                                                                         \ +	trailing_pixels = (((uintptr_t)(dst + W) &                            \ +			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \ +	if (trailing_pixels > W)                                              \ +	    trailing_pixels = W;                                              \ +	W -= trailing_pixels;                                                 \ +    }                                                                         \ +        
                                                                      \ +    for (x = 0; x < W; x += TILE_SIZE)                                        \ +    {                                                                         \ +	/* aligned middle part TILE_SIZExH */                                 \ +	blt_rotated_90_trivial_##suffix (                                     \ +	    dst + x,                                                          \ +	    dst_stride,                                                       \ +	    src + src_stride * x,                                             \ +	    src_stride,                                                       \ +	    TILE_SIZE,                                                        \ +	    H);                                                               \ +    }                                                                         \ +                                                                              \ +    if (trailing_pixels)                                                      \ +    {                                                                         \ +	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \ +	blt_rotated_90_trivial_##suffix (                                     \ +	    dst + W,                                                          \ +	    dst_stride,                                                       \ +	    src + W * src_stride,                                             \ +	    src_stride,                                                       \ +	    trailing_pixels,                                                  \ +	    H);                                                               \ +    }                                                                         \ +}                                                                             \ +                                                                              \ +static void                                                                   \ +blt_rotated_270_##suffix (pix_type       *dst,                                \ +			  int             dst_stride,                         \ +			  const pix_type *src,                                \ +			  int             src_stride,                         \ +			  int             W,                                  \ +			  int             H)                                  \ +{                                                                             \ +    int x;                                                                    \ +    int leading_pixels = 0, trailing_pixels = 0;                              \ +    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \ +                                                                              \ +    /*                                                                        \ +     * split processing into handling destination as TILE_SIZExH cache line   \ +     * aligned vertical stripes (optimistically assuming that destination     \ +     * stride is a multiple of cache line, if not - it will be just a bit     \ +     * slower)                                                                \ +     */                                                                       \ +                                                                              \ +    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \ +    {                                                 
                        \ +	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \ +			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \ +	if (leading_pixels > W)                                               \ +	    leading_pixels = W;                                               \ +                                                                              \ +	/* unaligned leading part NxH (where N < TILE_SIZE) */                \ +	blt_rotated_270_trivial_##suffix (                                    \ +	    dst,                                                              \ +	    dst_stride,                                                       \ +	    src + src_stride * (W - leading_pixels),                          \ +	    src_stride,                                                       \ +	    leading_pixels,                                                   \ +	    H);                                                               \ +	                                                                      \ +	dst += leading_pixels;                                                \ +	W -= leading_pixels;                                                  \ +    }                                                                         \ +                                                                              \ +    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \ +    {                                                                         \ +	trailing_pixels = (((uintptr_t)(dst + W) &                            \ +			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \ +	if (trailing_pixels > W)                                              \ +	    trailing_pixels = W;                                              \ +	W -= trailing_pixels;                                                 \ +	src += trailing_pixels * src_stride;                                  \ +    }                                                                         \ +                                                                              \ +    for (x = 0; x < W; x += TILE_SIZE)                                        \ +    {                                                                         \ +	/* aligned middle part TILE_SIZExH */                                 \ +	blt_rotated_270_trivial_##suffix (                                    \ +	    dst + x,                                                          \ +	    dst_stride,                                                       \ +	    src + src_stride * (W - x - TILE_SIZE),                           \ +	    src_stride,                                                       \ +	    TILE_SIZE,                                                        \ +	    H);                                                               \ +    }                                                                         \ +                                                                              \ +    if (trailing_pixels)                                                      \ +    {                                                                         \ +	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \ +	blt_rotated_270_trivial_##suffix (                                    \ +	    dst + W,                                                          \ +	    dst_stride,                                                       \ +	    src - trailing_pixels * src_stride,                     
          \ +	    src_stride,                                                       \ +	    trailing_pixels,                                                  \ +	    H);                                                               \ +    }                                                                         \ +}                                                                             \ +                                                                              \ +static void                                                                   \ +fast_composite_rotate_90_##suffix (pixman_implementation_t *imp,              \ +				   pixman_composite_info_t *info)	      \ +{									      \ +    PIXMAN_COMPOSITE_ARGS (info);					      \ +    pix_type       *dst_line;						      \ +    pix_type       *src_line;                                                 \ +    int             dst_stride, src_stride;                                   \ +    int             src_x_t, src_y_t;                                         \ +                                                                              \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \ +			   dst_stride, dst_line, 1);                          \ +    src_x_t = -src_y + pixman_fixed_to_int (                                  \ +				src_image->common.transform->matrix[0][2] +   \ +				pixman_fixed_1 / 2 - pixman_fixed_e) - height;\ +    src_y_t = src_x + pixman_fixed_to_int (                                   \ +				src_image->common.transform->matrix[1][2] +   \ +				pixman_fixed_1 / 2 - pixman_fixed_e);         \ +    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \ +			   src_stride, src_line, 1);                          \ +    blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride,      \ +			     width, height);                                  \ +}                                                                             \ +                                                                              \ +static void                                                                   \ +fast_composite_rotate_270_##suffix (pixman_implementation_t *imp,             \ +				    pixman_composite_info_t *info)            \ +{                                                                             \ +    PIXMAN_COMPOSITE_ARGS (info);					      \ +    pix_type       *dst_line;						      \ +    pix_type       *src_line;                                                 \ +    int             dst_stride, src_stride;                                   \ +    int             src_x_t, src_y_t;                                         \ +                                                                              \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \ +			   dst_stride, dst_line, 1);                          \ +    src_x_t = src_y + pixman_fixed_to_int (                                   \ +				src_image->common.transform->matrix[0][2] +   \ +				pixman_fixed_1 / 2 - pixman_fixed_e);         \ +    src_y_t = -src_x + pixman_fixed_to_int (                                  \ +				src_image->common.transform->matrix[1][2] +   \ +				pixman_fixed_1 / 2 - pixman_fixed_e) - width; \ +    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \ +			   src_stride, src_line, 1);                          \ +    blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride,     \ +			      width, height);   
                              \ +} + +FAST_SIMPLE_ROTATE (8, uint8_t) +FAST_SIMPLE_ROTATE (565, uint16_t) +FAST_SIMPLE_ROTATE (8888, uint32_t) + +static const pixman_fast_path_t c_fast_paths[] = +{ +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca), +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565), +    PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, fast_composite_add_0565_0565), +    PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, fast_composite_add_0565_0565), +    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888), +    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888), +    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8), +    PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1_1), +    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8), +    
PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill), +    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill), +    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill), +    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill), +    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill), +    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill), +    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy), +    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8), +    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8), + +    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888), +    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888), +    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888), +    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888), + +    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888), +    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888), + +    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565), +    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565), + +    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565), + +    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, x888_8888), +    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, x888_8888), +    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, x888_8888), +    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, x888_8888), +    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, x888_8888), +    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, x888_8888), + +    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 
8888_8888), + +    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565), + +#define NEAREST_FAST_PATH(op,s,d)		\ +    {   PIXMAN_OP_ ## op,			\ +	PIXMAN_ ## s, SCALED_NEAREST_FLAGS,	\ +	PIXMAN_null, 0,				\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,	\ +	fast_composite_scaled_nearest,		\ +    } + +    NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8), +    NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8), +    NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8), +    NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8), + +    NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8), +    NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8), +    NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8), +    NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8), + +    NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8), +    NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8), +    NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8), +    NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8), + +    NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8), +    NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8), +    NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8), +    NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8), + +#define SIMPLE_ROTATE_FLAGS(angle)					  \ +    (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM	|			  \ +     FAST_PATH_NEAREST_FILTER			|			  \ +     FAST_PATH_SAMPLES_COVER_CLIP_NEAREST	|			  \ +     FAST_PATH_STANDARD_FLAGS) + +#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix)				  \ +    {   PIXMAN_OP_ ## op,						  \ +	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90),				  \ +	PIXMAN_null, 0,							  \ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \ +	fast_composite_rotate_90_##suffix,				  \ +    },									  \ +    {   PIXMAN_OP_ ## op,						  \ +	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270),			  \ +	PIXMAN_null, 0,							  \ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \ +	fast_composite_rotate_270_##suffix,				  \ +    } + +    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888), +    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888), +    SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888), +    SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565), +    SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8), + +    /* Simple repeat fast path entry. 
*/ +    {	PIXMAN_OP_any, +	PIXMAN_any, +	(FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE | +	 FAST_PATH_NORMAL_REPEAT), +	PIXMAN_any, 0, +	PIXMAN_any, FAST_PATH_STD_DEST_FLAGS, +	fast_composite_tiled_repeat +    }, + +    {   PIXMAN_OP_NONE	}, +}; + +#ifdef WORDS_BIGENDIAN +#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (32 - (offs) - (n))) +#else +#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs)) +#endif + +static force_inline void +pixman_fill1_line (uint32_t *dst, int offs, int width, int v) +{ +    if (offs) +    { +	int leading_pixels = 32 - offs; +	if (leading_pixels >= width) +	{ +	    if (v) +		*dst |= A1_FILL_MASK (width, offs); +	    else +		*dst &= ~A1_FILL_MASK (width, offs); +	    return; +	} +	else +	{ +	    if (v) +		*dst++ |= A1_FILL_MASK (leading_pixels, offs); +	    else +		*dst++ &= ~A1_FILL_MASK (leading_pixels, offs); +	    width -= leading_pixels; +	} +    } +    while (width >= 32) +    { +	if (v) +	    *dst++ = 0xFFFFFFFF; +	else +	    *dst++ = 0; +	width -= 32; +    } +    if (width > 0) +    { +	if (v) +	    *dst |= A1_FILL_MASK (width, 0); +	else +	    *dst &= ~A1_FILL_MASK (width, 0); +    } +} + +static void +pixman_fill1 (uint32_t *bits, +              int       stride, +              int       x, +              int       y, +              int       width, +              int       height, +              uint32_t  filler) +{ +    uint32_t *dst = bits + y * stride + (x >> 5); +    int offs = x & 31; + +    if (filler & 1) +    { +	while (height--) +	{ +	    pixman_fill1_line (dst, offs, width, 1); +	    dst += stride; +	} +    } +    else +    { +	while (height--) +	{ +	    pixman_fill1_line (dst, offs, width, 0); +	    dst += stride; +	} +    } +} + +static void +pixman_fill8 (uint32_t *bits, +              int       stride, +              int       x, +              int       y, +              int       width, +              int       height, +              uint32_t  filler) +{ +    int byte_stride = stride * (int) sizeof (uint32_t); +    uint8_t *dst = (uint8_t *) bits; +    uint8_t v = filler & 0xff; +    int i; + +    dst = dst + y * byte_stride + x; + +    while (height--) +    { +	for (i = 0; i < width; ++i) +	    dst[i] = v; + +	dst += byte_stride; +    } +} + +static void +pixman_fill16 (uint32_t *bits, +               int       stride, +               int       x, +               int       y, +               int       width, +               int       height, +               uint32_t  filler) +{ +    int short_stride = +	(stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t); +    uint16_t *dst = (uint16_t *)bits; +    uint16_t v = filler & 0xffff; +    int i; + +    dst = dst + y * short_stride + x; + +    while (height--) +    { +	for (i = 0; i < width; ++i) +	    dst[i] = v; + +	dst += short_stride; +    } +} + +static void +pixman_fill32 (uint32_t *bits, +               int       stride, +               int       x, +               int       y, +               int       width, +               int       height, +               uint32_t  filler) +{ +    int i; + +    bits = bits + y * stride + x; + +    while (height--) +    { +	for (i = 0; i < width; ++i) +	    bits[i] = filler; + +	bits += stride; +    } +} + +static pixman_bool_t +fast_path_fill (pixman_implementation_t *imp, +                uint32_t *               bits, +                int                      stride, +                int                      bpp, +                int                      x, +                int                      
y, +                int                      width, +                int                      height, +                uint32_t		 filler) +{ +    switch (bpp) +    { +    case 1: +	pixman_fill1 (bits, stride, x, y, width, height, filler); +	break; + +    case 8: +	pixman_fill8 (bits, stride, x, y, width, height, filler); +	break; + +    case 16: +	pixman_fill16 (bits, stride, x, y, width, height, filler); +	break; + +    case 32: +	pixman_fill32 (bits, stride, x, y, width, height, filler); +	break; + +    default: +	return FALSE; +    } + +    return TRUE; +} + +/*****************************************************************************/ + +static uint32_t * +fast_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) +{ +    int32_t w = iter->width; +    uint32_t *dst = iter->buffer; +    const uint16_t *src = (const uint16_t *)iter->bits; + +    iter->bits += iter->stride; + +    /* Align the source buffer at 4 bytes boundary */ +    if (w > 0 && ((uintptr_t)src & 3)) +    { +	*dst++ = convert_0565_to_8888 (*src++); +	w--; +    } +    /* Process two pixels per iteration */ +    while ((w -= 2) >= 0) +    { +	uint32_t sr, sb, sg, t0, t1; +	uint32_t s = *(const uint32_t *)src; +	src += 2; +	sr = (s >> 8) & 0x00F800F8; +	sb = (s << 3) & 0x00F800F8; +	sg = (s >> 3) & 0x00FC00FC; +	sr |= sr >> 5; +	sb |= sb >> 5; +	sg |= sg >> 6; +	t0 = ((sr << 16) & 0x00FF0000) | ((sg << 8) & 0x0000FF00) | +	     (sb & 0xFF) | 0xFF000000; +	t1 = (sr & 0x00FF0000) | ((sg >> 8) & 0x0000FF00) | +	     (sb >> 16) | 0xFF000000; +#ifdef WORDS_BIGENDIAN +	*dst++ = t1; +	*dst++ = t0; +#else +	*dst++ = t0; +	*dst++ = t1; +#endif +    } +    if (w & 1) +    { +	*dst = convert_0565_to_8888 (*src); +    } + +    return iter->buffer; +} + +static uint32_t * +fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask) +{ +    iter->bits += iter->stride; +    return iter->buffer; +} + +/* Helper function for a workaround, which tries to ensure that 0x1F001F + * constant is always allocated in a register on RISC architectures. 
+ */ +static force_inline uint32_t +convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F) +{ +    uint32_t a, b; +    a = (s >> 3) & x1F001F; +    b = s & 0xFC00; +    a |= a >> 5; +    a |= b >> 5; +    return a; +} + +static void +fast_write_back_r5g6b5 (pixman_iter_t *iter) +{ +    int32_t w = iter->width; +    uint16_t *dst = (uint16_t *)(iter->bits - iter->stride); +    const uint32_t *src = iter->buffer; +    /* Workaround to ensure that x1F001F variable is allocated in a register */ +    static volatile uint32_t volatile_x1F001F = 0x1F001F; +    uint32_t x1F001F = volatile_x1F001F; + +    while ((w -= 4) >= 0) +    { +	uint32_t s1 = *src++; +	uint32_t s2 = *src++; +	uint32_t s3 = *src++; +	uint32_t s4 = *src++; +	*dst++ = convert_8888_to_0565_workaround (s1, x1F001F); +	*dst++ = convert_8888_to_0565_workaround (s2, x1F001F); +	*dst++ = convert_8888_to_0565_workaround (s3, x1F001F); +	*dst++ = convert_8888_to_0565_workaround (s4, x1F001F); +    } +    if (w & 2) +    { +	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F); +	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F); +    } +    if (w & 1) +    { +	*dst = convert_8888_to_0565_workaround (*src, x1F001F); +    } +} + +typedef struct +{ +    int		y; +    uint64_t *	buffer; +} line_t; + +typedef struct +{ +    line_t		lines[2]; +    pixman_fixed_t	y; +    pixman_fixed_t	x; +    uint64_t		data[1]; +} bilinear_info_t; + +static void +fetch_horizontal (bits_image_t *image, line_t *line, +		  int y, pixman_fixed_t x, pixman_fixed_t ux, int n) +{ +    uint32_t *bits = image->bits + y * image->rowstride; +    int i; + +    for (i = 0; i < n; ++i) +    { +	int x0 = pixman_fixed_to_int (x); +	int x1 = x0 + 1; +	int32_t dist_x; + +	uint32_t left = *(bits + x0); +	uint32_t right = *(bits + x1); + +	dist_x = pixman_fixed_to_bilinear_weight (x); +	dist_x <<= (8 - BILINEAR_INTERPOLATION_BITS); + +#if SIZEOF_LONG <= 4 +	{ +	    uint32_t lag, rag, ag; +	    uint32_t lrb, rrb, rb; + +	    lag = (left & 0xff00ff00) >> 8; +	    rag = (right & 0xff00ff00) >> 8; +	    ag = (lag << 8) + dist_x * (rag - lag); + +	    lrb = (left & 0x00ff00ff); +	    rrb = (right & 0x00ff00ff); +	    rb = (lrb << 8) + dist_x * (rrb - lrb); + +	    *((uint32_t *)(line->buffer + i)) = ag; +	    *((uint32_t *)(line->buffer + i) + 1) = rb; +	} +#else +	{ +	    uint64_t lagrb, ragrb; +	    uint32_t lag, rag; +	    uint32_t lrb, rrb; + +	    lag = (left & 0xff00ff00); +	    lrb = (left & 0x00ff00ff); +	    rag = (right & 0xff00ff00); +	    rrb = (right & 0x00ff00ff); +	    lagrb = (((uint64_t)lag) << 24) | lrb; +	    ragrb = (((uint64_t)rag) << 24) | rrb; + +	    line->buffer[i] = (lagrb << 8) + dist_x * (ragrb - lagrb); +	} +#endif + +	x += ux; +    } + +    line->y = y; +} + +static uint32_t * +fast_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask) +{ +    pixman_fixed_t fx, ux; +    bilinear_info_t *info = iter->data; +    line_t *line0, *line1; +    int y0, y1; +    int32_t dist_y; +    int i; + +    COMPILE_TIME_ASSERT (BILINEAR_INTERPOLATION_BITS < 8); + +    fx = info->x; +    ux = iter->image->common.transform->matrix[0][0]; + +    y0 = pixman_fixed_to_int (info->y); +    y1 = y0 + 1; +    dist_y = pixman_fixed_to_bilinear_weight (info->y); +    dist_y <<= (8 - BILINEAR_INTERPOLATION_BITS); + +    line0 = &info->lines[y0 & 0x01]; +    line1 = &info->lines[y1 & 0x01]; + +    if (line0->y != y0) +    { +	fetch_horizontal ( +	    &iter->image->bits, line0, y0, fx, ux, iter->width); +    } + +    if (line1->y != y1) +    { +	
fetch_horizontal ( +	    &iter->image->bits, line1, y1, fx, ux, iter->width); +    } + +    for (i = 0; i < iter->width; ++i) +    { +#if SIZEOF_LONG <= 4 +	uint32_t ta, tr, tg, tb; +	uint32_t ba, br, bg, bb; +	uint32_t tag, trb; +	uint32_t bag, brb; +	uint32_t a, r, g, b; + +	tag = *((uint32_t *)(line0->buffer + i)); +	trb = *((uint32_t *)(line0->buffer + i) + 1); +	bag = *((uint32_t *)(line1->buffer + i)); +	brb = *((uint32_t *)(line1->buffer + i) + 1); + +	ta = tag >> 16; +	ba = bag >> 16; +	a = (ta << 8) + dist_y * (ba - ta); + +	tr = trb >> 16; +	br = brb >> 16; +	r = (tr << 8) + dist_y * (br - tr); + +	tg = tag & 0xffff; +	bg = bag & 0xffff; +	g = (tg << 8) + dist_y * (bg - tg); +	 +	tb = trb & 0xffff; +	bb = brb & 0xffff; +	b = (tb << 8) + dist_y * (bb - tb); + +	a = (a <<  8) & 0xff000000; +	r = (r <<  0) & 0x00ff0000; +	g = (g >>  8) & 0x0000ff00; +	b = (b >> 16) & 0x000000ff; +#else +	uint64_t top = line0->buffer[i]; +	uint64_t bot = line1->buffer[i]; +	uint64_t tar = (top & 0xffff0000ffff0000ULL) >> 16; +	uint64_t bar = (bot & 0xffff0000ffff0000ULL) >> 16; +	uint64_t tgb = (top & 0x0000ffff0000ffffULL); +	uint64_t bgb = (bot & 0x0000ffff0000ffffULL); +	uint64_t ar, gb; +	uint32_t a, r, g, b; + +	ar = (tar << 8) + dist_y * (bar - tar); +	gb = (tgb << 8) + dist_y * (bgb - tgb); + +	a = ((ar >> 24) & 0xff000000); +	r = ((ar >>  0) & 0x00ff0000); +	g = ((gb >> 40) & 0x0000ff00); +	b = ((gb >> 16) & 0x000000ff); +#endif + +	iter->buffer[i] = a | r | g | b; +    } + +    info->y += iter->image->common.transform->matrix[1][1]; + +    return iter->buffer; +} + +static void +bilinear_cover_iter_fini (pixman_iter_t *iter) +{ +    free (iter->data); +} + +static void +fast_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info) +{ +    int width = iter->width; +    bilinear_info_t *info; +    pixman_vector_t v; + +    /* Reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (!pixman_transform_point_3d (iter->image->common.transform, &v)) +	goto fail; + +    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t)); +    if (!info) +	goto fail; + +    info->x = v.vector[0] - pixman_fixed_1 / 2; +    info->y = v.vector[1] - pixman_fixed_1 / 2; + +    /* It is safe to set the y coordinates to -1 initially +     * because COVER_CLIP_BILINEAR ensures that we will only +     * be asked to fetch lines in the [0, height) interval +     */ +    info->lines[0].y = -1; +    info->lines[0].buffer = &(info->data[0]); +    info->lines[1].y = -1; +    info->lines[1].buffer = &(info->data[width]); + +    iter->get_scanline = fast_fetch_bilinear_cover; +    iter->fini = bilinear_cover_iter_fini; + +    iter->data = info; +    return; + +fail: +    /* Something went wrong, either a bad matrix or OOM; in such cases, +     * we don't guarantee any particular rendering. 
+     */ +    _pixman_log_error ( +	FUNC, "Allocation failure or bad matrix, skipping rendering\n"); +     +    iter->get_scanline = _pixman_iter_get_scanline_noop; +    iter->fini = NULL; +} + +static uint32_t * +bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter, +					  const uint32_t *mask) +{ + +    pixman_image_t * ima = iter->image; +    int              offset = iter->x; +    int              line = iter->y++; +    int              width = iter->width; +    uint32_t *       buffer = iter->buffer; + +    bits_image_t *bits = &ima->bits; +    pixman_fixed_t x_top, x_bottom, x; +    pixman_fixed_t ux_top, ux_bottom, ux; +    pixman_vector_t v; +    uint32_t top_mask, bottom_mask; +    uint32_t *top_row; +    uint32_t *bottom_row; +    uint32_t *end; +    uint32_t zero[2] = { 0, 0 }; +    uint32_t one = 1; +    int y, y1, y2; +    int disty; +    int mask_inc; +    int w; + +    /* reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (!pixman_transform_point_3d (bits->common.transform, &v)) +	return iter->buffer; + +    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0]; +    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2; + +    y = v.vector[1] - pixman_fixed_1/2; +    disty = pixman_fixed_to_bilinear_weight (y); + +    /* Load the pointers to the first and second lines from the source +     * image that bilinear code must read. +     * +     * The main trick in this code is about the check if any line are +     * outside of the image; +     * +     * When I realize that a line (any one) is outside, I change +     * the pointer to a dummy area with zeros. Once I change this, I +     * must be sure the pointer will not change, so I set the +     * variables to each pointer increments inside the loop. +     */ +    y1 = pixman_fixed_to_int (y); +    y2 = y1 + 1; + +    if (y1 < 0 || y1 >= bits->height) +    { +	top_row = zero; +	x_top = 0; +	ux_top = 0; +    } +    else +    { +	top_row = bits->bits + y1 * bits->rowstride; +	x_top = x; +	ux_top = ux; +    } + +    if (y2 < 0 || y2 >= bits->height) +    { +	bottom_row = zero; +	x_bottom = 0; +	ux_bottom = 0; +    } +    else +    { +	bottom_row = bits->bits + y2 * bits->rowstride; +	x_bottom = x; +	ux_bottom = ux; +    } + +    /* Instead of checking whether the operation uses the mast in +     * each loop iteration, verify this only once and prepare the +     * variables to make the code smaller inside the loop. 
+     */ +    if (!mask) +    { +        mask_inc = 0; +        mask = &one; +    } +    else +    { +        /* If have a mask, prepare the variables to check it */ +        mask_inc = 1; +    } + +    /* If both are zero, then the whole thing is zero */ +    if (top_row == zero && bottom_row == zero) +    { +	memset (buffer, 0, width * sizeof (uint32_t)); +	return iter->buffer; +    } +    else if (bits->format == PIXMAN_x8r8g8b8) +    { +	if (top_row == zero) +	{ +	    top_mask = 0; +	    bottom_mask = 0xff000000; +	} +	else if (bottom_row == zero) +	{ +	    top_mask = 0xff000000; +	    bottom_mask = 0; +	} +	else +	{ +	    top_mask = 0xff000000; +	    bottom_mask = 0xff000000; +	} +    } +    else +    { +	top_mask = 0; +	bottom_mask = 0; +    } + +    end = buffer + width; + +    /* Zero fill to the left of the image */ +    while (buffer < end && x < pixman_fixed_minus_1) +    { +	*buffer++ = 0; +	x += ux; +	x_top += ux_top; +	x_bottom += ux_bottom; +	mask += mask_inc; +    } + +    /* Left edge +     */ +    while (buffer < end && x < 0) +    { +	uint32_t tr, br; +	int32_t distx; + +	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask; +	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; + +	distx = pixman_fixed_to_bilinear_weight (x); + +	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty); + +	x += ux; +	x_top += ux_top; +	x_bottom += ux_bottom; +	mask += mask_inc; +    } + +    /* Main part */ +    w = pixman_int_to_fixed (bits->width - 1); + +    while (buffer < end  &&  x < w) +    { +	if (*mask) +	{ +	    uint32_t tl, tr, bl, br; +	    int32_t distx; + +	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask; +	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask; +	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; +	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask; + +	    distx = pixman_fixed_to_bilinear_weight (x); + +	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty); +	} + +	buffer++; +	x += ux; +	x_top += ux_top; +	x_bottom += ux_bottom; +	mask += mask_inc; +    } + +    /* Right Edge */ +    w = pixman_int_to_fixed (bits->width); +    while (buffer < end  &&  x < w) +    { +	if (*mask) +	{ +	    uint32_t tl, bl; +	    int32_t distx; + +	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask; +	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask; + +	    distx = pixman_fixed_to_bilinear_weight (x); + +	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty); +	} + +	buffer++; +	x += ux; +	x_top += ux_top; +	x_bottom += ux_bottom; +	mask += mask_inc; +    } + +    /* Zero fill to the left of the image */ +    while (buffer < end) +	*buffer++ = 0; + +    return iter->buffer; +} + +typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x); + +static force_inline void +bits_image_fetch_separable_convolution_affine (pixman_image_t * image, +					       int              offset, +					       int              line, +					       int              width, +					       uint32_t *       buffer, +					       const uint32_t * mask, + +					       convert_pixel_t	convert_pixel, +					       pixman_format_code_t	format, +					       pixman_repeat_t	repeat_mode) +{ +    bits_image_t *bits = &image->bits; +    pixman_fixed_t *params = image->common.filter_params; +    int cwidth = pixman_fixed_to_int (params[0]); +    int cheight = pixman_fixed_to_int (params[1]); +    int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1; +    int y_off = ((cheight << 16) - 
pixman_fixed_1) >> 1; +    int x_phase_bits = pixman_fixed_to_int (params[2]); +    int y_phase_bits = pixman_fixed_to_int (params[3]); +    int x_phase_shift = 16 - x_phase_bits; +    int y_phase_shift = 16 - y_phase_bits; +    pixman_fixed_t vx, vy; +    pixman_fixed_t ux, uy; +    pixman_vector_t v; +    int k; + +    /* reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (!pixman_transform_point_3d (image->common.transform, &v)) +	return; + +    ux = image->common.transform->matrix[0][0]; +    uy = image->common.transform->matrix[1][0]; + +    vx = v.vector[0]; +    vy = v.vector[1]; + +    for (k = 0; k < width; ++k) +    { +	pixman_fixed_t *y_params; +	int satot, srtot, sgtot, sbtot; +	pixman_fixed_t x, y; +	int32_t x1, x2, y1, y2; +	int32_t px, py; +	int i, j; + +	if (mask && !mask[k]) +	    goto next; + +	/* Round x and y to the middle of the closest phase before continuing. This +	 * ensures that the convolution matrix is aligned right, since it was +	 * positioned relative to a particular phase (and not relative to whatever +	 * exact fraction we happen to get here). +	 */ +	x = ((vx >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1); +	y = ((vy >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1); + +	px = (x & 0xffff) >> x_phase_shift; +	py = (y & 0xffff) >> y_phase_shift; + +	x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); +	y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); +	x2 = x1 + cwidth; +	y2 = y1 + cheight; + +	satot = srtot = sgtot = sbtot = 0; + +	y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight; + +	for (i = y1; i < y2; ++i) +	{ +	    pixman_fixed_t fy = *y_params++; + +	    if (fy) +	    { +		pixman_fixed_t *x_params = params + 4 + px * cwidth; + +		for (j = x1; j < x2; ++j) +		{ +		    pixman_fixed_t fx = *x_params++; +		    int rx = j; +		    int ry = i; +		     +		    if (fx) +		    { +			pixman_fixed_t f; +			uint32_t pixel, mask; +			uint8_t *row; + +			mask = PIXMAN_FORMAT_A (format)? 
0 : 0xff000000; + +			if (repeat_mode != PIXMAN_REPEAT_NONE) +			{ +			    repeat (repeat_mode, &rx, bits->width); +			    repeat (repeat_mode, &ry, bits->height); + +			    row = (uint8_t *)(bits->bits + bits->rowstride * ry); +			    pixel = convert_pixel (row, rx) | mask; +			} +			else +			{ +			    if (rx < 0 || ry < 0 || rx >= bits->width || ry >= bits->height) +			    { +				pixel = 0; +			    } +			    else +			    { +				row = (uint8_t *)(bits->bits + bits->rowstride * ry); +				pixel = convert_pixel (row, rx) | mask; +			    } +			} + +			f = ((pixman_fixed_32_32_t)fx * fy + 0x8000) >> 16; +			srtot += (int)RED_8 (pixel) * f; +			sgtot += (int)GREEN_8 (pixel) * f; +			sbtot += (int)BLUE_8 (pixel) * f; +			satot += (int)ALPHA_8 (pixel) * f; +		    } +		} +	    } +	} + +	satot = (satot + 0x8000) >> 16; +	srtot = (srtot + 0x8000) >> 16; +	sgtot = (sgtot + 0x8000) >> 16; +	sbtot = (sbtot + 0x8000) >> 16; + +	satot = CLIP (satot, 0, 0xff); +	srtot = CLIP (srtot, 0, 0xff); +	sgtot = CLIP (sgtot, 0, 0xff); +	sbtot = CLIP (sbtot, 0, 0xff); + +	buffer[k] = (satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot << 0); + +    next: +	vx += ux; +	vy += uy; +    } +} + +static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +static force_inline void +bits_image_fetch_bilinear_affine (pixman_image_t * image, +				  int              offset, +				  int              line, +				  int              width, +				  uint32_t *       buffer, +				  const uint32_t * mask, + +				  convert_pixel_t	convert_pixel, +				  pixman_format_code_t	format, +				  pixman_repeat_t	repeat_mode) +{ +    pixman_fixed_t x, y; +    pixman_fixed_t ux, uy; +    pixman_vector_t v; +    bits_image_t *bits = &image->bits; +    int i; + +    /* reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (!pixman_transform_point_3d (image->common.transform, &v)) +	return; + +    ux = image->common.transform->matrix[0][0]; +    uy = image->common.transform->matrix[1][0]; + +    x = v.vector[0]; +    y = v.vector[1]; + +    for (i = 0; i < width; ++i) +    { +	int x1, y1, x2, y2; +	uint32_t tl, tr, bl, br; +	int32_t distx, disty; +	int width = image->bits.width; +	int height = image->bits.height; +	const uint8_t *row1; +	const uint8_t *row2; + +	if (mask && !mask[i]) +	    goto next; + +	x1 = x - pixman_fixed_1 / 2; +	y1 = y - pixman_fixed_1 / 2; + +	distx = pixman_fixed_to_bilinear_weight (x1); +	disty = pixman_fixed_to_bilinear_weight (y1); + +	y1 = pixman_fixed_to_int (y1); +	y2 = y1 + 1; +	x1 = pixman_fixed_to_int (x1); +	x2 = x1 + 1; + +	if (repeat_mode != PIXMAN_REPEAT_NONE) +	{ +	    uint32_t mask; + +	    mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; + +	    repeat (repeat_mode, &x1, width); +	    repeat (repeat_mode, &y1, height); +	    repeat (repeat_mode, &x2, width); +	    repeat (repeat_mode, &y2, height); + +	    row1 = (uint8_t *)(bits->bits + bits->rowstride * y1); +	    row2 = (uint8_t *)(bits->bits + bits->rowstride * y2); + +	    tl = convert_pixel (row1, x1) | mask; +	    tr = convert_pixel (row1, x2) | mask; +	    bl = convert_pixel (row2, x1) | mask; +	    br = convert_pixel (row2, x2) | mask; +	} +	else +	{ +	    uint32_t mask1, mask2; +	    int bpp; + +	    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value, +	     * which means if you use it in expressions, those +	     * expressions become unsigned themselves. 
Since +	     * the variables below can be negative in some cases, +	     * that will lead to crashes on 64 bit architectures. +	     * +	     * So this line makes sure bpp is signed +	     */ +	    bpp = PIXMAN_FORMAT_BPP (format); + +	    if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0) +	    { +		buffer[i] = 0; +		goto next; +	    } + +	    if (y2 == 0) +	    { +		row1 = zero; +		mask1 = 0; +	    } +	    else +	    { +		row1 = (uint8_t *)(bits->bits + bits->rowstride * y1); +		row1 += bpp / 8 * x1; + +		mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; +	    } + +	    if (y1 == height - 1) +	    { +		row2 = zero; +		mask2 = 0; +	    } +	    else +	    { +		row2 = (uint8_t *)(bits->bits + bits->rowstride * y2); +		row2 += bpp / 8 * x1; + +		mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000; +	    } + +	    if (x2 == 0) +	    { +		tl = 0; +		bl = 0; +	    } +	    else +	    { +		tl = convert_pixel (row1, 0) | mask1; +		bl = convert_pixel (row2, 0) | mask2; +	    } + +	    if (x1 == width - 1) +	    { +		tr = 0; +		br = 0; +	    } +	    else +	    { +		tr = convert_pixel (row1, 1) | mask1; +		br = convert_pixel (row2, 1) | mask2; +	    } +	} + +	buffer[i] = bilinear_interpolation ( +	    tl, tr, bl, br, distx, disty); + +    next: +	x += ux; +	y += uy; +    } +} + +static force_inline void +bits_image_fetch_nearest_affine (pixman_image_t * image, +				 int              offset, +				 int              line, +				 int              width, +				 uint32_t *       buffer, +				 const uint32_t * mask, +				  +				 convert_pixel_t	convert_pixel, +				 pixman_format_code_t	format, +				 pixman_repeat_t	repeat_mode) +{ +    pixman_fixed_t x, y; +    pixman_fixed_t ux, uy; +    pixman_vector_t v; +    bits_image_t *bits = &image->bits; +    int i; + +    /* reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (!pixman_transform_point_3d (image->common.transform, &v)) +	return; + +    ux = image->common.transform->matrix[0][0]; +    uy = image->common.transform->matrix[1][0]; + +    x = v.vector[0]; +    y = v.vector[1]; + +    for (i = 0; i < width; ++i) +    { +	int width, height, x0, y0; +	const uint8_t *row; + +	if (mask && !mask[i]) +	    goto next; +	 +	width = image->bits.width; +	height = image->bits.height; +	x0 = pixman_fixed_to_int (x - pixman_fixed_e); +	y0 = pixman_fixed_to_int (y - pixman_fixed_e); + +	if (repeat_mode == PIXMAN_REPEAT_NONE && +	    (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width)) +	{ +	    buffer[i] = 0; +	} +	else +	{ +	    uint32_t mask = PIXMAN_FORMAT_A (format)? 
0 : 0xff000000; + +	    if (repeat_mode != PIXMAN_REPEAT_NONE) +	    { +		repeat (repeat_mode, &x0, width); +		repeat (repeat_mode, &y0, height); +	    } + +	    row = (uint8_t *)(bits->bits + bits->rowstride * y0); + +	    buffer[i] = convert_pixel (row, x0) | mask; +	} + +    next: +	x += ux; +	y += uy; +    } +} + +static force_inline uint32_t +convert_a8r8g8b8 (const uint8_t *row, int x) +{ +    return *(((uint32_t *)row) + x); +} + +static force_inline uint32_t +convert_x8r8g8b8 (const uint8_t *row, int x) +{ +    return *(((uint32_t *)row) + x); +} + +static force_inline uint32_t +convert_a8 (const uint8_t *row, int x) +{ +    return (uint32_t) *(row + x) << 24; +} + +static force_inline uint32_t +convert_r5g6b5 (const uint8_t *row, int x) +{ +    return convert_0565_to_0888 (*((uint16_t *)row + x)); +} + +#define MAKE_SEPARABLE_CONVOLUTION_FETCHER(name, format, repeat_mode)  \ +    static uint32_t *							\ +    bits_image_fetch_separable_convolution_affine_ ## name (pixman_iter_t   *iter, \ +							    const uint32_t * mask) \ +    {									\ +	bits_image_fetch_separable_convolution_affine (                 \ +	    iter->image,                                                \ +	    iter->x, iter->y++,                                         \ +	    iter->width,                                                \ +	    iter->buffer, mask,                                         \ +	    convert_ ## format,                                         \ +	    PIXMAN_ ## format,                                          \ +	    repeat_mode);                                               \ +									\ +	return iter->buffer;                                            \ +    } + +#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\ +    static uint32_t *							\ +    bits_image_fetch_bilinear_affine_ ## name (pixman_iter_t   *iter,	\ +					       const uint32_t * mask)	\ +    {									\ +	bits_image_fetch_bilinear_affine (iter->image,			\ +					  iter->x, iter->y++,		\ +					  iter->width,			\ +					  iter->buffer, mask,		\ +					  convert_ ## format,		\ +					  PIXMAN_ ## format,		\ +					  repeat_mode);			\ +	return iter->buffer;						\ +    } + +#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)			\ +    static uint32_t *							\ +    bits_image_fetch_nearest_affine_ ## name (pixman_iter_t   *iter,	\ +					      const uint32_t * mask)	\ +    {									\ +	bits_image_fetch_nearest_affine (iter->image,			\ +					 iter->x, iter->y++,		\ +					 iter->width,			\ +					 iter->buffer, mask,		\ +					 convert_ ## format,		\ +					 PIXMAN_ ## format,		\ +					 repeat_mode);			\ +	return iter->buffer;						\ +    } + +#define MAKE_FETCHERS(name, format, repeat_mode)			\ +    MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\ +    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)			\ +    MAKE_SEPARABLE_CONVOLUTION_FETCHER (name, format, repeat_mode) + +MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE) 
+MAKE_FETCHERS (reflect_a8,	 a8,       PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_a8,	 a8,       PIXMAN_REPEAT_NORMAL) +MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD) +MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE) +MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT) +MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL) + +#define IMAGE_FLAGS							\ +    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\ +     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) + +static const pixman_iter_info_t fast_iters[] =  +{ +    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW | ITER_SRC, +      _pixman_iter_init_bits_stride, fast_fetch_r5g6b5, NULL }, + +    { PIXMAN_r5g6b5, FAST_PATH_STD_DEST_FLAGS, +      ITER_NARROW | ITER_DEST, +      _pixman_iter_init_bits_stride, +      fast_fetch_r5g6b5, fast_write_back_r5g6b5 }, +     +    { PIXMAN_r5g6b5, FAST_PATH_STD_DEST_FLAGS, +      ITER_NARROW | ITER_DEST | ITER_IGNORE_RGB | ITER_IGNORE_ALPHA, +      _pixman_iter_init_bits_stride, +      fast_dest_fetch_noop, fast_write_back_r5g6b5 }, + +    { PIXMAN_a8r8g8b8, +      (FAST_PATH_STANDARD_FLAGS			| +       FAST_PATH_SCALE_TRANSFORM		| +       FAST_PATH_BILINEAR_FILTER		| +       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR), +      ITER_NARROW | ITER_SRC, +      fast_bilinear_cover_iter_init, +      NULL, NULL +    }, + +#define FAST_BILINEAR_FLAGS						\ +    (FAST_PATH_NO_ALPHA_MAP		|				\ +     FAST_PATH_NO_ACCESSORS		|				\ +     FAST_PATH_HAS_TRANSFORM		|				\ +     FAST_PATH_AFFINE_TRANSFORM		|				\ +     FAST_PATH_X_UNIT_POSITIVE		|				\ +     FAST_PATH_Y_UNIT_ZERO		|				\ +     FAST_PATH_NONE_REPEAT		|				\ +     FAST_PATH_BILINEAR_FILTER) + +    { PIXMAN_a8r8g8b8, +      FAST_BILINEAR_FLAGS, +      ITER_NARROW | ITER_SRC, +      NULL, bits_image_fetch_bilinear_no_repeat_8888, NULL +    }, + +    { PIXMAN_x8r8g8b8, +      FAST_BILINEAR_FLAGS, +      ITER_NARROW | ITER_SRC, +      NULL, bits_image_fetch_bilinear_no_repeat_8888, NULL +    }, + +#define GENERAL_BILINEAR_FLAGS						\ +    (FAST_PATH_NO_ALPHA_MAP		|				\ +     FAST_PATH_NO_ACCESSORS		|				\ +     FAST_PATH_HAS_TRANSFORM		|				\ +     FAST_PATH_AFFINE_TRANSFORM		|				\ +     FAST_PATH_BILINEAR_FILTER) + +#define GENERAL_NEAREST_FLAGS						\ +    (FAST_PATH_NO_ALPHA_MAP		|				\ +     FAST_PATH_NO_ACCESSORS		|				\ +     FAST_PATH_HAS_TRANSFORM		|				\ +     FAST_PATH_AFFINE_TRANSFORM		|				\ +     FAST_PATH_NEAREST_FILTER) + +#define GENERAL_SEPARABLE_CONVOLUTION_FLAGS				\ +    (FAST_PATH_NO_ALPHA_MAP            |				\ +     FAST_PATH_NO_ACCESSORS            |				\ +     FAST_PATH_HAS_TRANSFORM           |				\ +     FAST_PATH_AFFINE_TRANSFORM        |				\ +     FAST_PATH_SEPARABLE_CONVOLUTION_FILTER) +     +#define SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)   \ +    { PIXMAN_ ## format,						\ +      GENERAL_SEPARABLE_CONVOLUTION_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \ +      ITER_NARROW | ITER_SRC,						\ +      NULL, bits_image_fetch_separable_convolution_affine_ ## name, NULL \ +    }, + +#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\ +    { PIXMAN_ ## format,						\ +      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\ +      ITER_NARROW | ITER_SRC,						\ +      NULL, bits_image_fetch_bilinear_affine_ ## name, NULL,		\ +    }, + +#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\ +    { PIXMAN_ ## format,						\ +      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\ +      ITER_NARROW | ITER_SRC,					
	\ +      NULL, bits_image_fetch_nearest_affine_ ## name, NULL		\ +    }, + +#define AFFINE_FAST_PATHS(name, format, repeat)				\ +    NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\ +    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\ +    SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat) +     +    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD) +    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE) +    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT) +    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL) +    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD) +    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE) +    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT) +    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL) +    AFFINE_FAST_PATHS (pad_a8, a8, PAD) +    AFFINE_FAST_PATHS (none_a8, a8, NONE) +    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT) +    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL) +    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD) +    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE) +    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT) +    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL) + +    { PIXMAN_null }, +}; + +pixman_implementation_t * +_pixman_implementation_create_fast_path (pixman_implementation_t *fallback) +{ +    pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths); + +    imp->fill = fast_path_fill; +    imp->iter_info = fast_iters; + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-filter.c b/libs/pixman-0.40.0/pixman/pixman-filter.c new file mode 100644 index 0000000..5f3b752 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-filter.c @@ -0,0 +1,478 @@ +/* + * Copyright 2012, Red Hat, Inc. + * Copyright 2012, Soren Sandmann + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Soren Sandmann <soren.sandmann@gmail.com> + */ +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include <assert.h> +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "pixman-private.h" + +typedef double (* kernel_func_t) (double x); + +typedef struct +{ +    pixman_kernel_t	kernel; +    kernel_func_t	func; +    double		width; +} filter_info_t; + +static double +impulse_kernel (double x) +{ +    return (x == 0.0)? 
1.0 : 0.0; +} + +static double +box_kernel (double x) +{ +    return 1; +} + +static double +linear_kernel (double x) +{ +    return 1 - fabs (x); +} + +static double +gaussian_kernel (double x) +{ +#define SQRT2 (1.4142135623730950488016887242096980785696718753769480) +#define SIGMA (SQRT2 / 2.0) +     +    return exp (- x * x / (2 * SIGMA * SIGMA)) / (SIGMA * sqrt (2.0 * M_PI)); +} + +static double +sinc (double x) +{ +    if (x == 0.0) +	return 1.0; +    else +	return sin (M_PI * x) / (M_PI * x); +} + +static double +lanczos (double x, int n) +{ +    return sinc (x) * sinc (x * (1.0 / n)); +} + +static double +lanczos2_kernel (double x) +{ +    return lanczos (x, 2); +} + +static double +lanczos3_kernel (double x) +{ +    return lanczos (x, 3); +} + +static double +nice_kernel (double x) +{ +    return lanczos3_kernel (x * 0.75); +} + +static double +general_cubic (double x, double B, double C) +{ +    double ax = fabs(x); + +    if (ax < 1) +    { +	return (((12 - 9 * B - 6 * C) * ax + +		 (-18 + 12 * B + 6 * C)) * ax * ax + +		(6 - 2 * B)) / 6; +    } +    else if (ax < 2) +    { +	return ((((-B - 6 * C) * ax + +		  (6 * B + 30 * C)) * ax + +		 (-12 * B - 48 * C)) * ax + +		(8 * B + 24 * C)) / 6; +    } +    else +    { +	return 0; +    } +} + +static double +cubic_kernel (double x) +{ +    /* This is the Mitchell-Netravali filter. +     * +     * (0.0, 0.5) would give us the Catmull-Rom spline, +     * but that one seems to be indistinguishable from Lanczos2. +     */ +    return general_cubic (x, 1/3.0, 1/3.0); +} + +static const filter_info_t filters[] = +{ +    { PIXMAN_KERNEL_IMPULSE,	        impulse_kernel,   0.0 }, +    { PIXMAN_KERNEL_BOX,	        box_kernel,       1.0 }, +    { PIXMAN_KERNEL_LINEAR,	        linear_kernel,    2.0 }, +    { PIXMAN_KERNEL_CUBIC,		cubic_kernel,     4.0 }, +    { PIXMAN_KERNEL_GAUSSIAN,	        gaussian_kernel,  5.0 }, +    { PIXMAN_KERNEL_LANCZOS2,	        lanczos2_kernel,  4.0 }, +    { PIXMAN_KERNEL_LANCZOS3,	        lanczos3_kernel,  6.0 }, +    { PIXMAN_KERNEL_LANCZOS3_STRETCHED, nice_kernel,      8.0 }, +}; + +/* This function scales @kernel2 by @scale, then + * aligns @x1 in @kernel1 with @x2 in @kernel2 and + * and integrates the product of the kernels across @width. + * + * This function assumes that the intervals are within + * the kernels in question. E.g., the caller must not + * try to integrate a linear kernel ouside of [-1:1] + */ +static double +integral (pixman_kernel_t kernel1, double x1, +	  pixman_kernel_t kernel2, double scale, double x2, +	  double width) +{ +    if (kernel1 == PIXMAN_KERNEL_BOX && kernel2 == PIXMAN_KERNEL_BOX) +    { +	return width; +    } +    /* The LINEAR filter is not differentiable at 0, so if the +     * integration interval crosses zero, break it into two +     * separate integrals. 
+     */ +    else if (kernel1 == PIXMAN_KERNEL_LINEAR && x1 < 0 && x1 + width > 0) +    { +	return +	    integral (kernel1, x1, kernel2, scale, x2, - x1) + +	    integral (kernel1, 0, kernel2, scale, x2 - x1, width + x1); +    } +    else if (kernel2 == PIXMAN_KERNEL_LINEAR && x2 < 0 && x2 + width > 0) +    { +	return +	    integral (kernel1, x1, kernel2, scale, x2, - x2) + +	    integral (kernel1, x1 - x2, kernel2, scale, 0, width + x2); +    } +    else if (kernel1 == PIXMAN_KERNEL_IMPULSE) +    { +	assert (width == 0.0); +	return filters[kernel2].func (x2 * scale); +    } +    else if (kernel2 == PIXMAN_KERNEL_IMPULSE) +    { +	assert (width == 0.0); +	return filters[kernel1].func (x1); +    } +    else +    { +	/* Integration via Simpson's rule +	 * See http://www.intmath.com/integration/6-simpsons-rule.php +	 * 12 segments (6 cubic approximations) seems to produce best +	 * result for lanczos3.linear, which was the combination that +	 * showed the most errors.  This makes sense as the lanczos3 +	 * filter is 6 wide. +	 */ +#define N_SEGMENTS 12 +#define SAMPLE(a1, a2)							\ +	(filters[kernel1].func ((a1)) * filters[kernel2].func ((a2) * scale)) +	 +	double s = 0.0; +	double h = width / N_SEGMENTS; +	int i; + +	s = SAMPLE (x1, x2); + +	for (i = 1; i < N_SEGMENTS; i += 2) +	{ +	    double a1 = x1 + h * i; +	    double a2 = x2 + h * i; +	    s += 4 * SAMPLE (a1, a2); +	} + +	for (i = 2; i < N_SEGMENTS; i += 2) +	{ +	    double a1 = x1 + h * i; +	    double a2 = x2 + h * i; +	    s += 2 * SAMPLE (a1, a2); +	} + +	s += SAMPLE (x1 + width, x2 + width); +	 +	return h * s * (1.0 / 3.0); +    } +} + +static void +create_1d_filter (int              width, +		  pixman_kernel_t  reconstruct, +		  pixman_kernel_t  sample, +		  double           scale, +		  int              n_phases, +		  pixman_fixed_t *p) +{ +    double step; +    int i; + +    step = 1.0 / n_phases; + +    for (i = 0; i < n_phases; ++i) +    { +        double frac = step / 2.0 + i * step; +	pixman_fixed_t new_total; +        int x, x1, x2; +	double total, e; + +	/* Sample convolution of reconstruction and sampling +	 * filter. See rounding.txt regarding the rounding +	 * and sample positions. +	 */ + +	x1 = ceil (frac - width / 2.0 - 0.5); +	x2 = x1 + width; + +	total = 0; +        for (x = x1; x < x2; ++x) +        { +	    double pos = x + 0.5 - frac; +	    double rlow = - filters[reconstruct].width / 2.0; +	    double rhigh = rlow + filters[reconstruct].width; +	    double slow = pos - scale * filters[sample].width / 2.0; +	    double shigh = slow + scale * filters[sample].width; +	    double c = 0.0; +	    double ilow, ihigh; + +	    if (rhigh >= slow && rlow <= shigh) +	    { +		ilow = MAX (slow, rlow); +		ihigh = MIN (shigh, rhigh); + +		c = integral (reconstruct, ilow, +			      sample, 1.0 / scale, ilow - pos, +			      ihigh - ilow); +	    } + +            *p = (pixman_fixed_t)floor (c * 65536.0 + 0.5); +	    total += *p; +	    p++; +        } + +	/* Normalize, with error diffusion */ +	p -= width; +        total = 65536.0 / total; +        new_total = 0; +	e = 0.0; +	for (x = x1; x < x2; ++x) +	{ +	    double v = (*p) * total + e; +	    pixman_fixed_t t = floor (v + 0.5); + +	    e = v - t; +	    new_total += t; +	    *p++ = t; +	} + +	/* pixman_fixed_e's worth of error may remain; put it +	 * at the first sample, since that is the only one that +	 * hasn't had any error diffused into it. 
+	 */ +	*(p - width) += pixman_fixed_1 - new_total; +    } +} + + +static int +filter_width (pixman_kernel_t reconstruct, pixman_kernel_t sample, double size) +{ +    return ceil (filters[reconstruct].width + size * filters[sample].width); +} + +#ifdef PIXMAN_GNUPLOT + +/* If enable-gnuplot is configured, then you can pipe the output of a + * pixman-using program to gnuplot and get a continuously-updated plot + * of the horizontal filter. This works well with demos/scale to test + * the filter generation. + * + * The plot is all the different subposition filters shuffled + * together. This is misleading in a few cases: + * + *  IMPULSE.BOX - goes up and down as the subfilters have different + *		  numbers of non-zero samples + *  IMPULSE.TRIANGLE - somewhat crooked for the same reason + *  1-wide filters - looks triangular, but a 1-wide box would be more + *		     accurate + */ +static void +gnuplot_filter (int width, int n_phases, const pixman_fixed_t* p) +{ +    double step; +    int i, j; +    int first; + +    step = 1.0 / n_phases; + +    printf ("set style line 1 lc rgb '#0060ad' lt 1 lw 0.5 pt 7 pi 1 ps 0.5\n"); +    printf ("plot [x=%g:%g] '-' with linespoints ls 1\n", -width*0.5, width*0.5); +    /* Print a point at the origin so that y==0 line is included: */ +    printf ("0 0\n\n"); + +    /* The position of the first sample of the phase corresponding to +     * frac is given by: +     *  +     *     ceil (frac - width / 2.0 - 0.5) + 0.5 - frac +     *  +     * We have to find the frac that minimizes this expression. +     *  +     * For odd widths, we have +     *  +     *     ceil (frac - width / 2.0 - 0.5) + 0.5 - frac +     *   = ceil (frac) + K - frac +     *   = 1 + K - frac +     *  +     * for some K, so this is minimized when frac is maximized and +     * strictly growing with frac. So for odd widths, we can simply +     * start at the last phase and go backwards. +     *  +     * For even widths, we have +     *  +     *     ceil (frac - width / 2.0 - 0.5) + 0.5 - frac +     *   = ceil (frac - 0.5) + K - frac +     *  +     * The graph for this function (ignoring K) looks like this: +     *  +     *        0.5 +     *           |    |\  +     *           |    | \  +     *           |    |  \  +     *         0 |    |   \  +     *           |\   | +     *           | \  | +     *           |  \ | +     *      -0.5 |   \| +     *   --------------------------------- +     *           0    0.5   1 +     *  +     * So in this case we need to start with the phase whose frac is +     * less than, but as close as possible to 0.5, then go backwards +     * until we hit the first phase, then wrap around to the last +     * phase and continue backwards. +     *  +     * Which phase is as close as possible 0.5? The locations of the +     * sampling point corresponding to the kth phase is given by +     * 1/(2 * n_phases) + k / n_phases: +     *  +     *         1/(2 * n_phases) + k / n_phases = 0.5 +     *   +     * from which it follows that +     *  +     *         k = (n_phases - 1) / 2 +     *  +     * rounded down is the phase in question. 
+     */ +    if (width & 1) +	first = n_phases - 1; +    else +	first = (n_phases - 1) / 2; + +    for (j = 0; j < width; ++j) +    { +	for (i = 0; i < n_phases; ++i) +	{ +	    int phase = first - i; +	    double frac, pos; + +	    if (phase < 0) +		phase = n_phases + phase; + +	    frac = step / 2.0 + phase * step; +	    pos = ceil (frac - width / 2.0 - 0.5) + 0.5 - frac + j; + +	    printf ("%g %g\n", +		    pos, +		    pixman_fixed_to_double (*(p + phase * width + j))); +	} +    } + +    printf ("e\n"); +    fflush (stdout); +} + +#endif + +/* Create the parameter list for a SEPARABLE_CONVOLUTION filter + * with the given kernels and scale parameters + */ +PIXMAN_EXPORT pixman_fixed_t * +pixman_filter_create_separable_convolution (int             *n_values, +					    pixman_fixed_t   scale_x, +					    pixman_fixed_t   scale_y, +					    pixman_kernel_t  reconstruct_x, +					    pixman_kernel_t  reconstruct_y, +					    pixman_kernel_t  sample_x, +					    pixman_kernel_t  sample_y, +					    int              subsample_bits_x, +					    int	             subsample_bits_y) +{ +    double sx = fabs (pixman_fixed_to_double (scale_x)); +    double sy = fabs (pixman_fixed_to_double (scale_y)); +    pixman_fixed_t *params; +    int subsample_x, subsample_y; +    int width, height; + +    width = filter_width (reconstruct_x, sample_x, sx); +    subsample_x = (1 << subsample_bits_x); + +    height = filter_width (reconstruct_y, sample_y, sy); +    subsample_y = (1 << subsample_bits_y); + +    *n_values = 4 + width * subsample_x + height * subsample_y; +     +    params = malloc (*n_values * sizeof (pixman_fixed_t)); +    if (!params) +	return NULL; + +    params[0] = pixman_int_to_fixed (width); +    params[1] = pixman_int_to_fixed (height); +    params[2] = pixman_int_to_fixed (subsample_bits_x); +    params[3] = pixman_int_to_fixed (subsample_bits_y); + +    create_1d_filter (width, reconstruct_x, sample_x, sx, subsample_x, +		      params + 4); +    create_1d_filter (height, reconstruct_y, sample_y, sy, subsample_y, +		      params + 4 + width * subsample_x); + +#ifdef PIXMAN_GNUPLOT +    gnuplot_filter(width, subsample_x, params + 4); +#endif + +    return params; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-general.c b/libs/pixman-0.40.0/pixman/pixman-general.c new file mode 100644 index 0000000..7e5a0d0 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-general.c @@ -0,0 +1,264 @@ +/* + * Copyright © 2009 Red Hat, Inc. + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + *             2005 Lars Knoll & Zack Rusin, Trolltech + *             2008 Aaron Plattner, NVIDIA Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  Red Hat makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. 
+ * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "pixman-private.h" + +static void +general_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *info) +{ +    pixman_image_t *image = iter->image; + +    switch (image->type) +    { +    case BITS: +        if ((iter->iter_flags & ITER_SRC) == ITER_SRC) +            _pixman_bits_image_src_iter_init (image, iter); +        else +            _pixman_bits_image_dest_iter_init (image, iter); +        break; + +    case LINEAR: +        _pixman_linear_gradient_iter_init (image, iter); +        break; + +    case RADIAL: +	_pixman_radial_gradient_iter_init (image, iter); +        break; + +    case CONICAL: +	_pixman_conical_gradient_iter_init (image, iter); +        break; + +    case SOLID: +        _pixman_log_error (FUNC, "Solid image not handled by noop"); +        break; + +    default: +	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n"); +        break; +    } +} + +static const pixman_iter_info_t general_iters[] = +{ +    { PIXMAN_any, 0, 0, general_iter_init, NULL, NULL }, +    { PIXMAN_null }, +}; + +typedef struct op_info_t op_info_t; +struct op_info_t +{ +    uint8_t src, dst; +}; + +#define ITER_IGNORE_BOTH						\ +    (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA) + +static const op_info_t op_flags[PIXMAN_N_OPERATORS] = +{ +    /* Src                   Dst                   */ +    { ITER_IGNORE_BOTH,      ITER_IGNORE_BOTH      }, /* CLEAR */ +    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_BOTH      }, /* SRC */ +    { ITER_IGNORE_BOTH,      ITER_LOCALIZED_ALPHA  }, /* DST */ +    { 0,                     ITER_LOCALIZED_ALPHA  }, /* OVER */ +    { ITER_LOCALIZED_ALPHA,  0                     }, /* OVER_REVERSE */ +    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* IN */ +    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* IN_REVERSE */ +    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* OUT */ +    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* OUT_REVERSE */ +    { 0,                     0                     }, /* ATOP */ +    { 0,                     0                     }, /* ATOP_REVERSE */ +    { 0,                     0                     }, /* XOR */ +    { ITER_LOCALIZED_ALPHA,  ITER_LOCALIZED_ALPHA  }, /* ADD */ +    { 0,                     0                     }, /* SATURATE */ +}; + +#define SCANLINE_BUFFER_LENGTH 8192 + +static pixman_bool_t +operator_needs_division (pixman_op_t op) +{ +    static const uint8_t needs_division[] = +    { +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, /* SATURATE */ +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* DISJOINT */ +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* CONJOINT */ +	0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, /* blend ops */ +    }; + +    return needs_division[op]; +} + +static void +general_composite_rect  (pixman_implementation_t *imp, +                  
       pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t stack_scanline_buffer[3 * SCANLINE_BUFFER_LENGTH]; +    uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer; +    uint8_t *src_buffer, *mask_buffer, *dest_buffer; +    pixman_iter_t src_iter, mask_iter, dest_iter; +    pixman_combine_32_func_t compose; +    pixman_bool_t component_alpha; +    iter_flags_t width_flag, src_iter_flags; +    int Bpp; +    int i; + +    if ((src_image->common.flags & FAST_PATH_NARROW_FORMAT)		     && +	(!mask_image || mask_image->common.flags & FAST_PATH_NARROW_FORMAT)  && +	(dest_image->common.flags & FAST_PATH_NARROW_FORMAT)		     && +	!(operator_needs_division (op))                                      && +	(dest_image->bits.dither == PIXMAN_DITHER_NONE)) +    { +	width_flag = ITER_NARROW; +	Bpp = 4; +    } +    else +    { +	width_flag = ITER_WIDE; +	Bpp = 16; +    } + +#define ALIGN(addr)							\ +    ((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15))) + +    if (width <= 0 || _pixman_multiply_overflows_int (width, Bpp * 3)) +	return; + +    if (width * Bpp * 3 > sizeof (stack_scanline_buffer) - 15 * 3) +    { +	scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 15 * 3); + +	if (!scanline_buffer) +	    return; + +	memset (scanline_buffer, 0, width * Bpp * 3 + 15 * 3); +    } +    else +    { +	memset (stack_scanline_buffer, 0, sizeof (stack_scanline_buffer)); +    } + +    src_buffer = ALIGN (scanline_buffer); +    mask_buffer = ALIGN (src_buffer + width * Bpp); +    dest_buffer = ALIGN (mask_buffer + width * Bpp); + +    if (width_flag == ITER_WIDE) +    { +	/* To make sure there aren't any NANs in the buffers */ +	memset (src_buffer, 0, width * Bpp); +	memset (mask_buffer, 0, width * Bpp); +	memset (dest_buffer, 0, width * Bpp); +    } +     +    /* src iter */ +    src_iter_flags = width_flag | op_flags[op].src | ITER_SRC; + +    _pixman_implementation_iter_init (imp->toplevel, &src_iter, src_image, +                                      src_x, src_y, width, height, +                                      src_buffer, src_iter_flags, +                                      info->src_flags); + +    /* mask iter */ +    if ((src_iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) == +	(ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) +    { +	/* If it doesn't matter what the source is, then it doesn't matter +	 * what the mask is +	 */ +	mask_image = NULL; +    } + +    component_alpha = mask_image && mask_image->common.component_alpha; + +    _pixman_implementation_iter_init ( +	imp->toplevel, &mask_iter, +	mask_image, mask_x, mask_y, width, height, mask_buffer, +	ITER_SRC | width_flag | (component_alpha? 
0 : ITER_IGNORE_RGB), +	info->mask_flags); + +    /* dest iter */ +    _pixman_implementation_iter_init ( +	imp->toplevel, &dest_iter, dest_image, dest_x, dest_y, width, height, +	dest_buffer, ITER_DEST | width_flag | op_flags[op].dst, info->dest_flags); + +    compose = _pixman_implementation_lookup_combiner ( +	imp->toplevel, op, component_alpha, width_flag != ITER_WIDE); + +    for (i = 0; i < height; ++i) +    { +	uint32_t *s, *m, *d; + +	m = mask_iter.get_scanline (&mask_iter, NULL); +	s = src_iter.get_scanline (&src_iter, m); +	d = dest_iter.get_scanline (&dest_iter, NULL); + +	compose (imp->toplevel, op, d, s, m, width); + +	dest_iter.write_back (&dest_iter); +    } + +    if (src_iter.fini) +	src_iter.fini (&src_iter); +    if (mask_iter.fini) +	mask_iter.fini (&mask_iter); +    if (dest_iter.fini) +	dest_iter.fini (&dest_iter); +     +    if (scanline_buffer != (uint8_t *) stack_scanline_buffer) +	free (scanline_buffer); +} + +static const pixman_fast_path_t general_fast_path[] = +{ +    { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any,	0, PIXMAN_any, 0, general_composite_rect }, +    { PIXMAN_OP_NONE } +}; + +pixman_implementation_t * +_pixman_implementation_create_general (void) +{ +    pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path); + +    _pixman_setup_combiner_functions_32 (imp); +    _pixman_setup_combiner_functions_float (imp); + +    imp->iter_info = general_iters; + +    return imp; +} + diff --git a/libs/pixman-0.40.0/pixman/pixman-glyph.c b/libs/pixman-0.40.0/pixman/pixman-glyph.c new file mode 100644 index 0000000..96a349a --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-glyph.c @@ -0,0 +1,676 @@ +/* + * Copyright 2010, 2012, Soren Sandmann <sandmann@cs.au.dk> + * Copyright 2010, 2011, 2012, Red Hat, Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Soren Sandmann <sandmann@cs.au.dk> + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "pixman-private.h" + +#include <stdlib.h> + +typedef struct glyph_metrics_t glyph_metrics_t; +typedef struct glyph_t glyph_t; + +#define TOMBSTONE ((glyph_t *)0x1) + +/* XXX: These numbers are arbitrary---we've never done any measurements. 
+ */ +#define N_GLYPHS_HIGH_WATER  (16384) +#define N_GLYPHS_LOW_WATER   (8192) +#define HASH_SIZE (2 * N_GLYPHS_HIGH_WATER) +#define HASH_MASK (HASH_SIZE - 1) + +struct glyph_t +{ +    void *		font_key; +    void *		glyph_key; +    int			origin_x; +    int			origin_y; +    pixman_image_t *	image; +    pixman_link_t	mru_link; +}; + +struct pixman_glyph_cache_t +{ +    int			n_glyphs; +    int			n_tombstones; +    int			freeze_count; +    pixman_list_t	mru; +    glyph_t *		glyphs[HASH_SIZE]; +}; + +static void +free_glyph (glyph_t *glyph) +{ +    pixman_list_unlink (&glyph->mru_link); +    pixman_image_unref (glyph->image); +    free (glyph); +} + +static unsigned int +hash (const void *font_key, const void *glyph_key) +{ +    size_t key = (size_t)font_key + (size_t)glyph_key; + +    /* This hash function is based on one found on Thomas Wang's +     * web page at +     * +     *    http://www.concentric.net/~Ttwang/tech/inthash.htm +     * +     */ +    key = (key << 15) - key - 1; +    key = key ^ (key >> 12); +    key = key + (key << 2); +    key = key ^ (key >> 4); +    key = key + (key << 3) + (key << 11); +    key = key ^ (key >> 16); + +    return key; +} + +static glyph_t * +lookup_glyph (pixman_glyph_cache_t *cache, +	      void                 *font_key, +	      void                 *glyph_key) +{ +    unsigned idx; +    glyph_t *g; + +    idx = hash (font_key, glyph_key); +    while ((g = cache->glyphs[idx++ & HASH_MASK])) +    { +	if (g != TOMBSTONE			&& +	    g->font_key == font_key		&& +	    g->glyph_key == glyph_key) +	{ +	    return g; +	} +    } + +    return NULL; +} + +static void +insert_glyph (pixman_glyph_cache_t *cache, +	      glyph_t              *glyph) +{ +    unsigned idx; +    glyph_t **loc; + +    idx = hash (glyph->font_key, glyph->glyph_key); + +    /* Note: we assume that there is room in the table. If there isn't, +     * this will be an infinite loop. 
+     */ +    do +    { +	loc = &cache->glyphs[idx++ & HASH_MASK]; +    } while (*loc && *loc != TOMBSTONE); + +    if (*loc == TOMBSTONE) +	cache->n_tombstones--; +    cache->n_glyphs++; + +    *loc = glyph; +} + +static void +remove_glyph (pixman_glyph_cache_t *cache, +	      glyph_t              *glyph) +{ +    unsigned idx; + +    idx = hash (glyph->font_key, glyph->glyph_key); +    while (cache->glyphs[idx & HASH_MASK] != glyph) +	idx++; + +    cache->glyphs[idx & HASH_MASK] = TOMBSTONE; +    cache->n_tombstones++; +    cache->n_glyphs--; + +    /* Eliminate tombstones if possible */ +    if (cache->glyphs[(idx + 1) & HASH_MASK] == NULL) +    { +	while (cache->glyphs[idx & HASH_MASK] == TOMBSTONE) +	{ +	    cache->glyphs[idx & HASH_MASK] = NULL; +	    cache->n_tombstones--; +	    idx--; +	} +    } +} + +static void +clear_table (pixman_glyph_cache_t *cache) +{ +    int i; + +    for (i = 0; i < HASH_SIZE; ++i) +    { +	glyph_t *glyph = cache->glyphs[i]; + +	if (glyph && glyph != TOMBSTONE) +	    free_glyph (glyph); + +	cache->glyphs[i] = NULL; +    } + +    cache->n_glyphs = 0; +    cache->n_tombstones = 0; +} + +PIXMAN_EXPORT pixman_glyph_cache_t * +pixman_glyph_cache_create (void) +{ +    pixman_glyph_cache_t *cache; + +    if (!(cache = malloc (sizeof *cache))) +	return NULL; + +    memset (cache->glyphs, 0, sizeof (cache->glyphs)); +    cache->n_glyphs = 0; +    cache->n_tombstones = 0; +    cache->freeze_count = 0; + +    pixman_list_init (&cache->mru); + +    return cache; +} + +PIXMAN_EXPORT void +pixman_glyph_cache_destroy (pixman_glyph_cache_t *cache) +{ +    return_if_fail (cache->freeze_count == 0); + +    clear_table (cache); + +    free (cache); +} + +PIXMAN_EXPORT void +pixman_glyph_cache_freeze (pixman_glyph_cache_t  *cache) +{ +    cache->freeze_count++; +} + +PIXMAN_EXPORT void +pixman_glyph_cache_thaw (pixman_glyph_cache_t  *cache) +{ +    if (--cache->freeze_count == 0					&& +	cache->n_glyphs + cache->n_tombstones > N_GLYPHS_HIGH_WATER) +    { +	if (cache->n_tombstones > N_GLYPHS_HIGH_WATER) +	{ +	    /* More than half the entries are +	     * tombstones. Just dump the whole table. 
+	     */ +	    clear_table (cache); +	} + +	while (cache->n_glyphs > N_GLYPHS_LOW_WATER) +	{ +	    glyph_t *glyph = CONTAINER_OF (glyph_t, mru_link, cache->mru.tail); + +	    remove_glyph (cache, glyph); +	    free_glyph (glyph); +	} +    } +} + +PIXMAN_EXPORT const void * +pixman_glyph_cache_lookup (pixman_glyph_cache_t  *cache, +			   void                  *font_key, +			   void                  *glyph_key) +{ +    return lookup_glyph (cache, font_key, glyph_key); +} + +PIXMAN_EXPORT const void * +pixman_glyph_cache_insert (pixman_glyph_cache_t  *cache, +			   void                  *font_key, +			   void                  *glyph_key, +			   int			  origin_x, +			   int                    origin_y, +			   pixman_image_t        *image) +{ +    glyph_t *glyph; +    int32_t width, height; + +    return_val_if_fail (cache->freeze_count > 0, NULL); +    return_val_if_fail (image->type == BITS, NULL); + +    width = image->bits.width; +    height = image->bits.height; + +    if (cache->n_glyphs >= HASH_SIZE) +	return NULL; + +    if (!(glyph = malloc (sizeof *glyph))) +	return NULL; + +    glyph->font_key = font_key; +    glyph->glyph_key = glyph_key; +    glyph->origin_x = origin_x; +    glyph->origin_y = origin_y; + +    if (!(glyph->image = pixman_image_create_bits ( +	      image->bits.format, width, height, NULL, -1))) +    { +	free (glyph); +	return NULL; +    } + +    pixman_image_composite32 (PIXMAN_OP_SRC, +			      image, NULL, glyph->image, 0, 0, 0, 0, 0, 0, +			      width, height); + +    if (PIXMAN_FORMAT_A   (glyph->image->bits.format) != 0	&& +	PIXMAN_FORMAT_RGB (glyph->image->bits.format) != 0) +    { +	pixman_image_set_component_alpha (glyph->image, TRUE); +    } + +    pixman_list_prepend (&cache->mru, &glyph->mru_link); + +    _pixman_image_validate (glyph->image); +    insert_glyph (cache, glyph); + +    return glyph; +} + +PIXMAN_EXPORT void +pixman_glyph_cache_remove (pixman_glyph_cache_t  *cache, +			   void                  *font_key, +			   void                  *glyph_key) +{ +    glyph_t *glyph; + +    if ((glyph = lookup_glyph (cache, font_key, glyph_key))) +    { +	remove_glyph (cache, glyph); + +	free_glyph (glyph); +    } +} + +PIXMAN_EXPORT void +pixman_glyph_get_extents (pixman_glyph_cache_t *cache, +			  int                   n_glyphs, +			  pixman_glyph_t       *glyphs, +			  pixman_box32_t       *extents) +{ +    int i; + +    extents->x1 = extents->y1 = INT32_MAX; +    extents->x2 = extents->y2 = INT32_MIN; + +    for (i = 0; i < n_glyphs; ++i) +    { +	glyph_t *glyph = (glyph_t *)glyphs[i].glyph; +	int x1, y1, x2, y2; + +	x1 = glyphs[i].x - glyph->origin_x; +	y1 = glyphs[i].y - glyph->origin_y; +	x2 = glyphs[i].x - glyph->origin_x + glyph->image->bits.width; +	y2 = glyphs[i].y - glyph->origin_y + glyph->image->bits.height; + +	if (x1 < extents->x1) +	    extents->x1 = x1; +	if (y1 < extents->y1) +	    extents->y1 = y1; +	if (x2 > extents->x2) +	    extents->x2 = x2; +	if (y2 > extents->y2) +	    extents->y2 = y2; +    } +} + +/* This function returns a format that is suitable for use as a mask for the + * set of glyphs in question. 
+ */ +PIXMAN_EXPORT pixman_format_code_t +pixman_glyph_get_mask_format (pixman_glyph_cache_t *cache, +			      int		    n_glyphs, +			      const pixman_glyph_t *glyphs) +{ +    pixman_format_code_t format = PIXMAN_a1; +    int i; + +    for (i = 0; i < n_glyphs; ++i) +    { +	const glyph_t *glyph = glyphs[i].glyph; +	pixman_format_code_t glyph_format = glyph->image->bits.format; + +	if (PIXMAN_FORMAT_TYPE (glyph_format) == PIXMAN_TYPE_A) +	{ +	    if (PIXMAN_FORMAT_A (glyph_format) > PIXMAN_FORMAT_A (format)) +		format = glyph_format; +	} +	else +	{ +	    return PIXMAN_a8r8g8b8; +	} +    } + +    return format; +} + +static pixman_bool_t +box32_intersect (pixman_box32_t *dest, +		 const pixman_box32_t *box1, +		 const pixman_box32_t *box2) +{ +    dest->x1 = MAX (box1->x1, box2->x1); +    dest->y1 = MAX (box1->y1, box2->y1); +    dest->x2 = MIN (box1->x2, box2->x2); +    dest->y2 = MIN (box1->y2, box2->y2); + +    return dest->x2 > dest->x1 && dest->y2 > dest->y1; +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +PIXMAN_EXPORT void +pixman_composite_glyphs_no_mask (pixman_op_t            op, +				 pixman_image_t        *src, +				 pixman_image_t        *dest, +				 int32_t                src_x, +				 int32_t                src_y, +				 int32_t                dest_x, +				 int32_t                dest_y, +				 pixman_glyph_cache_t  *cache, +				 int                    n_glyphs, +				 const pixman_glyph_t  *glyphs) +{ +    pixman_region32_t region; +    pixman_format_code_t glyph_format = PIXMAN_null; +    uint32_t glyph_flags = 0; +    pixman_format_code_t dest_format; +    uint32_t dest_flags; +    pixman_composite_func_t func = NULL; +    pixman_implementation_t *implementation = NULL; +    pixman_composite_info_t info; +    int i; + +    _pixman_image_validate (src); +    _pixman_image_validate (dest); +     +    dest_format = dest->common.extended_format_code; +    dest_flags = dest->common.flags; +     +    pixman_region32_init (®ion); +    if (!_pixman_compute_composite_region32 ( +	    ®ion, +	    src, NULL, dest, +	    src_x - dest_x, src_y - dest_y, 0, 0, 0, 0, +	    dest->bits.width, dest->bits.height)) +    { +	goto out; +    } + +    info.op = op; +    info.src_image = src; +    info.dest_image = dest; +    info.src_flags = src->common.flags; +    info.dest_flags = dest->common.flags; + +    for (i = 0; i < n_glyphs; ++i) +    { +	glyph_t *glyph = (glyph_t *)glyphs[i].glyph; +	pixman_image_t *glyph_img = glyph->image; +	pixman_box32_t glyph_box; +	pixman_box32_t *pbox; +	uint32_t extra = FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; +	pixman_box32_t composite_box; +	int n; + +	glyph_box.x1 = dest_x + glyphs[i].x - glyph->origin_x; +	glyph_box.y1 = dest_y + glyphs[i].y - glyph->origin_y; +	glyph_box.x2 = glyph_box.x1 + glyph->image->bits.width; +	glyph_box.y2 = glyph_box.y1 + glyph->image->bits.height; +	 +	pbox = pixman_region32_rectangles (®ion, &n); +	 +	info.mask_image = glyph_img; + +	while (n--) +	{ +	    if (box32_intersect (&composite_box, pbox, &glyph_box)) +	    { +		if (glyph_img->common.extended_format_code != glyph_format	|| +		    glyph_img->common.flags != glyph_flags) +		{ +		    glyph_format = glyph_img->common.extended_format_code; +		    glyph_flags = glyph_img->common.flags; + +		    _pixman_implementation_lookup_composite ( +			get_implementation(), op, +			src->common.extended_format_code, src->common.flags, +			glyph_format, glyph_flags | extra, +			dest_format, dest_flags, +			
&implementation, &func); +		} + +		info.src_x = src_x + composite_box.x1 - dest_x; +		info.src_y = src_y + composite_box.y1 - dest_y; +		info.mask_x = composite_box.x1 - (dest_x + glyphs[i].x - glyph->origin_x); +		info.mask_y = composite_box.y1 - (dest_y + glyphs[i].y - glyph->origin_y); +		info.dest_x = composite_box.x1; +		info.dest_y = composite_box.y1; +		info.width = composite_box.x2 - composite_box.x1; +		info.height = composite_box.y2 - composite_box.y1; + +		info.mask_flags = glyph_flags; + +		func (implementation, &info); +	    } + +	    pbox++; +	} +	pixman_list_move_to_front (&cache->mru, &glyph->mru_link); +    } + +out: +    pixman_region32_fini (®ion); +} + +static void +add_glyphs (pixman_glyph_cache_t *cache, +	    pixman_image_t *dest, +	    int off_x, int off_y, +	    int n_glyphs, const pixman_glyph_t *glyphs) +{ +    pixman_format_code_t glyph_format = PIXMAN_null; +    uint32_t glyph_flags = 0; +    pixman_composite_func_t func = NULL; +    pixman_implementation_t *implementation = NULL; +    pixman_format_code_t dest_format; +    uint32_t dest_flags; +    pixman_box32_t dest_box; +    pixman_composite_info_t info; +    pixman_image_t *white_img = NULL; +    pixman_bool_t white_src = FALSE; +    int i; + +    _pixman_image_validate (dest); + +    dest_format = dest->common.extended_format_code; +    dest_flags = dest->common.flags; + +    info.op = PIXMAN_OP_ADD; +    info.dest_image = dest; +    info.src_x = 0; +    info.src_y = 0; +    info.dest_flags = dest_flags; + +    dest_box.x1 = 0; +    dest_box.y1 = 0; +    dest_box.x2 = dest->bits.width; +    dest_box.y2 = dest->bits.height; + +    for (i = 0; i < n_glyphs; ++i) +    { +	glyph_t *glyph = (glyph_t *)glyphs[i].glyph; +	pixman_image_t *glyph_img = glyph->image; +	pixman_box32_t glyph_box; +	pixman_box32_t composite_box; + +	if (glyph_img->common.extended_format_code != glyph_format	|| +	    glyph_img->common.flags != glyph_flags) +	{ +	    pixman_format_code_t src_format, mask_format; + +	    glyph_format = glyph_img->common.extended_format_code; +	    glyph_flags = glyph_img->common.flags; + +	    if (glyph_format == dest->bits.format) +	    { +		src_format = glyph_format; +		mask_format = PIXMAN_null; +		info.src_flags = glyph_flags | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; +		info.mask_flags = FAST_PATH_IS_OPAQUE; +		info.mask_image = NULL; +		white_src = FALSE; +	    } +	    else +	    { +		if (!white_img) +		{ +		    static const pixman_color_t white = { 0xffff, 0xffff, 0xffff, 0xffff }; + +		    if (!(white_img = pixman_image_create_solid_fill (&white))) +			goto out; + +		    _pixman_image_validate (white_img); +		} + +		src_format = PIXMAN_solid; +		mask_format = glyph_format; +		info.src_flags = white_img->common.flags; +		info.mask_flags = glyph_flags | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; +		info.src_image = white_img; +		white_src = TRUE; +	    } + +	    _pixman_implementation_lookup_composite ( +		get_implementation(), PIXMAN_OP_ADD, +		src_format, info.src_flags, +		mask_format, info.mask_flags, +		dest_format, dest_flags, +		&implementation, &func); +	} + +	glyph_box.x1 = glyphs[i].x - glyph->origin_x + off_x; +	glyph_box.y1 = glyphs[i].y - glyph->origin_y + off_y; +	glyph_box.x2 = glyph_box.x1 + glyph->image->bits.width; +	glyph_box.y2 = glyph_box.y1 + glyph->image->bits.height; +	 +	if (box32_intersect (&composite_box, &glyph_box, &dest_box)) +	{ +	    int src_x = composite_box.x1 - glyph_box.x1; +	    int src_y = composite_box.y1 - glyph_box.y1; + +	    if (white_src) +		info.mask_image = 
glyph_img; +	    else +		info.src_image = glyph_img; + +	    info.mask_x = info.src_x = src_x; +	    info.mask_y = info.src_y = src_y; +	    info.dest_x = composite_box.x1; +	    info.dest_y = composite_box.y1; +	    info.width = composite_box.x2 - composite_box.x1; +	    info.height = composite_box.y2 - composite_box.y1; + +	    func (implementation, &info); + +	    pixman_list_move_to_front (&cache->mru, &glyph->mru_link); +	} +    } + +out: +    if (white_img) +	pixman_image_unref (white_img); +} + +/* Conceptually, for each glyph, (white IN glyph) is PIXMAN_OP_ADDed to an + * infinitely big mask image at the position such that the glyph origin point + * is positioned at the (glyphs[i].x, glyphs[i].y) point. + * + * Then (mask_x, mask_y) in the infinite mask and (src_x, src_y) in the source + * image are both aligned with (dest_x, dest_y) in the destination image. Then + * these three images are composited within the  + * + *       (dest_x, dest_y, dst_x + width, dst_y + height) + * + * rectangle. + * + * TODO: + *   - Trim the mask to the destination clip/image? + *   - Trim composite region based on sources, when the op ignores 0s. + */ +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +PIXMAN_EXPORT void +pixman_composite_glyphs (pixman_op_t            op, +			 pixman_image_t        *src, +			 pixman_image_t        *dest, +			 pixman_format_code_t   mask_format, +			 int32_t                src_x, +			 int32_t                src_y, +			 int32_t		mask_x, +			 int32_t		mask_y, +			 int32_t                dest_x, +			 int32_t                dest_y, +			 int32_t                width, +			 int32_t                height, +			 pixman_glyph_cache_t  *cache, +			 int			n_glyphs, +			 const pixman_glyph_t  *glyphs) +{ +    pixman_image_t *mask; + +    if (!(mask = pixman_image_create_bits (mask_format, width, height, NULL, -1))) +	return; + +    if (PIXMAN_FORMAT_A   (mask_format) != 0 && +	PIXMAN_FORMAT_RGB (mask_format) != 0) +    { +	pixman_image_set_component_alpha (mask, TRUE); +    } + +    add_glyphs (cache, mask, - mask_x, - mask_y, n_glyphs, glyphs); + +    pixman_image_composite32 (op, src, mask, dest, +			      src_x, src_y, +			      0, 0, +			      dest_x, dest_y, +			      width, height); + +    pixman_image_unref (mask); +} diff --git a/libs/pixman-0.40.0/pixman/pixman-gradient-walker.c b/libs/pixman-0.40.0/pixman/pixman-gradient-walker.c new file mode 100644 index 0000000..fb7f401 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-gradient-walker.c @@ -0,0 +1,264 @@ +/* + * + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + *             2005 Lars Knoll & Zack Rusin, Trolltech + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. 
+ * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "pixman-private.h" + +void +_pixman_gradient_walker_init (pixman_gradient_walker_t *walker, +                              gradient_t *              gradient, +                              pixman_repeat_t		repeat) +{ +    walker->num_stops = gradient->n_stops; +    walker->stops     = gradient->stops; +    walker->left_x    = 0; +    walker->right_x   = 0x10000; +    walker->a_s       = 0.0f; +    walker->a_b       = 0.0f; +    walker->r_s       = 0.0f; +    walker->r_b       = 0.0f; +    walker->g_s       = 0.0f; +    walker->g_b       = 0.0f; +    walker->b_s       = 0.0f; +    walker->b_b       = 0.0f; +    walker->repeat    = repeat; + +    walker->need_reset = TRUE; +} + +static void +gradient_walker_reset (pixman_gradient_walker_t *walker, +		       pixman_fixed_48_16_t      pos) +{ +    int64_t x, left_x, right_x; +    pixman_color_t *left_c, *right_c; +    int n, count = walker->num_stops; +    pixman_gradient_stop_t *stops = walker->stops; +    float la, lr, lg, lb; +    float ra, rr, rg, rb; +    float lx, rx; + +    if (walker->repeat == PIXMAN_REPEAT_NORMAL) +    { +	x = (int32_t)pos & 0xffff; +    } +    else if (walker->repeat == PIXMAN_REPEAT_REFLECT) +    { +	x = (int32_t)pos & 0xffff; +	if ((int32_t)pos & 0x10000) +	    x = 0x10000 - x; +    } +    else +    { +	x = pos; +    } +     +    for (n = 0; n < count; n++) +    { +	if (x < stops[n].x) +	    break; +    } +     +    left_x =  stops[n - 1].x; +    left_c = &stops[n - 1].color; +     +    right_x =  stops[n].x; +    right_c = &stops[n].color; + +    if (walker->repeat == PIXMAN_REPEAT_NORMAL) +    { +	left_x  += (pos - x); +	right_x += (pos - x); +    } +    else if (walker->repeat == PIXMAN_REPEAT_REFLECT) +    { +	if ((int32_t)pos & 0x10000) +	{ +	    pixman_color_t  *tmp_c; +	    int32_t tmp_x; + +	    tmp_x   = 0x10000 - right_x; +	    right_x = 0x10000 - left_x; +	    left_x  = tmp_x; + +	    tmp_c   = right_c; +	    right_c = left_c; +	    left_c  = tmp_c; + +	    x = 0x10000 - x; +	} +	left_x  += (pos - x); +	right_x += (pos - x); +    } +    else if (walker->repeat == PIXMAN_REPEAT_NONE) +    { +	if (n == 0) +	    right_c = left_c; +	else if (n == count) +	    left_c = right_c; +    } + +    /* The alpha/red/green/blue channels are scaled to be in [0, 1]. +     * This ensures that after premultiplication all channels will +     * be in the [0, 1] interval. 
+     */ +    la = (left_c->alpha * (1.0f/257.0f)); +    lr = (left_c->red * (1.0f/257.0f)); +    lg = (left_c->green * (1.0f/257.0f)); +    lb = (left_c->blue * (1.0f/257.0f)); + +    ra = (right_c->alpha * (1.0f/257.0f)); +    rr = (right_c->red * (1.0f/257.0f)); +    rg = (right_c->green * (1.0f/257.0f)); +    rb = (right_c->blue * (1.0f/257.0f)); +     +    lx = left_x * (1.0f/65536.0f); +    rx = right_x * (1.0f/65536.0f); +     +    if (FLOAT_IS_ZERO (rx - lx) || left_x == INT32_MIN || right_x == INT32_MAX) +    { +	walker->a_s = walker->r_s = walker->g_s = walker->b_s = 0.0f; +	walker->a_b = (la + ra) / 510.0f; +	walker->r_b = (lr + rr) / 510.0f; +	walker->g_b = (lg + rg) / 510.0f; +	walker->b_b = (lb + rb) / 510.0f; +    } +    else +    { +	float w_rec = 1.0f / (rx - lx); + +	walker->a_b = (la * rx - ra * lx) * w_rec * (1.0f/255.0f); +	walker->r_b = (lr * rx - rr * lx) * w_rec * (1.0f/255.0f); +	walker->g_b = (lg * rx - rg * lx) * w_rec * (1.0f/255.0f); +	walker->b_b = (lb * rx - rb * lx) * w_rec * (1.0f/255.0f); + +	walker->a_s = (ra - la) * w_rec * (1.0f/255.0f); +	walker->r_s = (rr - lr) * w_rec * (1.0f/255.0f); +	walker->g_s = (rg - lg) * w_rec * (1.0f/255.0f); +	walker->b_s = (rb - lb) * w_rec * (1.0f/255.0f); +    } +    +    walker->left_x = left_x; +    walker->right_x = right_x; + +    walker->need_reset = FALSE; +} + +static argb_t +pixman_gradient_walker_pixel_float (pixman_gradient_walker_t *walker, +				    pixman_fixed_48_16_t      x) +{ +    argb_t f; +    float y; + +    if (walker->need_reset || x < walker->left_x || x >= walker->right_x) +	gradient_walker_reset (walker, x); + +    y = x * (1.0f / 65536.0f); + +    f.a = walker->a_s * y + walker->a_b; +    f.r = f.a * (walker->r_s * y + walker->r_b); +    f.g = f.a * (walker->g_s * y + walker->g_b); +    f.b = f.a * (walker->b_s * y + walker->b_b); + +    return f; +} + +static uint32_t +pixman_gradient_walker_pixel_32 (pixman_gradient_walker_t *walker, +				 pixman_fixed_48_16_t      x) +{ +    argb_t f; +    float y; + +    if (walker->need_reset || x < walker->left_x || x >= walker->right_x) +	gradient_walker_reset (walker, x); + +    y = x * (1.0f / 65536.0f); + +    /* Instead of [0...1] for ARGB, we want [0...255], +     * multiply alpha with 255 and the color channels +     * also get multiplied by the alpha multiplier. 
+     * +     * We don't use pixman_contract_from_float because it causes a 2x +     * slowdown to do so, and the values are already normalized, +     * so we don't have to worry about values < 0.f or > 1.f +     */ +    f.a = 255.f * (walker->a_s * y + walker->a_b); +    f.r = f.a * (walker->r_s * y + walker->r_b); +    f.g = f.a * (walker->g_s * y + walker->g_b); +    f.b = f.a * (walker->b_s * y + walker->b_b); + +    return (((uint32_t)(f.a + .5f) << 24) & 0xff000000) | +           (((uint32_t)(f.r + .5f) << 16) & 0x00ff0000) | +           (((uint32_t)(f.g + .5f) <<  8) & 0x0000ff00) | +           (((uint32_t)(f.b + .5f) >>  0) & 0x000000ff); +} + +void +_pixman_gradient_walker_write_narrow (pixman_gradient_walker_t *walker, +				      pixman_fixed_48_16_t      x, +				      uint32_t                 *buffer) +{ +    *buffer = pixman_gradient_walker_pixel_32 (walker, x); +} + +void +_pixman_gradient_walker_write_wide (pixman_gradient_walker_t *walker, +				    pixman_fixed_48_16_t      x, +				    uint32_t                 *buffer) +{ +    *(argb_t *)buffer = pixman_gradient_walker_pixel_float (walker, x); +} + +void +_pixman_gradient_walker_fill_narrow (pixman_gradient_walker_t *walker, +				     pixman_fixed_48_16_t      x, +				     uint32_t                 *buffer, +				     uint32_t                 *end) +{ +    register uint32_t color; + +    color = pixman_gradient_walker_pixel_32 (walker, x); +    while (buffer < end) +	*buffer++ = color; +} + +void +_pixman_gradient_walker_fill_wide (pixman_gradient_walker_t *walker, +				   pixman_fixed_48_16_t      x, +				   uint32_t                 *buffer, +				   uint32_t                 *end) +{ +    register argb_t color; +    argb_t *buffer_wide = (argb_t *)buffer; +    argb_t *end_wide    = (argb_t *)end; + +    color = pixman_gradient_walker_pixel_float (walker, x); +    while (buffer_wide < end_wide) +	*buffer_wide++ = color; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-image.c b/libs/pixman-0.40.0/pixman/pixman-image.c new file mode 100644 index 0000000..db29ff5 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-image.c @@ -0,0 +1,994 @@ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  SuSE makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#include "pixman-private.h" + +static const pixman_color_t transparent_black = { 0, 0, 0, 0 }; + +static void +gradient_property_changed (pixman_image_t *image) +{ +    gradient_t *gradient = &image->gradient; +    int n = gradient->n_stops; +    pixman_gradient_stop_t *stops = gradient->stops; +    pixman_gradient_stop_t *begin = &(gradient->stops[-1]); +    pixman_gradient_stop_t *end = &(gradient->stops[n]); + +    switch (gradient->common.repeat) +    { +    default: +    case PIXMAN_REPEAT_NONE: +	begin->x = INT32_MIN; +	begin->color = transparent_black; +	end->x = INT32_MAX; +	end->color = transparent_black; +	break; + +    case PIXMAN_REPEAT_NORMAL: +	begin->x = stops[n - 1].x - pixman_fixed_1; +	begin->color = stops[n - 1].color; +	end->x = stops[0].x + pixman_fixed_1; +	end->color = stops[0].color; +	break; + +    case PIXMAN_REPEAT_REFLECT: +	begin->x = - stops[0].x; +	begin->color = stops[0].color; +	end->x = pixman_int_to_fixed (2) - stops[n - 1].x; +	end->color = stops[n - 1].color; +	break; + +    case PIXMAN_REPEAT_PAD: +	begin->x = INT32_MIN; +	begin->color = stops[0].color; +	end->x = INT32_MAX; +	end->color = stops[n - 1].color; +	break; +    } +} + +pixman_bool_t +_pixman_init_gradient (gradient_t *                  gradient, +                       const pixman_gradient_stop_t *stops, +                       int                           n_stops) +{ +    return_val_if_fail (n_stops > 0, FALSE); + +    /* We allocate two extra stops, one before the beginning of the stop list, +     * and one after the end. These stops are initialized to whatever color +     * would be used for positions outside the range of the stop list. +     * +     * This saves a bit of computation in the gradient walker. +     * +     * The pointer we store in the gradient_t struct still points to the +     * first user-supplied struct, so when freeing, we will have to +     * subtract one. 
+     */ +    gradient->stops = +	pixman_malloc_ab (n_stops + 2, sizeof (pixman_gradient_stop_t)); +    if (!gradient->stops) +	return FALSE; + +    gradient->stops += 1; +    memcpy (gradient->stops, stops, n_stops * sizeof (pixman_gradient_stop_t)); +    gradient->n_stops = n_stops; + +    gradient->common.property_changed = gradient_property_changed; + +    return TRUE; +} + +void +_pixman_image_init (pixman_image_t *image) +{ +    image_common_t *common = &image->common; + +    pixman_region32_init (&common->clip_region); + +    common->alpha_count = 0; +    common->have_clip_region = FALSE; +    common->clip_sources = FALSE; +    common->transform = NULL; +    common->repeat = PIXMAN_REPEAT_NONE; +    common->filter = PIXMAN_FILTER_NEAREST; +    common->filter_params = NULL; +    common->n_filter_params = 0; +    common->alpha_map = NULL; +    common->component_alpha = FALSE; +    common->ref_count = 1; +    common->property_changed = NULL; +    common->client_clip = FALSE; +    common->destroy_func = NULL; +    common->destroy_data = NULL; +    common->dirty = TRUE; +} + +pixman_bool_t +_pixman_image_fini (pixman_image_t *image) +{ +    image_common_t *common = (image_common_t *)image; + +    common->ref_count--; + +    if (common->ref_count == 0) +    { +	if (image->common.destroy_func) +	    image->common.destroy_func (image, image->common.destroy_data); + +	pixman_region32_fini (&common->clip_region); + +	free (common->transform); +	free (common->filter_params); + +	if (common->alpha_map) +	    pixman_image_unref ((pixman_image_t *)common->alpha_map); + +	if (image->type == LINEAR || +	    image->type == RADIAL || +	    image->type == CONICAL) +	{ +	    if (image->gradient.stops) +	    { +		/* See _pixman_init_gradient() for an explanation of the - 1 */ +		free (image->gradient.stops - 1); +	    } + +	    /* This will trigger if someone adds a property_changed +	     * method to the linear/radial/conical gradient overwriting +	     * the general one. +	     */ +	    assert ( +		image->common.property_changed == gradient_property_changed); +	} + +	if (image->type == BITS && image->bits.free_me) +	    free (image->bits.free_me); + +	return TRUE; +    } + +    return FALSE; +} + +pixman_image_t * +_pixman_image_allocate (void) +{ +    pixman_image_t *image = malloc (sizeof (pixman_image_t)); + +    if (image) +	_pixman_image_init (image); + +    return image; +} + +static void +image_property_changed (pixman_image_t *image) +{ +    image->common.dirty = TRUE; +} + +/* Ref Counting */ +PIXMAN_EXPORT pixman_image_t * +pixman_image_ref (pixman_image_t *image) +{ +    image->common.ref_count++; + +    return image; +} + +/* returns TRUE when the image is freed */ +PIXMAN_EXPORT pixman_bool_t +pixman_image_unref (pixman_image_t *image) +{ +    if (_pixman_image_fini (image)) +    { +	free (image); +	return TRUE; +    } + +    return FALSE; +} + +PIXMAN_EXPORT void +pixman_image_set_destroy_function (pixman_image_t *            image, +                                   pixman_image_destroy_func_t func, +                                   void *                      data) +{ +    image->common.destroy_func = func; +    image->common.destroy_data = data; +} + +PIXMAN_EXPORT void * +pixman_image_get_destroy_data (pixman_image_t *image) +{ +  return image->common.destroy_data; +} + +void +_pixman_image_reset_clip_region (pixman_image_t *image) +{ +    image->common.have_clip_region = FALSE; +} + +/* Executive Summary: This function is a no-op that only exists + * for historical reasons. 
+ * + * There used to be a bug in the X server where it would rely on + * out-of-bounds accesses when it was asked to composite with a + * window as the source. It would create a pixman image pointing + * to some bogus position in memory, but then set a clip region + * to the position where the actual bits were. + * + * Due to a bug in old versions of pixman, where it would not clip + * against the image bounds when a clip region was set, this would + * actually work. So when the pixman bug was fixed, a workaround was + * added to allow certain out-of-bound accesses. This function disabled + * those workarounds. + * + * Since 0.21.2, pixman doesn't do these workarounds anymore, so now + * this function is a no-op. + */ +PIXMAN_EXPORT void +pixman_disable_out_of_bounds_workaround (void) +{ +} + +static void +compute_image_info (pixman_image_t *image) +{ +    pixman_format_code_t code; +    uint32_t flags = 0; + +    /* Transform */ +    if (!image->common.transform) +    { +	flags |= (FAST_PATH_ID_TRANSFORM	| +		  FAST_PATH_X_UNIT_POSITIVE	| +		  FAST_PATH_Y_UNIT_ZERO		| +		  FAST_PATH_AFFINE_TRANSFORM); +    } +    else +    { +	flags |= FAST_PATH_HAS_TRANSFORM; + +	if (image->common.transform->matrix[2][0] == 0			&& +	    image->common.transform->matrix[2][1] == 0			&& +	    image->common.transform->matrix[2][2] == pixman_fixed_1) +	{ +	    flags |= FAST_PATH_AFFINE_TRANSFORM; + +	    if (image->common.transform->matrix[0][1] == 0 && +		image->common.transform->matrix[1][0] == 0) +	    { +		if (image->common.transform->matrix[0][0] == -pixman_fixed_1 && +		    image->common.transform->matrix[1][1] == -pixman_fixed_1) +		{ +		    flags |= FAST_PATH_ROTATE_180_TRANSFORM; +		} +		flags |= FAST_PATH_SCALE_TRANSFORM; +	    } +	    else if (image->common.transform->matrix[0][0] == 0 && +	             image->common.transform->matrix[1][1] == 0) +	    { +		pixman_fixed_t m01 = image->common.transform->matrix[0][1]; +		pixman_fixed_t m10 = image->common.transform->matrix[1][0]; + +		if (m01 == -pixman_fixed_1 && m10 == pixman_fixed_1) +		    flags |= FAST_PATH_ROTATE_90_TRANSFORM; +		else if (m01 == pixman_fixed_1 && m10 == -pixman_fixed_1) +		    flags |= FAST_PATH_ROTATE_270_TRANSFORM; +	    } +	} + +	if (image->common.transform->matrix[0][0] > 0) +	    flags |= FAST_PATH_X_UNIT_POSITIVE; + +	if (image->common.transform->matrix[1][0] == 0) +	    flags |= FAST_PATH_Y_UNIT_ZERO; +    } + +    /* Filter */ +    switch (image->common.filter) +    { +    case PIXMAN_FILTER_NEAREST: +    case PIXMAN_FILTER_FAST: +	flags |= (FAST_PATH_NEAREST_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER); +	break; + +    case PIXMAN_FILTER_BILINEAR: +    case PIXMAN_FILTER_GOOD: +    case PIXMAN_FILTER_BEST: +	flags |= (FAST_PATH_BILINEAR_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER); + +	/* Here we have a chance to optimize BILINEAR filter to NEAREST if +	 * they are equivalent for the currently used transformation matrix. +	 */ +	if (flags & FAST_PATH_ID_TRANSFORM) +	{ +	    flags |= FAST_PATH_NEAREST_FILTER; +	} +	else if (flags & FAST_PATH_AFFINE_TRANSFORM) +	{ +	    /* Suppose the transform is +	     * +	     *    [ t00, t01, t02 ] +	     *    [ t10, t11, t12 ] +	     *    [   0,   0,   1 ] +	     * +	     * and the destination coordinates are (n + 0.5, m + 0.5). 
Then +	     * the transformed x coordinate is: +	     * +	     *     tx = t00 * (n + 0.5) + t01 * (m + 0.5) + t02 +	     *        = t00 * n + t01 * m + t02 + (t00 + t01) * 0.5 +	     * +	     * which implies that if t00, t01 and t02 are all integers +	     * and (t00 + t01) is odd, then tx will be an integer plus 0.5, +	     * which means a BILINEAR filter will reduce to NEAREST. The same +	     * applies in the y direction +	     */ +	    pixman_fixed_t (*t)[3] = image->common.transform->matrix; + +	    if ((pixman_fixed_frac ( +		     t[0][0] | t[0][1] | t[0][2] | +		     t[1][0] | t[1][1] | t[1][2]) == 0)			&& +		(pixman_fixed_to_int ( +		    (t[0][0] + t[0][1]) & (t[1][0] + t[1][1])) % 2) == 1) +	    { +		/* FIXME: there are some affine-test failures, showing that +		 * handling of BILINEAR and NEAREST filter is not quite +		 * equivalent when getting close to 32K for the translation +		 * components of the matrix. That's likely some bug, but for +		 * now just skip BILINEAR->NEAREST optimization in this case. +		 */ +		pixman_fixed_t magic_limit = pixman_int_to_fixed (30000); +		if (image->common.transform->matrix[0][2] <= magic_limit  && +		    image->common.transform->matrix[1][2] <= magic_limit  && +		    image->common.transform->matrix[0][2] >= -magic_limit && +		    image->common.transform->matrix[1][2] >= -magic_limit) +		{ +		    flags |= FAST_PATH_NEAREST_FILTER; +		} +	    } +	} +	break; + +    case PIXMAN_FILTER_CONVOLUTION: +	break; + +    case PIXMAN_FILTER_SEPARABLE_CONVOLUTION: +	flags |= FAST_PATH_SEPARABLE_CONVOLUTION_FILTER; +	break; + +    default: +	flags |= FAST_PATH_NO_CONVOLUTION_FILTER; +	break; +    } + +    /* Repeat mode */ +    switch (image->common.repeat) +    { +    case PIXMAN_REPEAT_NONE: +	flags |= +	    FAST_PATH_NO_REFLECT_REPEAT		| +	    FAST_PATH_NO_PAD_REPEAT		| +	    FAST_PATH_NO_NORMAL_REPEAT; +	break; + +    case PIXMAN_REPEAT_REFLECT: +	flags |= +	    FAST_PATH_NO_PAD_REPEAT		| +	    FAST_PATH_NO_NONE_REPEAT		| +	    FAST_PATH_NO_NORMAL_REPEAT; +	break; + +    case PIXMAN_REPEAT_PAD: +	flags |= +	    FAST_PATH_NO_REFLECT_REPEAT		| +	    FAST_PATH_NO_NONE_REPEAT		| +	    FAST_PATH_NO_NORMAL_REPEAT; +	break; + +    default: +	flags |= +	    FAST_PATH_NO_REFLECT_REPEAT		| +	    FAST_PATH_NO_PAD_REPEAT		| +	    FAST_PATH_NO_NONE_REPEAT; +	break; +    } + +    /* Component alpha */ +    if (image->common.component_alpha) +	flags |= FAST_PATH_COMPONENT_ALPHA; +    else +	flags |= FAST_PATH_UNIFIED_ALPHA; + +    flags |= (FAST_PATH_NO_ACCESSORS | FAST_PATH_NARROW_FORMAT); + +    /* Type specific checks */ +    switch (image->type) +    { +    case SOLID: +	code = PIXMAN_solid; + +	if (image->solid.color.alpha == 0xffff) +	    flags |= FAST_PATH_IS_OPAQUE; +	break; + +    case BITS: +	if (image->bits.width == 1	&& +	    image->bits.height == 1	&& +	    image->common.repeat != PIXMAN_REPEAT_NONE) +	{ +	    code = PIXMAN_solid; +	} +	else +	{ +	    code = image->bits.format; +	    flags |= FAST_PATH_BITS_IMAGE; +	} + +	if (!PIXMAN_FORMAT_A (image->bits.format)				&& +	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_GRAY		&& +	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_COLOR) +	{ +	    flags |= FAST_PATH_SAMPLES_OPAQUE; + +	    if (image->common.repeat != PIXMAN_REPEAT_NONE) +		flags |= FAST_PATH_IS_OPAQUE; +	} + +	if (image->bits.read_func || image->bits.write_func) +	    flags &= ~FAST_PATH_NO_ACCESSORS; + +	if (PIXMAN_FORMAT_IS_WIDE (image->bits.format)) +	    flags &= ~FAST_PATH_NARROW_FORMAT; +	break; + +    case RADIAL: +	
code = PIXMAN_unknown; + +	/* +	 * As explained in pixman-radial-gradient.c, every point of +	 * the plane has a valid associated radius (and thus will be +	 * colored) if and only if a is negative (i.e. one of the two +	 * circles contains the other one). +	 */ + +        if (image->radial.a >= 0) +	    break; + +	/* Fall through */ + +    case CONICAL: +    case LINEAR: +	code = PIXMAN_unknown; + +	if (image->common.repeat != PIXMAN_REPEAT_NONE) +	{ +	    int i; + +	    flags |= FAST_PATH_IS_OPAQUE; +	    for (i = 0; i < image->gradient.n_stops; ++i) +	    { +		if (image->gradient.stops[i].color.alpha != 0xffff) +		{ +		    flags &= ~FAST_PATH_IS_OPAQUE; +		    break; +		} +	    } +	} +	break; + +    default: +	code = PIXMAN_unknown; +	break; +    } + +    /* Alpha maps are only supported for BITS images, so it's always +     * safe to ignore their presense for non-BITS images +     */ +    if (!image->common.alpha_map || image->type != BITS) +    { +	flags |= FAST_PATH_NO_ALPHA_MAP; +    } +    else +    { +	if (PIXMAN_FORMAT_IS_WIDE (image->common.alpha_map->format)) +	    flags &= ~FAST_PATH_NARROW_FORMAT; +    } + +    /* Both alpha maps and convolution filters can introduce +     * non-opaqueness in otherwise opaque images. Also +     * an image with component alpha turned on is only opaque +     * if all channels are opaque, so we simply turn it off +     * unconditionally for those images. +     */ +    if (image->common.alpha_map						|| +	image->common.filter == PIXMAN_FILTER_CONVOLUTION		|| +        image->common.filter == PIXMAN_FILTER_SEPARABLE_CONVOLUTION     || +	image->common.component_alpha) +    { +	flags &= ~(FAST_PATH_IS_OPAQUE | FAST_PATH_SAMPLES_OPAQUE); +    } + +    image->common.flags = flags; +    image->common.extended_format_code = code; +} + +void +_pixman_image_validate (pixman_image_t *image) +{ +    if (image->common.dirty) +    { +	compute_image_info (image); + +	/* It is important that property_changed is +	 * called *after* compute_image_info() because +	 * property_changed() can make use of the flags +	 * to set up accessors etc. 
+	 */ +	if (image->common.property_changed) +	    image->common.property_changed (image); + +	image->common.dirty = FALSE; +    } + +    if (image->common.alpha_map) +	_pixman_image_validate ((pixman_image_t *)image->common.alpha_map); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_image_set_clip_region32 (pixman_image_t *   image, +                                pixman_region32_t *region) +{ +    image_common_t *common = (image_common_t *)image; +    pixman_bool_t result; + +    if (region) +    { +	if ((result = pixman_region32_copy (&common->clip_region, region))) +	    image->common.have_clip_region = TRUE; +    } +    else +    { +	_pixman_image_reset_clip_region (image); + +	result = TRUE; +    } + +    image_property_changed (image); + +    return result; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_image_set_clip_region (pixman_image_t *   image, +                              pixman_region16_t *region) +{ +    image_common_t *common = (image_common_t *)image; +    pixman_bool_t result; + +    if (region) +    { +	if ((result = pixman_region32_copy_from_region16 (&common->clip_region, region))) +	    image->common.have_clip_region = TRUE; +    } +    else +    { +	_pixman_image_reset_clip_region (image); + +	result = TRUE; +    } + +    image_property_changed (image); + +    return result; +} + +PIXMAN_EXPORT void +pixman_image_set_has_client_clip (pixman_image_t *image, +                                  pixman_bool_t   client_clip) +{ +    image->common.client_clip = client_clip; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_image_set_transform (pixman_image_t *          image, +                            const pixman_transform_t *transform) +{ +    static const pixman_transform_t id = +    { +	{ { pixman_fixed_1, 0, 0 }, +	  { 0, pixman_fixed_1, 0 }, +	  { 0, 0, pixman_fixed_1 } } +    }; + +    image_common_t *common = (image_common_t *)image; +    pixman_bool_t result; + +    if (common->transform == transform) +	return TRUE; + +    if (!transform || memcmp (&id, transform, sizeof (pixman_transform_t)) == 0) +    { +	free (common->transform); +	common->transform = NULL; +	result = TRUE; + +	goto out; +    } + +    if (common->transform && +	memcmp (common->transform, transform, sizeof (pixman_transform_t)) == 0) +    { +	return TRUE; +    } + +    if (common->transform == NULL) +	common->transform = malloc (sizeof (pixman_transform_t)); + +    if (common->transform == NULL) +    { +	result = FALSE; + +	goto out; +    } + +    memcpy (common->transform, transform, sizeof(pixman_transform_t)); + +    result = TRUE; + +out: +    image_property_changed (image); + +    return result; +} + +PIXMAN_EXPORT void +pixman_image_set_repeat (pixman_image_t *image, +                         pixman_repeat_t repeat) +{ +    if (image->common.repeat == repeat) +	return; + +    image->common.repeat = repeat; + +    image_property_changed (image); +} + +PIXMAN_EXPORT void +pixman_image_set_dither (pixman_image_t *image, +			 pixman_dither_t dither) +{ +    if (image->type == BITS) +    { +	if (image->bits.dither == dither) +	    return; + +	image->bits.dither = dither; + +	image_property_changed (image); +    } +} + +PIXMAN_EXPORT void +pixman_image_set_dither_offset (pixman_image_t *image, +				int             offset_x, +				int             offset_y) +{ +    if (image->type == BITS) +    { +	if (image->bits.dither_offset_x == offset_x && +	    image->bits.dither_offset_y == offset_y) +	{ +	    return; +	} + +	image->bits.dither_offset_x = offset_x; +	image->bits.dither_offset_y = offset_y; + +	
image_property_changed (image); +    } +} + +PIXMAN_EXPORT pixman_bool_t +pixman_image_set_filter (pixman_image_t *      image, +                         pixman_filter_t       filter, +                         const pixman_fixed_t *params, +                         int                   n_params) +{ +    image_common_t *common = (image_common_t *)image; +    pixman_fixed_t *new_params; + +    if (params == common->filter_params && filter == common->filter) +	return TRUE; + +    if (filter == PIXMAN_FILTER_SEPARABLE_CONVOLUTION) +    { +	int width = pixman_fixed_to_int (params[0]); +	int height = pixman_fixed_to_int (params[1]); +	int x_phase_bits = pixman_fixed_to_int (params[2]); +	int y_phase_bits = pixman_fixed_to_int (params[3]); +	int n_x_phases = (1 << x_phase_bits); +	int n_y_phases = (1 << y_phase_bits); + +	return_val_if_fail ( +	    n_params == 4 + n_x_phases * width + n_y_phases * height, FALSE); +    } +     +    new_params = NULL; +    if (params) +    { +	new_params = pixman_malloc_ab (n_params, sizeof (pixman_fixed_t)); +	if (!new_params) +	    return FALSE; + +	memcpy (new_params, +	        params, n_params * sizeof (pixman_fixed_t)); +    } + +    common->filter = filter; + +    if (common->filter_params) +	free (common->filter_params); + +    common->filter_params = new_params; +    common->n_filter_params = n_params; + +    image_property_changed (image); +    return TRUE; +} + +PIXMAN_EXPORT void +pixman_image_set_source_clipping (pixman_image_t *image, +                                  pixman_bool_t   clip_sources) +{ +    if (image->common.clip_sources == clip_sources) +	return; + +    image->common.clip_sources = clip_sources; + +    image_property_changed (image); +} + +/* Unlike all the other property setters, this function does not + * copy the content of indexed. Doing this copying is simply + * way, way too expensive. + */ +PIXMAN_EXPORT void +pixman_image_set_indexed (pixman_image_t *        image, +                          const pixman_indexed_t *indexed) +{ +    bits_image_t *bits = (bits_image_t *)image; + +    if (bits->indexed == indexed) +	return; + +    bits->indexed = indexed; + +    image_property_changed (image); +} + +PIXMAN_EXPORT void +pixman_image_set_alpha_map (pixman_image_t *image, +                            pixman_image_t *alpha_map, +                            int16_t         x, +                            int16_t         y) +{ +    image_common_t *common = (image_common_t *)image; + +    return_if_fail (!alpha_map || alpha_map->type == BITS); + +    if (alpha_map && common->alpha_count > 0) +    { +	/* If this image is being used as an alpha map itself, +	 * then you can't give it an alpha map of its own. 
+	 */ +	return; +    } + +    if (alpha_map && alpha_map->common.alpha_map) +    { +	/* If the image has an alpha map of its own, +	 * then it can't be used as an alpha map itself +	 */ +	return; +    } + +    if (common->alpha_map != (bits_image_t *)alpha_map) +    { +	if (common->alpha_map) +	{ +	    common->alpha_map->common.alpha_count--; + +	    pixman_image_unref ((pixman_image_t *)common->alpha_map); +	} + +	if (alpha_map) +	{ +	    common->alpha_map = (bits_image_t *)pixman_image_ref (alpha_map); + +	    common->alpha_map->common.alpha_count++; +	} +	else +	{ +	    common->alpha_map = NULL; +	} +    } + +    common->alpha_origin_x = x; +    common->alpha_origin_y = y; + +    image_property_changed (image); +} + +PIXMAN_EXPORT void +pixman_image_set_component_alpha   (pixman_image_t *image, +                                    pixman_bool_t   component_alpha) +{ +    if (image->common.component_alpha == component_alpha) +	return; + +    image->common.component_alpha = component_alpha; + +    image_property_changed (image); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_image_get_component_alpha   (pixman_image_t       *image) +{ +    return image->common.component_alpha; +} + +PIXMAN_EXPORT void +pixman_image_set_accessors (pixman_image_t *           image, +                            pixman_read_memory_func_t  read_func, +                            pixman_write_memory_func_t write_func) +{ +    return_if_fail (image != NULL); + +    if (image->type == BITS) +    { +	/* Accessors only work for <= 32 bpp. */ +	if (PIXMAN_FORMAT_BPP(image->bits.format) > 32) +	    return_if_fail (!read_func && !write_func); + +	image->bits.read_func = read_func; +	image->bits.write_func = write_func; + +	image_property_changed (image); +    } +} + +PIXMAN_EXPORT uint32_t * +pixman_image_get_data (pixman_image_t *image) +{ +    if (image->type == BITS) +	return image->bits.bits; + +    return NULL; +} + +PIXMAN_EXPORT int +pixman_image_get_width (pixman_image_t *image) +{ +    if (image->type == BITS) +	return image->bits.width; + +    return 0; +} + +PIXMAN_EXPORT int +pixman_image_get_height (pixman_image_t *image) +{ +    if (image->type == BITS) +	return image->bits.height; + +    return 0; +} + +PIXMAN_EXPORT int +pixman_image_get_stride (pixman_image_t *image) +{ +    if (image->type == BITS) +	return image->bits.rowstride * (int) sizeof (uint32_t); + +    return 0; +} + +PIXMAN_EXPORT int +pixman_image_get_depth (pixman_image_t *image) +{ +    if (image->type == BITS) +	return PIXMAN_FORMAT_DEPTH (image->bits.format); + +    return 0; +} + +PIXMAN_EXPORT pixman_format_code_t +pixman_image_get_format (pixman_image_t *image) +{ +    if (image->type == BITS) +	return image->bits.format; + +    return PIXMAN_null; +} + +uint32_t +_pixman_image_get_solid (pixman_implementation_t *imp, +			 pixman_image_t *         image, +                         pixman_format_code_t     format) +{ +    uint32_t result; + +    if (image->type == SOLID) +    { +	result = image->solid.color_32; +    } +    else if (image->type == BITS) +    { +	if (image->bits.format == PIXMAN_a8r8g8b8) +	    result = image->bits.bits[0]; +	else if (image->bits.format == PIXMAN_x8r8g8b8) +	    result = image->bits.bits[0] | 0xff000000; +	else if (image->bits.format == PIXMAN_a8) +	    result = (uint32_t)(*(uint8_t *)image->bits.bits) << 24; +	else +	    goto otherwise; +    } +    else +    { +	pixman_iter_t iter; + +    otherwise: +	_pixman_implementation_iter_init ( +	    imp, &iter, image, 0, 0, 1, 1, +	    (uint8_t *)&result, +	    
ITER_NARROW | ITER_SRC, image->common.flags); +	 +	result = *iter.get_scanline (&iter, NULL); + +	if (iter.fini) +	    iter.fini (&iter); +    } + +    /* If necessary, convert RGB <--> BGR. */ +    if (PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB +	&& PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB_SRGB) +    { +	result = (((result & 0xff000000) >>  0) | +	          ((result & 0x00ff0000) >> 16) | +	          ((result & 0x0000ff00) >>  0) | +	          ((result & 0x000000ff) << 16)); +    } + +    return result; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-implementation.c b/libs/pixman-0.40.0/pixman/pixman-implementation.c new file mode 100644 index 0000000..2c7de4c --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-implementation.c @@ -0,0 +1,417 @@ +/* + * Copyright © 2009 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  Red Hat makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <stdlib.h> +#include "pixman-private.h" + +pixman_implementation_t * +_pixman_implementation_create (pixman_implementation_t *fallback, +			       const pixman_fast_path_t *fast_paths) +{ +    pixman_implementation_t *imp; + +    assert (fast_paths); + +    if ((imp = malloc (sizeof (pixman_implementation_t)))) +    { +	pixman_implementation_t *d; + +	memset (imp, 0, sizeof *imp); + +	imp->fallback = fallback; +	imp->fast_paths = fast_paths; +	 +	/* Make sure the whole fallback chain has the right toplevel */ +	for (d = imp; d != NULL; d = d->fallback) +	    d->toplevel = imp; +    } + +    return imp; +} + +#define N_CACHED_FAST_PATHS 8 + +typedef struct +{ +    struct +    { +	pixman_implementation_t *	imp; +	pixman_fast_path_t		fast_path; +    } cache [N_CACHED_FAST_PATHS]; +} cache_t; + +PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache); + +static void +dummy_composite_rect (pixman_implementation_t *imp, +		      pixman_composite_info_t *info) +{ +} + +void +_pixman_implementation_lookup_composite (pixman_implementation_t  *toplevel, +					 pixman_op_t               op, +					 pixman_format_code_t      src_format, +					 uint32_t                  src_flags, +					 pixman_format_code_t      mask_format, +					 uint32_t                  mask_flags, +					 pixman_format_code_t      dest_format, +					 uint32_t                  dest_flags, +					 pixman_implementation_t **out_imp, +					 pixman_composite_func_t  *out_func) +{ +    pixman_implementation_t *imp; +    cache_t *cache; +    int i; + +    /* Check cache for fast paths */ +    cache = PIXMAN_GET_THREAD_LOCAL (fast_path_cache); + +    for (i = 0; i < N_CACHED_FAST_PATHS; ++i) +    { +	const pixman_fast_path_t *info = &(cache->cache[i].fast_path); + +	/* Note that we check for equality here, not whether +	 * the cached fast path matches. This is to prevent +	 * us from selecting an overly general fast path +	 * when a more specific one would work. 
+	 */ +	if (info->op == op			&& +	    info->src_format == src_format	&& +	    info->mask_format == mask_format	&& +	    info->dest_format == dest_format	&& +	    info->src_flags == src_flags	&& +	    info->mask_flags == mask_flags	&& +	    info->dest_flags == dest_flags	&& +	    info->func) +	{ +	    *out_imp = cache->cache[i].imp; +	    *out_func = cache->cache[i].fast_path.func; + +	    goto update_cache; +	} +    } + +    for (imp = toplevel; imp != NULL; imp = imp->fallback) +    { +	const pixman_fast_path_t *info = imp->fast_paths; + +	while (info->op != PIXMAN_OP_NONE) +	{ +	    if ((info->op == op || info->op == PIXMAN_OP_any)		&& +		/* Formats */ +		((info->src_format == src_format) || +		 (info->src_format == PIXMAN_any))			&& +		((info->mask_format == mask_format) || +		 (info->mask_format == PIXMAN_any))			&& +		((info->dest_format == dest_format) || +		 (info->dest_format == PIXMAN_any))			&& +		/* Flags */ +		(info->src_flags & src_flags) == info->src_flags	&& +		(info->mask_flags & mask_flags) == info->mask_flags	&& +		(info->dest_flags & dest_flags) == info->dest_flags) +	    { +		*out_imp = imp; +		*out_func = info->func; + +		/* Set i to the last spot in the cache so that the +		 * move-to-front code below will work +		 */ +		i = N_CACHED_FAST_PATHS - 1; + +		goto update_cache; +	    } + +	    ++info; +	} +    } + +    /* We should never reach this point */ +    _pixman_log_error ( +        FUNC, +        "No composite function found\n" +        "\n" +        "The most likely cause of this is that this system has issues with\n" +        "thread local storage\n"); + +    *out_imp = NULL; +    *out_func = dummy_composite_rect; +    return; + +update_cache: +    if (i) +    { +	while (i--) +	    cache->cache[i + 1] = cache->cache[i]; + +	cache->cache[0].imp = *out_imp; +	cache->cache[0].fast_path.op = op; +	cache->cache[0].fast_path.src_format = src_format; +	cache->cache[0].fast_path.src_flags = src_flags; +	cache->cache[0].fast_path.mask_format = mask_format; +	cache->cache[0].fast_path.mask_flags = mask_flags; +	cache->cache[0].fast_path.dest_format = dest_format; +	cache->cache[0].fast_path.dest_flags = dest_flags; +	cache->cache[0].fast_path.func = *out_func; +    } +} + +static void +dummy_combine (pixman_implementation_t *imp, +	       pixman_op_t              op, +	       uint32_t *               pd, +	       const uint32_t *         ps, +	       const uint32_t *         pm, +	       int                      w) +{ +} + +pixman_combine_32_func_t +_pixman_implementation_lookup_combiner (pixman_implementation_t *imp, +					pixman_op_t		 op, +					pixman_bool_t		 component_alpha, +					pixman_bool_t		 narrow) +{ +    while (imp) +    { +	pixman_combine_32_func_t f = NULL; + +	switch ((narrow << 1) | component_alpha) +	{ +	case 0: /* not narrow, not component alpha */ +	    f = (pixman_combine_32_func_t)imp->combine_float[op]; +	    break; +	     +	case 1: /* not narrow, component_alpha */ +	    f = (pixman_combine_32_func_t)imp->combine_float_ca[op]; +	    break; + +	case 2: /* narrow, not component alpha */ +	    f = imp->combine_32[op]; +	    break; + +	case 3: /* narrow, component_alpha */ +	    f = imp->combine_32_ca[op]; +	    break; +	} + +	if (f) +	    return f; + +	imp = imp->fallback; +    } + +    /* We should never reach this point */ +    _pixman_log_error (FUNC, "No known combine function\n"); +    return dummy_combine; +} + +pixman_bool_t +_pixman_implementation_blt (pixman_implementation_t * imp, +                            uint32_t *                
src_bits, +                            uint32_t *                dst_bits, +                            int                       src_stride, +                            int                       dst_stride, +                            int                       src_bpp, +                            int                       dst_bpp, +                            int                       src_x, +                            int                       src_y, +                            int                       dest_x, +                            int                       dest_y, +                            int                       width, +                            int                       height) +{ +    while (imp) +    { +	if (imp->blt && +	    (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride, +			 src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y, +			 width, height)) +	{ +	    return TRUE; +	} + +	imp = imp->fallback; +    } + +    return FALSE; +} + +pixman_bool_t +_pixman_implementation_fill (pixman_implementation_t *imp, +                             uint32_t *               bits, +                             int                      stride, +                             int                      bpp, +                             int                      x, +                             int                      y, +                             int                      width, +                             int                      height, +                             uint32_t                 filler) +{ +    while (imp) +    { +	if (imp->fill && +	    ((*imp->fill) (imp, bits, stride, bpp, x, y, width, height, filler))) +	{ +	    return TRUE; +	} + +	imp = imp->fallback; +    } + +    return FALSE; +} + +static uint32_t * +get_scanline_null (pixman_iter_t *iter, const uint32_t *mask) +{ +    return NULL; +} + +void +_pixman_implementation_iter_init (pixman_implementation_t *imp, +                                  pixman_iter_t           *iter, +                                  pixman_image_t          *image, +                                  int                      x, +                                  int                      y, +                                  int                      width, +                                  int                      height, +                                  uint8_t                 *buffer, +                                  iter_flags_t             iter_flags, +                                  uint32_t                 image_flags) +{ +    pixman_format_code_t format; + +    iter->image = image; +    iter->buffer = (uint32_t *)buffer; +    iter->x = x; +    iter->y = y; +    iter->width = width; +    iter->height = height; +    iter->iter_flags = iter_flags; +    iter->image_flags = image_flags; +    iter->fini = NULL; + +    if (!iter->image) +    { +	iter->get_scanline = get_scanline_null; +	return; +    } + +    format = iter->image->common.extended_format_code; + +    while (imp) +    { +        if (imp->iter_info) +        { +            const pixman_iter_info_t *info; + +            for (info = imp->iter_info; info->format != PIXMAN_null; ++info) +            { +                if ((info->format == PIXMAN_any || info->format == format) && +                    (info->image_flags & image_flags) == info->image_flags && +                    (info->iter_flags & iter_flags) == info->iter_flags) +                { +                    iter->get_scanline = info->get_scanline; +                    iter->write_back = 
info->write_back; + +                    if (info->initializer) +                        info->initializer (iter, info); +                    return; +                } +            } +        } + +        imp = imp->fallback; +    } +} + +pixman_bool_t +_pixman_disabled (const char *name) +{ +    const char *env; + +    if ((env = getenv ("PIXMAN_DISABLE"))) +    { +	do +	{ +	    const char *end; +	    int len; + +	    if ((end = strchr (env, ' '))) +		len = end - env; +	    else +		len = strlen (env); + +	    if (strlen (name) == len && strncmp (name, env, len) == 0) +	    { +		printf ("pixman: Disabled %s implementation\n", name); +		return TRUE; +	    } + +	    env += len; +	} +	while (*env++); +    } + +    return FALSE; +} + +static const pixman_fast_path_t empty_fast_path[] = +{ +    { PIXMAN_OP_NONE } +}; + +pixman_implementation_t * +_pixman_choose_implementation (void) +{ +    pixman_implementation_t *imp; + +    imp = _pixman_implementation_create_general(); + +    if (!_pixman_disabled ("fast")) +	imp = _pixman_implementation_create_fast_path (imp); + +    imp = _pixman_x86_get_implementations (imp); +    imp = _pixman_arm_get_implementations (imp); +    imp = _pixman_ppc_get_implementations (imp); +    imp = _pixman_mips_get_implementations (imp); + +    imp = _pixman_implementation_create_noop (imp); + +    if (_pixman_disabled ("wholeops")) +    { +        pixman_implementation_t *cur; + +        /* Disable all whole-operation paths except the general one, +         * so that optimized iterators are used as much as possible. +         */ +        for (cur = imp; cur->fallback; cur = cur->fallback) +            cur->fast_paths = empty_fast_path; +    } + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-inlines.h b/libs/pixman-0.40.0/pixman/pixman-inlines.h new file mode 100644 index 0000000..f785910 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-inlines.h @@ -0,0 +1,1365 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  SuSE makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author:  Keith Packard, SuSE, Inc. + */ + +#ifndef PIXMAN_FAST_PATH_H__ +#define PIXMAN_FAST_PATH_H__ + +#include "pixman-private.h" + +#define PIXMAN_REPEAT_COVER -1 + +/* Flags describing input parameters to fast path macro template. 
+ * Turning on some flag values may indicate that + * "some property X is available so template can use this" or + * "some property X should be handled by template". + * + * FLAG_HAVE_SOLID_MASK + *  Input mask is solid so template should handle this. + * + * FLAG_HAVE_NON_SOLID_MASK + *  Input mask is bits mask so template should handle this. + * + * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually + * exclusive. (It's not allowed to turn both flags on) + */ +#define FLAG_NONE				(0) +#define FLAG_HAVE_SOLID_MASK			(1 <<   1) +#define FLAG_HAVE_NON_SOLID_MASK		(1 <<   2) + +/* To avoid too short repeated scanline function calls, extend source + * scanlines having width less than below constant value. + */ +#define REPEAT_NORMAL_MIN_WIDTH			64 + +static force_inline pixman_bool_t +repeat (pixman_repeat_t repeat, int *c, int size) +{ +    if (repeat == PIXMAN_REPEAT_NONE) +    { +	if (*c < 0 || *c >= size) +	    return FALSE; +    } +    else if (repeat == PIXMAN_REPEAT_NORMAL) +    { +	while (*c >= size) +	    *c -= size; +	while (*c < 0) +	    *c += size; +    } +    else if (repeat == PIXMAN_REPEAT_PAD) +    { +	*c = CLIP (*c, 0, size - 1); +    } +    else /* REFLECT */ +    { +	*c = MOD (*c, size * 2); +	if (*c >= size) +	    *c = size * 2 - *c - 1; +    } +    return TRUE; +} + +static force_inline int +pixman_fixed_to_bilinear_weight (pixman_fixed_t x) +{ +    return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) & +	   ((1 << BILINEAR_INTERPOLATION_BITS) - 1); +} + +#if BILINEAR_INTERPOLATION_BITS <= 4 +/* Inspired by Filter_32_opaque from Skia */ +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, +			uint32_t bl, uint32_t br, +			int distx, int disty) +{ +    int distxy, distxiy, distixy, distixiy; +    uint32_t lo, hi; + +    distx <<= (4 - BILINEAR_INTERPOLATION_BITS); +    disty <<= (4 - BILINEAR_INTERPOLATION_BITS); + +    distxy = distx * disty; +    distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */ +    distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */ +    distixiy = +	16 * 16 - (disty << 4) - +	(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */ + +    lo = (tl & 0xff00ff) * distixiy; +    hi = ((tl >> 8) & 0xff00ff) * distixiy; + +    lo += (tr & 0xff00ff) * distxiy; +    hi += ((tr >> 8) & 0xff00ff) * distxiy; + +    lo += (bl & 0xff00ff) * distixy; +    hi += ((bl >> 8) & 0xff00ff) * distixy; + +    lo += (br & 0xff00ff) * distxy; +    hi += ((br >> 8) & 0xff00ff) * distxy; + +    return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff); +} + +#else +#if SIZEOF_LONG > 4 + +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, +			uint32_t bl, uint32_t br, +			int distx, int disty) +{ +    uint64_t distxy, distxiy, distixy, distixiy; +    uint64_t tl64, tr64, bl64, br64; +    uint64_t f, r; + +    distx <<= (8 - BILINEAR_INTERPOLATION_BITS); +    disty <<= (8 - BILINEAR_INTERPOLATION_BITS); + +    distxy = distx * disty; +    distxiy = distx * (256 - disty); +    distixy = (256 - distx) * disty; +    distixiy = (256 - distx) * (256 - disty); + +    /* Alpha and Blue */ +    tl64 = tl & 0xff0000ff; +    tr64 = tr & 0xff0000ff; +    bl64 = bl & 0xff0000ff; +    br64 = br & 0xff0000ff; + +    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; +    r = f & 0x0000ff0000ff0000ull; + +    /* Red and Green */ +    tl64 = tl; +    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull); + +    tr64 = tr; +    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 
0x0000ff00ull); + +    bl64 = bl; +    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull); + +    br64 = br; +    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull); + +    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; +    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull); + +    return (uint32_t)(r >> 16); +} + +#else + +static force_inline uint32_t +bilinear_interpolation (uint32_t tl, uint32_t tr, +			uint32_t bl, uint32_t br, +			int distx, int disty) +{ +    int distxy, distxiy, distixy, distixiy; +    uint32_t f, r; + +    distx <<= (8 - BILINEAR_INTERPOLATION_BITS); +    disty <<= (8 - BILINEAR_INTERPOLATION_BITS); + +    distxy = distx * disty; +    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */ +    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */ +    distixiy = +	256 * 256 - (disty << 8) - +	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */ + +    /* Blue */ +    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy; + +    /* Green */ +    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy; +    r |= f & 0xff000000; + +    tl >>= 16; +    tr >>= 16; +    bl >>= 16; +    br >>= 16; +    r >>= 16; + +    /* Red */ +    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy; +    r |= f & 0x00ff0000; + +    /* Alpha */ +    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy; +    r |= f & 0xff000000; + +    return r; +} + +#endif +#endif // BILINEAR_INTERPOLATION_BITS <= 4 + +static force_inline argb_t +bilinear_interpolation_float (argb_t tl, argb_t tr, +			      argb_t bl, argb_t br, +			      float distx, float disty) +{ +    float distxy, distxiy, distixy, distixiy; +    argb_t r; + +    distxy = distx * disty; +    distxiy = distx * (1.f - disty); +    distixy = (1.f - distx) * disty; +    distixiy = (1.f - distx) * (1.f - disty); + +    r.a = tl.a * distixiy + tr.a * distxiy + +          bl.a * distixy  + br.a * distxy; +    r.r = tl.r * distixiy + tr.r * distxiy + +          bl.r * distixy  + br.r * distxy; +    r.g = tl.g * distixiy + tr.g * distxiy + +          bl.g * distixy  + br.g * distxy; +    r.b = tl.b * distixiy + tr.b * distxiy + +          bl.b * distixy  + br.b * distxy; + +    return r; +} + +/* + * For each scanline fetched from source image with PAD repeat: + * - calculate how many pixels need to be padded on the left side + * - calculate how many pixels need to be padded on the right side + * - update width to only count pixels which are fetched from the image + * All this information is returned via 'width', 'left_pad', 'right_pad' + * arguments. The code is assuming that 'unit_x' is positive. + * + * Note: 64-bit math is used in order to avoid potential overflows, which + *       is probably excessive in many cases. This particular function + *       may need its own correctness test and performance tuning. 
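+ *
+ * Worked example (illustrative, added for this listing; the values are
+ * hypothetical): with source_image_width = 100, unit_x = pixman_fixed_1,
+ * *width = 110 and vx = -3 * pixman_fixed_1, the function yields
+ * left_pad = 3, *width = 100 and right_pad = 7: the first three
+ * destination pixels sample before the image, the next hundred fall
+ * inside it, and the last seven sample past its right edge.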
+ */ +static force_inline void +pad_repeat_get_scanline_bounds (int32_t         source_image_width, +				pixman_fixed_t  vx, +				pixman_fixed_t  unit_x, +				int32_t *       width, +				int32_t *       left_pad, +				int32_t *       right_pad) +{ +    int64_t max_vx = (int64_t) source_image_width << 16; +    int64_t tmp; +    if (vx < 0) +    { +	tmp = ((int64_t) unit_x - 1 - vx) / unit_x; +	if (tmp > *width) +	{ +	    *left_pad = *width; +	    *width = 0; +	} +	else +	{ +	    *left_pad = (int32_t) tmp; +	    *width -= (int32_t) tmp; +	} +    } +    else +    { +	*left_pad = 0; +    } +    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad; +    if (tmp < 0) +    { +	*right_pad = *width; +	*width = 0; +    } +    else if (tmp >= *width) +    { +	*right_pad = 0; +    } +    else +    { +	*right_pad = *width - (int32_t) tmp; +	*width = (int32_t) tmp; +    } +} + +/* A macroified version of specialized nearest scalers for some + * common 8888 and 565 formats. It supports SRC and OVER ops. + * + * There are two repeat versions, one that handles repeat normal, + * and one without repeat handling that only works if the src region + * used is completely covered by the pre-repeated source samples. + * + * The loops are unrolled to process two pixels per iteration for better + * performance on most CPU architectures (superscalar processors + * can issue several operations simultaneously, other processors can hide + * instructions latencies by pipelining operations). Unrolling more + * does not make much sense because the compiler will start running out + * of spare registers soon. + */ + +#define GET_8888_ALPHA(s) ((s) >> 24) + /* This is not actually used since we don't have an OVER with +    565 source, but it is needed to build. */ +#define GET_0565_ALPHA(s) 0xff +#define GET_x888_ALPHA(s) 0xff + +#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\ +			      src_type_t, dst_type_t, OP, repeat_mode)				\ +static force_inline void									\ +scanline_func_name (dst_type_t       *dst,							\ +		    const src_type_t *src,							\ +		    int32_t           w,							\ +		    pixman_fixed_t    vx,							\ +		    pixman_fixed_t    unit_x,							\ +		    pixman_fixed_t    src_width_fixed,						\ +		    pixman_bool_t     fully_transparent_src)					\ +{												\ +	uint32_t   d;										\ +	src_type_t s1, s2;									\ +	uint8_t    a1, a2;									\ +	int        x1, x2;									\ +												\ +	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)			\ +	    return;										\ +												\ +	if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\ +	    abort();										\ +												\ +	while ((w -= 2) >= 0)									\ +	{											\ +	    x1 = pixman_fixed_to_int (vx);							\ +	    vx += unit_x;									\ +	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\ +	    {											\ +		/* This works because we know that unit_x is positive */			\ +		while (vx >= 0)									\ +		    vx -= src_width_fixed;							\ +	    }											\ +	    s1 = *(src + x1);									\ +												\ +	    x2 = pixman_fixed_to_int (vx);							\ +	    vx += unit_x;									\ +	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\ +	    {											\ +		/* This works because we know that unit_x is positive */			\ +		while (vx >= 0)									\ +		    vx -= src_width_fixed;							\ +	    }											\ +	    s2 = *(src + x2);									\ +												\ +	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\ +	    {											\ +		
a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\ +		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\ +												\ +		if (a1 == 0xff)									\ +		{										\ +		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\ +		}										\ +		else if (s1)									\ +		{										\ +		    d = convert_ ## DST_FORMAT ## _to_8888 (*dst);				\ +		    s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1);				\ +		    a1 ^= 0xff;									\ +		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\ +		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\ +		}										\ +		dst++;										\ +												\ +		if (a2 == 0xff)									\ +		{										\ +		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2);			\ +		}										\ +		else if (s2)									\ +		{										\ +		    d = convert_## DST_FORMAT ## _to_8888 (*dst);				\ +		    s2 = convert_## SRC_FORMAT ## _to_8888 (s2);				\ +		    a2 ^= 0xff;									\ +		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\ +		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\ +		}										\ +		dst++;										\ +	    }											\ +	    else /* PIXMAN_OP_SRC */								\ +	    {											\ +		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\ +		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2);			\ +	    }											\ +	}											\ +												\ +	if (w & 1)										\ +	{											\ +	    x1 = pixman_fixed_to_int (vx);							\ +	    s1 = *(src + x1);									\ +												\ +	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\ +	    {											\ +		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\ +												\ +		if (a1 == 0xff)									\ +		{										\ +		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\ +		}										\ +		else if (s1)									\ +		{										\ +		    d = convert_## DST_FORMAT ## _to_8888 (*dst);				\ +		    s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1);				\ +		    a1 ^= 0xff;									\ +		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\ +		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\ +		}										\ +		dst++;										\ +	    }											\ +	    else /* PIXMAN_OP_SRC */								\ +	    {											\ +		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\ +	    }											\ +	}											\ +} + +#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\ +				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\ +static void											\ +fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\ +						   pixman_composite_info_t *info)               \ +{												\ +    PIXMAN_COMPOSITE_ARGS (info);					                        \ +    dst_type_t *dst_line;						                        \ +    mask_type_t *mask_line;									\ +    src_type_t *src_first_line;									\ +    int       y;										\ +    pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width);		\ +    pixman_fixed_t max_vy;									\ +    pixman_vector_t v;										\ +    pixman_fixed_t vx, vy;									\ +    pixman_fixed_t unit_x, unit_y;								\ +    int32_t left_pad, right_pad;								\ +												\ +    src_type_t *src;										\ +    dst_type_t *dst;										\ +    mask_type_t solid_mask;									\ +    const mask_type_t *mask = &solid_mask;							\ +    int src_stride, mask_stride, dst_stride;							\ +												\ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\ +    if (have_mask)										\ +    {												\ +	if (mask_is_solid)									\ +	    solid_mask = _pixman_image_get_solid (imp, 
mask_image, dest_image->bits.format);	\ +	else											\ +	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\ +				   mask_stride, mask_line, 1);					\ +    }												\ +    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\ +     * transformed from destination space to source space */					\ +    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\ +												\ +    /* reference point is the center of the pixel */						\ +    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\ +    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\ +    v.vector[2] = pixman_fixed_1;								\ +												\ +    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\ +	return;											\ +												\ +    unit_x = src_image->common.transform->matrix[0][0];						\ +    unit_y = src_image->common.transform->matrix[1][1];						\ +												\ +    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\ +    v.vector[0] -= pixman_fixed_e;								\ +    v.vector[1] -= pixman_fixed_e;								\ +												\ +    vx = v.vector[0];										\ +    vy = v.vector[1];										\ +												\ +    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\ +    {												\ +	max_vy = pixman_int_to_fixed (src_image->bits.height);					\ +												\ +	/* Clamp repeating positions inside the actual samples */				\ +	repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);					\ +	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\ +    }												\ +												\ +    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\ +	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\ +    {												\ +	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\ +					&width, &left_pad, &right_pad);				\ +	vx += left_pad * unit_x;								\ +    }												\ +												\ +    while (--height >= 0)									\ +    {												\ +	dst = dst_line;										\ +	dst_line += dst_stride;									\ +	if (have_mask && !mask_is_solid)							\ +	{											\ +	    mask = mask_line;									\ +	    mask_line += mask_stride;								\ +	}											\ +												\ +	y = pixman_fixed_to_int (vy);								\ +	vy += unit_y;										\ +	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\ +	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\ +	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\ +	{											\ +	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\ +	    src = src_first_line + src_stride * y;						\ +	    if (left_pad > 0)									\ +	    {											\ +		scanline_func (mask, dst,							\ +			       src + src_image->bits.width - src_image->bits.width + 1,		\ +			       left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\ +	    }											\ +	    if (width > 0)									\ +	    {											\ +		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\ +			       dst + left_pad, src + src_image->bits.width, width,		\ +			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\ +	    }											\ +	    if (right_pad > 0)									\ +	    {											\ +		scanline_func (mask + (mask_is_solid ? 
0 : left_pad + width),			\ +			       dst + left_pad + width, src + src_image->bits.width,		\ +			       right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\ +	    }											\ +	}											\ +	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\ +	{											\ +	    static const src_type_t zero[1] = { 0 };						\ +	    if (y < 0 || y >= src_image->bits.height)						\ +	    {											\ +		scanline_func (mask, dst, zero + 1, left_pad + width + right_pad,		\ +			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\ +		continue;									\ +	    }											\ +	    src = src_first_line + src_stride * y;						\ +	    if (left_pad > 0)									\ +	    {											\ +		scanline_func (mask, dst, zero + 1, left_pad,					\ +			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\ +	    }											\ +	    if (width > 0)									\ +	    {											\ +		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\ +			       dst + left_pad, src + src_image->bits.width, width,		\ +			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\ +	    }											\ +	    if (right_pad > 0)									\ +	    {											\ +		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\ +			       dst + left_pad + width, zero + 1, right_pad,			\ +			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\ +	    }											\ +	}											\ +	else											\ +	{											\ +	    src = src_first_line + src_stride * y;						\ +	    scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed,	\ +			   unit_x, src_width_fixed, FALSE);					\ +	}											\ +    }												\ +} + +/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ +#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\ +				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\ +	FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,	\ +				  dst_type_t, repeat_mode, have_mask, mask_is_solid) + +#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t,	\ +			      repeat_mode)							\ +    static force_inline void									\ +    scanline_func##scale_func_name##_wrapper (							\ +		    const uint8_t    *mask,							\ +		    dst_type_t       *dst,							\ +		    const src_type_t *src,							\ +		    int32_t          w,								\ +		    pixman_fixed_t   vx,							\ +		    pixman_fixed_t   unit_x,							\ +		    pixman_fixed_t   max_vx,							\ +		    pixman_bool_t    fully_transparent_src)					\ +    {												\ +	scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src);			\ +    }												\ +    FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper,	\ +			       src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE) + +#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,		\ +			      repeat_mode)							\ +	FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t,		\ +			      dst_type_t, repeat_mode) + +#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,				\ +		     src_type_t, dst_type_t, OP, repeat_mode)				\ +    FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\ +			  SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,		\ +			  OP, repeat_mode)						\ +    FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP,			\ +			  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\ +			  
src_type_t, dst_type_t, repeat_mode) + + +#define SCALED_NEAREST_FLAGS						\ +    (FAST_PATH_SCALE_TRANSFORM	|					\ +     FAST_PATH_NO_ALPHA_MAP	|					\ +     FAST_PATH_NEAREST_FILTER	|					\ +     FAST_PATH_NO_ACCESSORS	|					\ +     FAST_PATH_NARROW_FORMAT) + +#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)			\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_NEAREST_FLAGS		|				\ +	 FAST_PATH_NORMAL_REPEAT	|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_null, 0,							\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)			\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_NEAREST_FLAGS		|				\ +	 FAST_PATH_PAD_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_null, 0,							\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)			\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_NEAREST_FLAGS		|				\ +	 FAST_PATH_NONE_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_null, 0,							\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,    \ +	PIXMAN_null, 0,							\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_NEAREST_FLAGS		|				\ +	 FAST_PATH_NORMAL_REPEAT	|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_NEAREST_FLAGS		|				\ +	 FAST_PATH_PAD_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_NEAREST_FLAGS		|				\ +	 FAST_PATH_NONE_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,	\ +	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_NEAREST_FLAGS		|				\ +	 FAST_PATH_NORMAL_REPEAT	|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\ +	PIXMAN_ 
## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_NEAREST_FLAGS		|				\ +	 FAST_PATH_PAD_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_NEAREST_FLAGS		|				\ +	 FAST_PATH_NONE_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\ +    } + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,	\ +	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\ +    } + +/* Prefer the use of 'cover' variant, because it is faster */ +#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\ +    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\ +    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),			\ +    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\ +    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) + +#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)			\ +    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\ +    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\ +    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + +#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)		\ +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\ +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\ +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),              \ +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func) + +/*****************************************************************************/ + +/* + * Identify 5 zones in each scanline for bilinear scaling. Depending on + * whether 2 pixels to be interpolated are fetched from the image itself, + * from the padding area around it or from both image and padding area. + */ +static force_inline void +bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width, +					 pixman_fixed_t  vx, +					 pixman_fixed_t  unit_x, +					 int32_t *       left_pad, +					 int32_t *       left_tz, +					 int32_t *       width, +					 int32_t *       right_tz, +					 int32_t *       right_pad) +{ +	int width1 = *width, left_pad1, right_pad1; +	int width2 = *width, left_pad2, right_pad2; + +	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x, +					&width1, &left_pad1, &right_pad1); +	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1, +					unit_x, &width2, &left_pad2, &right_pad2); + +	*left_pad = left_pad2; +	*left_tz = left_pad1 - left_pad2; +	*right_tz = right_pad2 - right_pad1; +	*right_pad = right_pad1; +	*width -= *left_pad + *left_tz + *right_tz + *right_pad; +} + +/* + * Main loop template for single pass bilinear scaling. It needs to be + * provided with 'scanline_func' which should do the compositing operation. 
+ * The needed function has the following prototype: + * + *	scanline_func (dst_type_t *       dst, + *		       const mask_type_ * mask, + *		       const src_type_t * src_top, + *		       const src_type_t * src_bottom, + *		       int32_t            width, + *		       int                weight_top, + *		       int                weight_bottom, + *		       pixman_fixed_t     vx, + *		       pixman_fixed_t     unit_x, + *		       pixman_fixed_t     max_vx, + *		       pixman_bool_t      zero_src) + * + * Where: + *  dst                 - destination scanline buffer for storing results + *  mask                - mask buffer (or single value for solid mask) + *  src_top, src_bottom - two source scanlines + *  width               - number of pixels to process + *  weight_top          - weight of the top row for interpolation + *  weight_bottom       - weight of the bottom row for interpolation + *  vx                  - initial position for fetching the first pair of + *                        pixels from the source buffer + *  unit_x              - position increment needed to move to the next pair + *                        of pixels + *  max_vx              - image size as a fixed point value, can be used for + *                        implementing NORMAL repeat (when it is supported) + *  zero_src            - boolean hint variable, which is set to TRUE when + *                        all source pixels are fetched from zero padding + *                        zone for NONE repeat + * + * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to + *       BILINEAR_INTERPOLATION_RANGE, but sometimes it may be less than that + *       for NONE repeat when handling fuzzy antialiased top or bottom image + *       edges. Also both top and bottom weight variables are guaranteed to + *       have value, which is less than BILINEAR_INTERPOLATION_RANGE. + *       For example, the weights can fit into unsigned byte or be used + *       with 8-bit SIMD multiplication instructions for 8-bit interpolation + *       precision. 
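+ *
+ * Illustrative example (added for this listing, assuming
+ * BILINEAR_INTERPOLATION_BITS is 8 so that the interpolation range is
+ * 256): a vertical position whose fractional part is 0.25 gives
+ * weight_bottom = 64 and weight_top = 192 via
+ * pixman_fixed_to_bilinear_weight() above, so both weights fit in an
+ * unsigned byte as noted.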
+ */ +#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\ +				  dst_type_t, repeat_mode, flags)				\ +static void											\ +fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\ +						   pixman_composite_info_t *info)		\ +{												\ +    PIXMAN_COMPOSITE_ARGS (info);								\ +    dst_type_t *dst_line;									\ +    mask_type_t *mask_line;									\ +    src_type_t *src_first_line;									\ +    int       y1, y2;										\ +    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\ +    pixman_vector_t v;										\ +    pixman_fixed_t vx, vy;									\ +    pixman_fixed_t unit_x, unit_y;								\ +    int32_t left_pad, left_tz, right_tz, right_pad;						\ +												\ +    dst_type_t *dst;										\ +    mask_type_t solid_mask;									\ +    const mask_type_t *mask = &solid_mask;							\ +    int src_stride, mask_stride, dst_stride;							\ +												\ +    int src_width;										\ +    pixman_fixed_t src_width_fixed;								\ +    int max_x;											\ +    pixman_bool_t need_src_extension;								\ +												\ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\ +    if (flags & FLAG_HAVE_SOLID_MASK)								\ +    {												\ +	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\ +	mask_stride = 0;									\ +    }												\ +    else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\ +    {												\ +	PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,				\ +			       mask_stride, mask_line, 1);					\ +    }												\ +												\ +    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\ +     * transformed from destination space to source space */					\ +    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\ +												\ +    /* reference point is the center of the pixel */						\ +    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\ +    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\ +    v.vector[2] = pixman_fixed_1;								\ +												\ +    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\ +	return;											\ +												\ +    unit_x = src_image->common.transform->matrix[0][0];						\ +    unit_y = src_image->common.transform->matrix[1][1];						\ +												\ +    v.vector[0] -= pixman_fixed_1 / 2;								\ +    v.vector[1] -= pixman_fixed_1 / 2;								\ +												\ +    vy = v.vector[1];										\ +												\ +    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\ +	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\ +    {												\ +	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\ +					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\ +	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\ +	{											\ +	    /* PAD repeat does not need special handling for 'transition zones' and */		\ +	    /* they can be combined with 'padding zones' safely */				\ +	    left_pad += left_tz;								\ +	    right_pad += right_tz;								\ +	    left_tz = right_tz = 0;								\ +	}											\ +	v.vector[0] += left_pad * unit_x;							\ +    }												\ +												\ +    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\ +    {												\ +	vx = v.vector[0];									\ +	repeat (PIXMAN_REPEAT_NORMAL, &vx, 
pixman_int_to_fixed(src_image->bits.width));		\ +	max_x = pixman_fixed_to_int (vx + (width - 1) * (int64_t)unit_x) + 1;			\ +												\ +	if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)					\ +	{											\ +	    src_width = 0;									\ +												\ +	    while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x)			\ +		src_width += src_image->bits.width;						\ +												\ +	    need_src_extension = TRUE;								\ +	}											\ +	else											\ +	{											\ +	    src_width = src_image->bits.width;							\ +	    need_src_extension = FALSE;								\ +	}											\ +												\ +	src_width_fixed = pixman_int_to_fixed (src_width);					\ +    }												\ +												\ +    while (--height >= 0)									\ +    {												\ +	int weight1, weight2;									\ +	dst = dst_line;										\ +	dst_line += dst_stride;									\ +	vx = v.vector[0];									\ +	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\ +	{											\ +	    mask = mask_line;									\ +	    mask_line += mask_stride;								\ +	}											\ +												\ +	y1 = pixman_fixed_to_int (vy);								\ +	weight2 = pixman_fixed_to_bilinear_weight (vy);						\ +	if (weight2)										\ +	{											\ +	    /* both weight1 and weight2 are smaller than BILINEAR_INTERPOLATION_RANGE */	\ +	    y2 = y1 + 1;									\ +	    weight1 = BILINEAR_INTERPOLATION_RANGE - weight2;					\ +	}											\ +	else											\ +	{											\ +	    /* set both top and bottom row to the same scanline and tweak weights */		\ +	    y2 = y1;										\ +	    weight1 = weight2 = BILINEAR_INTERPOLATION_RANGE / 2;				\ +	}											\ +	vy += unit_y;										\ +	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\ +	{											\ +	    src_type_t *src1, *src2;								\ +	    src_type_t buf1[2];									\ +	    src_type_t buf2[2];									\ +	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\ +	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\ +	    src1 = src_first_line + src_stride * y1;						\ +	    src2 = src_first_line + src_stride * y2;						\ +												\ +	    if (left_pad > 0)									\ +	    {											\ +		buf1[0] = buf1[1] = src1[0];							\ +		buf2[0] = buf2[1] = src2[0];							\ +		scanline_func (dst, mask,							\ +			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\ +		dst += left_pad;								\ +		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\ +		    mask += left_pad;								\ +	    }											\ +	    if (width > 0)									\ +	    {											\ +		scanline_func (dst, mask,							\ +			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\ +		dst += width;									\ +		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\ +		    mask += width;								\ +	    }											\ +	    if (right_pad > 0)									\ +	    {											\ +		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\ +		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\ +		scanline_func (dst, mask,							\ +			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\ +	    }											\ +	}											\ +	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\ +	{											\ +	    src_type_t *src1, *src2;								\ +	    src_type_t buf1[2];									\ +	    src_type_t buf2[2];									\ +	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\ +	    if (y1 < 0)										\ +	    {											\ +		weight1 = 0;									\ +		y1 = 0;										\ +	    }											\ +	    if (y1 >= src_image->bits.height)							\ +	    {											\ +		
weight1 = 0;									\ +		y1 = src_image->bits.height - 1;						\ +	    }											\ +	    if (y2 < 0)										\ +	    {											\ +		weight2 = 0;									\ +		y2 = 0;										\ +	    }											\ +	    if (y2 >= src_image->bits.height)							\ +	    {											\ +		weight2 = 0;									\ +		y2 = src_image->bits.height - 1;						\ +	    }											\ +	    src1 = src_first_line + src_stride * y1;						\ +	    src2 = src_first_line + src_stride * y2;						\ +												\ +	    if (left_pad > 0)									\ +	    {											\ +		buf1[0] = buf1[1] = 0;								\ +		buf2[0] = buf2[1] = 0;								\ +		scanline_func (dst, mask,							\ +			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\ +		dst += left_pad;								\ +		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\ +		    mask += left_pad;								\ +	    }											\ +	    if (left_tz > 0)									\ +	    {											\ +		buf1[0] = 0;									\ +		buf1[1] = src1[0];								\ +		buf2[0] = 0;									\ +		buf2[1] = src2[0];								\ +		scanline_func (dst, mask,							\ +			       buf1, buf2, left_tz, weight1, weight2,				\ +			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\ +		dst += left_tz;									\ +		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\ +		    mask += left_tz;								\ +		vx += left_tz * unit_x;								\ +	    }											\ +	    if (width > 0)									\ +	    {											\ +		scanline_func (dst, mask,							\ +			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\ +		dst += width;									\ +		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\ +		    mask += width;								\ +		vx += width * unit_x;								\ +	    }											\ +	    if (right_tz > 0)									\ +	    {											\ +		buf1[0] = src1[src_image->bits.width - 1];					\ +		buf1[1] = 0;									\ +		buf2[0] = src2[src_image->bits.width - 1];					\ +		buf2[1] = 0;									\ +		scanline_func (dst, mask,							\ +			       buf1, buf2, right_tz, weight1, weight2,				\ +			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\ +		dst += right_tz;								\ +		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\ +		    mask += right_tz;								\ +	    }											\ +	    if (right_pad > 0)									\ +	    {											\ +		buf1[0] = buf1[1] = 0;								\ +		buf2[0] = buf2[1] = 0;								\ +		scanline_func (dst, mask,							\ +			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\ +	    }											\ +	}											\ +	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\ +	{											\ +	    int32_t	    num_pixels;								\ +	    int32_t	    width_remain;							\ +	    src_type_t *    src_line_top;							\ +	    src_type_t *    src_line_bottom;							\ +	    src_type_t	    buf1[2];								\ +	    src_type_t	    buf2[2];								\ +	    src_type_t	    extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];			\ +	    src_type_t	    extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];			\ +	    int		    i, j;								\ +												\ +	    repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);				\ +	    repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);				\ +	    src_line_top = src_first_line + src_stride * y1;					\ +	    src_line_bottom = src_first_line + src_stride * y2;					\ +												\ +	    if (need_src_extension)								\ +	    {											\ +		for (i=0; i<src_width;)								\ +		{										\ +		    for (j=0; j<src_image->bits.width; j++, i++)				\ +		    {										\ +			extended_src_line0[i] = src_line_top[j];				\ +			extended_src_line1[i] = src_line_bottom[j];				\ +		    }										\ +		}										\ +											
	\ +		src_line_top = &extended_src_line0[0];						\ +		src_line_bottom = &extended_src_line1[0];					\ +	    }											\ +												\ +	    /* Top & Bottom wrap around buffer */						\ +	    buf1[0] = src_line_top[src_width - 1];						\ +	    buf1[1] = src_line_top[0];								\ +	    buf2[0] = src_line_bottom[src_width - 1];						\ +	    buf2[1] = src_line_bottom[0];							\ +												\ +	    width_remain = width;								\ +												\ +	    while (width_remain > 0)								\ +	    {											\ +		/* We use src_width_fixed because it can make vx in original source range */	\ +		repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);				\ +												\ +		/* Wrap around part */								\ +		if (pixman_fixed_to_int (vx) == src_width - 1)					\ +		{										\ +		    /* for positive unit_x							\ +		     * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed		\ +		     *										\ +		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\ +		     * So we are safe from overflow.						\ +		     */										\ +		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\ +												\ +		    if (num_pixels > width_remain)						\ +			num_pixels = width_remain;						\ +												\ +		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\ +				   weight1, weight2, pixman_fixed_frac(vx),			\ +				   unit_x, src_width_fixed, FALSE);				\ +												\ +		    width_remain -= num_pixels;							\ +		    vx += num_pixels * unit_x;							\ +		    dst += num_pixels;								\ +												\ +		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\ +			mask += num_pixels;							\ +												\ +		    repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);			\ +		}										\ +												\ +		/* Normal scanline composite */							\ +		if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)		\ +		{										\ +		    /* for positive unit_x							\ +		     * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)	\ +		     *										\ +		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\ +		     * So we are safe from overflow here.					
\ +		     */										\ +		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\ +				  / unit_x) + 1;						\ +												\ +		    if (num_pixels > width_remain)						\ +			num_pixels = width_remain;						\ +												\ +		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\ +				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\ +												\ +		    width_remain -= num_pixels;							\ +		    vx += num_pixels * unit_x;							\ +		    dst += num_pixels;								\ +												\ +		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\ +		        mask += num_pixels;							\ +		}										\ +	    }											\ +	}											\ +	else											\ +	{											\ +	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\ +			   src_first_line + src_stride * y2, width,				\ +			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\ +	}											\ +    }												\ +} + +/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ +#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\ +				  dst_type_t, repeat_mode, flags)				\ +	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\ +				  dst_type_t, repeat_mode, flags) + +#define SCALED_BILINEAR_FLAGS						\ +    (FAST_PATH_SCALE_TRANSFORM	|					\ +     FAST_PATH_NO_ALPHA_MAP	|					\ +     FAST_PATH_BILINEAR_FILTER	|					\ +     FAST_PATH_NO_ACCESSORS	|					\ +     FAST_PATH_NARROW_FORMAT) + +#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func)			\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_BILINEAR_FLAGS		|				\ +	 FAST_PATH_PAD_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_null, 0,							\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func)			\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_BILINEAR_FLAGS		|				\ +	 FAST_PATH_NONE_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_null, 0,							\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func)			\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,	\ +	PIXMAN_null, 0,							\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op,s,d,func)			\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_BILINEAR_FLAGS		|				\ +	 FAST_PATH_NORMAL_REPEAT	|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_null, 0,							\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_BILINEAR_FLAGS		|				\ +	 FAST_PATH_PAD_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_BILINEAR_FLAGS		|				\ +	 FAST_PATH_NONE_REPEAT		|				\ +	 
FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,	\ +	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_BILINEAR_FLAGS		|				\ +	 FAST_PATH_NORMAL_REPEAT	|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_BILINEAR_FLAGS		|				\ +	 FAST_PATH_PAD_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_BILINEAR_FLAGS		|				\ +	 FAST_PATH_NONE_REPEAT		|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,	\ +	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\ +    } + +#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)	\ +    {   PIXMAN_OP_ ## op,						\ +	PIXMAN_ ## s,							\ +	(SCALED_BILINEAR_FLAGS		|				\ +	 FAST_PATH_NORMAL_REPEAT	|				\ +	 FAST_PATH_X_UNIT_POSITIVE),					\ +	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\ +	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\ +	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\ +    } + +/* Prefer the use of 'cover' variant, because it is faster */ +#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)				\ +    SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),			\ +    SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),			\ +    SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func),			\ +    SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func) + +#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)			\ +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\ +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\ +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func),		\ +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func) + +#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)		\ +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\ +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\ +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),		\ +    
SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func) + +#endif diff --git a/libs/pixman-0.40.0/pixman/pixman-linear-gradient.c b/libs/pixman-0.40.0/pixman/pixman-linear-gradient.c new file mode 100644 index 0000000..3f52850 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-linear-gradient.c @@ -0,0 +1,292 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + *             2005 Lars Knoll & Zack Rusin, Trolltech + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <stdlib.h> +#include "pixman-private.h" + +static pixman_bool_t +linear_gradient_is_horizontal (pixman_image_t *image, +			       int             x, +			       int             y, +			       int             width, +			       int             height) +{ +    linear_gradient_t *linear = (linear_gradient_t *)image; +    pixman_vector_t v; +    pixman_fixed_32_32_t l; +    pixman_fixed_48_16_t dx, dy; +    double inc; + +    if (image->common.transform) +    { +	/* projective transformation */ +	if (image->common.transform->matrix[2][0] != 0 || +	    image->common.transform->matrix[2][1] != 0 || +	    image->common.transform->matrix[2][2] == 0) +	{ +	    return FALSE; +	} + +	v.vector[0] = image->common.transform->matrix[0][1]; +	v.vector[1] = image->common.transform->matrix[1][1]; +	v.vector[2] = image->common.transform->matrix[2][2]; +    } +    else +    { +	v.vector[0] = 0; +	v.vector[1] = pixman_fixed_1; +	v.vector[2] = pixman_fixed_1; +    } + +    dx = linear->p2.x - linear->p1.x; +    dy = linear->p2.y - linear->p1.y; + +    l = dx * dx + dy * dy; + +    if (l == 0) +	return FALSE; + +    /* +     * compute how much the input of the gradient walked changes +     * when moving vertically through the whole image +     */ +    inc = height * (double) pixman_fixed_1 * pixman_fixed_1 * +	(dx * v.vector[0] + dy * v.vector[1]) / +	(v.vector[2] * (double) l); + +    /* check that casting to integer would result in 0 */ +    if (-1 < inc && inc < 1) +	return TRUE; + +    return FALSE; +} + +static uint32_t * +linear_get_scanline (pixman_iter_t                 *iter, +		     const uint32_t                *mask, +		     int                            Bpp, +		     pixman_gradient_walker_write_t write_pixel, +		     
pixman_gradient_walker_fill_t  fill_pixel) +{ +    pixman_image_t *image  = iter->image; +    int             x      = iter->x; +    int             y      = iter->y; +    int             width  = iter->width; +    uint32_t *      buffer = iter->buffer; + +    pixman_vector_t v, unit; +    pixman_fixed_32_32_t l; +    pixman_fixed_48_16_t dx, dy; +    gradient_t *gradient = (gradient_t *)image; +    linear_gradient_t *linear = (linear_gradient_t *)image; +    uint32_t *end = buffer + width * (Bpp / 4); +    pixman_gradient_walker_t walker; + +    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); + +    /* reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (image->common.transform) +    { +	if (!pixman_transform_point_3d (image->common.transform, &v)) +	    return iter->buffer; + +	unit.vector[0] = image->common.transform->matrix[0][0]; +	unit.vector[1] = image->common.transform->matrix[1][0]; +	unit.vector[2] = image->common.transform->matrix[2][0]; +    } +    else +    { +	unit.vector[0] = pixman_fixed_1; +	unit.vector[1] = 0; +	unit.vector[2] = 0; +    } + +    dx = linear->p2.x - linear->p1.x; +    dy = linear->p2.y - linear->p1.y; + +    l = dx * dx + dy * dy; + +    if (l == 0 || unit.vector[2] == 0) +    { +	/* affine transformation only */ +	pixman_fixed_32_32_t t, next_inc; +	double inc; + +	if (l == 0 || v.vector[2] == 0) +	{ +	    t = 0; +	    inc = 0; +	} +	else +	{ +	    double invden, v2; + +	    invden = pixman_fixed_1 * (double) pixman_fixed_1 / +		(l * (double) v.vector[2]); +	    v2 = v.vector[2] * (1. / pixman_fixed_1); +	    t = ((dx * v.vector[0] + dy * v.vector[1]) - +		 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden; +	    inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden; +	} +	next_inc = 0; + +	if (((pixman_fixed_32_32_t )(inc * width)) == 0) +	{ +	    fill_pixel (&walker, t, buffer, end); +	} +	else +	{ +	    int i; + +	    i = 0; +	    while (buffer < end) +	    { +		if (!mask || *mask++) +		{ +		    write_pixel (&walker, t + next_inc, buffer); +		} +		i++; +		next_inc = inc * i; +		buffer += (Bpp / 4); +	    } +	} +    } +    else +    { +	/* projective transformation */ +        double t; + +	t = 0; + +	while (buffer < end) +	{ +	    if (!mask || *mask++) +	    { +	        if (v.vector[2] != 0) +		{ +		    double invden, v2; + +		    invden = pixman_fixed_1 * (double) pixman_fixed_1 / +			(l * (double) v.vector[2]); +		    v2 = v.vector[2] * (1. 
/ pixman_fixed_1); +		    t = ((dx * v.vector[0] + dy * v.vector[1]) - +			 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden; +		} + +		write_pixel (&walker, t, buffer); +	    } + +	    buffer += (Bpp / 4); + +	    v.vector[0] += unit.vector[0]; +	    v.vector[1] += unit.vector[1]; +	    v.vector[2] += unit.vector[2]; +	} +    } + +    iter->y++; + +    return iter->buffer; +} + +static uint32_t * +linear_get_scanline_narrow (pixman_iter_t  *iter, +			    const uint32_t *mask) +{ +    return linear_get_scanline (iter, mask, 4, +				_pixman_gradient_walker_write_narrow, +				_pixman_gradient_walker_fill_narrow); +} + + +static uint32_t * +linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) +{ +    return linear_get_scanline (iter, NULL, 16, +				_pixman_gradient_walker_write_wide, +				_pixman_gradient_walker_fill_wide); +} + +void +_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t  *iter) +{ +    if (linear_gradient_is_horizontal ( +	    iter->image, iter->x, iter->y, iter->width, iter->height)) +    { +	if (iter->iter_flags & ITER_NARROW) +	    linear_get_scanline_narrow (iter, NULL); +	else +	    linear_get_scanline_wide (iter, NULL); + +	iter->get_scanline = _pixman_iter_get_scanline_noop; +    } +    else +    { +	if (iter->iter_flags & ITER_NARROW) +	    iter->get_scanline = linear_get_scanline_narrow; +	else +	    iter->get_scanline = linear_get_scanline_wide; +    } +} + +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_linear_gradient (const pixman_point_fixed_t *  p1, +                                     const pixman_point_fixed_t *  p2, +                                     const pixman_gradient_stop_t *stops, +                                     int                           n_stops) +{ +    pixman_image_t *image; +    linear_gradient_t *linear; + +    image = _pixman_image_allocate (); + +    if (!image) +	return NULL; + +    linear = &image->linear; + +    if (!_pixman_init_gradient (&linear->common, stops, n_stops)) +    { +	free (image); +	return NULL; +    } + +    linear->p1 = *p1; +    linear->p2 = *p2; + +    image->type = LINEAR; + +    return image; +} + diff --git a/libs/pixman-0.40.0/pixman/pixman-matrix.c b/libs/pixman-0.40.0/pixman/pixman-matrix.c new file mode 100644 index 0000000..81b6e61 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-matrix.c @@ -0,0 +1,1073 @@ +/* + * Copyright © 2008 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that copyright + * notice and this permission notice appear in supporting documentation, and + * that the name of the copyright holders not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  The copyright holders make no representations + * about the suitability of this software for any purpose.  It is provided "as + * is" without express or implied warranty. 
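A note on the linear-gradient scanline math above: for each pixel the iterator projects the transformed point onto the gradient axis p1 -> p2 and normalizes by |p2 - p1|^2, so the affine case reduces to t = (dx*(x - p1.x) + dy*(y - p1.y)) / l with l = dx*dx + dy*dy. A minimal floating-point sketch of that projection, illustrative only and not part of the pixman sources (the helper name is invented, and plain doubles stand in for pixman's 16.16/48.16 fixed point):

    /* Illustrative sketch: gradient position of point (px, py) for a linear
     * gradient running from (x1, y1) to (x2, y2); t = 0 at p1, t = 1 at p2.
     * Mirrors the projection in linear_get_scanline, but in doubles. */
    static double
    linear_gradient_t (double px, double py,
                       double x1, double y1, double x2, double y2)
    {
        double dx = x2 - x1;
        double dy = y2 - y1;
        double l  = dx * dx + dy * dy;   /* |p2 - p1|^2 */

        if (l == 0.0)
            return 0.0;                  /* degenerate gradient, like the l == 0 checks above */

        return (dx * (px - x1) + dy * (py - y1)) / l;
    }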
+ * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THIS SOFTWARE. + */ + +/* + * Matrix interfaces + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <math.h> +#include <string.h> +#include "pixman-private.h" + +#define F(x)    pixman_int_to_fixed (x) + +static force_inline int +count_leading_zeros (uint32_t x) +{ +#ifdef HAVE_BUILTIN_CLZ +    return __builtin_clz (x); +#else +    int n = 0; +    while (x) +    { +        n++; +        x >>= 1; +    } +    return 32 - n; +#endif +} + +/* + * Large signed/unsigned integer division with rounding for the platforms with + * only 64-bit integer data type supported (no 128-bit data type). + * + * Arguments: + *     hi, lo - high and low 64-bit parts of the dividend + *     div    - 48-bit divisor + * + * Returns: lowest 64 bits of the result as a return value and highest 64 + *          bits of the result to "result_hi" pointer + */ + +/* grade-school unsigned division (128-bit by 48-bit) with rounding to nearest */ +static force_inline uint64_t +rounded_udiv_128_by_48 (uint64_t  hi, +                        uint64_t  lo, +                        uint64_t  div, +                        uint64_t *result_hi) +{ +    uint64_t tmp, remainder, result_lo; +    assert(div < ((uint64_t)1 << 48)); + +    remainder = hi % div; +    *result_hi = hi / div; + +    tmp = (remainder << 16) + (lo >> 48); +    result_lo = tmp / div; +    remainder = tmp % div; + +    tmp = (remainder << 16) + ((lo >> 32) & 0xFFFF); +    result_lo = (result_lo << 16) + (tmp / div); +    remainder = tmp % div; + +    tmp = (remainder << 16) + ((lo >> 16) & 0xFFFF); +    result_lo = (result_lo << 16) + (tmp / div); +    remainder = tmp % div; + +    tmp = (remainder << 16) + (lo & 0xFFFF); +    result_lo = (result_lo << 16) + (tmp / div); +    remainder = tmp % div; + +    /* round to nearest */ +    if (remainder * 2 >= div && ++result_lo == 0) +        *result_hi += 1; + +    return result_lo; +} + +/* signed division (128-bit by 49-bit) with rounding to nearest */ +static inline int64_t +rounded_sdiv_128_by_49 (int64_t   hi, +                        uint64_t  lo, +                        int64_t   div, +                        int64_t  *signed_result_hi) +{ +    uint64_t result_lo, result_hi; +    int sign = 0; +    if (div < 0) +    { +        div = -div; +        sign ^= 1; +    } +    if (hi < 0) +    { +        if (lo != 0) +            hi++; +        hi = -hi; +        lo = -lo; +        sign ^= 1; +    } +    result_lo = rounded_udiv_128_by_48 (hi, lo, div, &result_hi); +    if (sign) +    { +        if (result_lo != 0) +            result_hi++; +        result_hi = -result_hi; +        result_lo = -result_lo; +    } +    if (signed_result_hi) +    { +        *signed_result_hi = result_hi; +    } +    return result_lo; +} + +/* + * Multiply 64.16 fixed point value by (2^scalebits) and convert + * to 128-bit integer. 
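The grade-school division above walks the low 64 bits of the dividend in four 16-bit limbs, carrying the remainder from one step to the next and rounding to nearest at the end. A self-contained sketch of the same limb-by-limb technique at a smaller width (32-bit dividend, 8-bit limbs, 16-bit divisor); it is illustrative only, independent of the pixman sources, and the function name is invented:

    #include <stdint.h>

    /* Rounded division of a 32-bit dividend by a non-zero 16-bit divisor,
     * done in 8-bit limbs the same way rounded_udiv_128_by_48 consumes the
     * low 64 bits in 16-bit limbs. */
    static uint32_t
    rounded_udiv_32_by_16 (uint32_t lo, uint16_t div)
    {
        uint32_t tmp, rem = 0, q = 0;
        int shift;

        for (shift = 24; shift >= 0; shift -= 8)
        {
            tmp = (rem << 8) + ((lo >> shift) & 0xFF);
            q   = (q << 8) + tmp / div;
            rem = tmp % div;
        }

        if (rem * 2 >= div)   /* round to nearest, as in the original */
            q++;

        return q;
    }

For every input this agrees with ((uint64_t) lo + div / 2) / div; the point of the limb form is that the 128-by-48 original gets the same property without having a 128-bit integer type available.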
+ */ +static force_inline void +fixed_64_16_to_int128 (int64_t  hi, +                       int64_t  lo, +                       int64_t *rhi, +                       int64_t *rlo, +                       int      scalebits) +{ +    /* separate integer and fractional parts */ +    hi += lo >> 16; +    lo &= 0xFFFF; + +    if (scalebits <= 0) +    { +        *rlo = hi >> (-scalebits); +        *rhi = *rlo >> 63; +    } +    else +    { +        *rhi = hi >> (64 - scalebits); +        *rlo = (uint64_t)hi << scalebits; +        if (scalebits < 16) +            *rlo += lo >> (16 - scalebits); +        else +            *rlo += lo << (scalebits - 16); +    } +} + +/* + * Convert 112.16 fixed point value to 48.16 with clamping for the out + * of range values. + */ +static force_inline pixman_fixed_48_16_t +fixed_112_16_to_fixed_48_16 (int64_t hi, int64_t lo, pixman_bool_t *clampflag) +{ +    if ((lo >> 63) != hi) +    { +        *clampflag = TRUE; +        return hi >= 0 ? INT64_MAX : INT64_MIN; +    } +    else +    { +        return lo; +    } +} + +/* + * Transform a point with 31.16 fixed point coordinates from the destination + * space to a point with 48.16 fixed point coordinates in the source space. + * No overflows are possible for affine transformations and the results are + * accurate including the least significant bit. Projective transformations + * may overflow, in this case the results are just clamped to return maximum + * or minimum 48.16 values (so that the caller can at least handle the NONE + * and PAD repeats correctly) and the return value is FALSE to indicate that + * such clamping has happened. + */ +PIXMAN_EXPORT pixman_bool_t +pixman_transform_point_31_16 (const pixman_transform_t    *t, +                              const pixman_vector_48_16_t *v, +                              pixman_vector_48_16_t       *result) +{ +    pixman_bool_t clampflag = FALSE; +    int i; +    int64_t tmp[3][2], divint; +    uint16_t divfrac; + +    /* input vector values must have no more than 31 bits (including sign) +     * in the integer part */ +    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + +    for (i = 0; i < 3; i++) +    { +        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16); +        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF); +        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16); +        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF); +        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16); +        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF); +    } + +    /* +     * separate 64-bit integer and 16-bit fractional parts for the divisor, +     * which is also scaled by 65536 after fixed point multiplication. 
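In the loop above each matrix element is multiplied against the coordinate split into its integer part (v >> 16) and its 16-bit fraction (v & 0xFFFF), and each row is later recombined as hi + ((lo + 0x8000) >> 16), i.e. rounded to the nearest 48.16 value, so no 128-bit intermediate is needed. A small sketch of that split-and-recombine step in isolation, illustrative only (the helper names are invented and the result is assumed to fit in 64 bits):

    #include <stdint.h>

    /* Split multiply of a 16.16 factor m by a 48.16 value v, mirroring how
     * pixman_transform_point_31_16 accumulates tmp[i][0] / tmp[i][1]. */
    static void
    fixed_mul_split (int32_t m, int64_t v, int64_t *hi, int64_t *lo)
    {
        *hi = (int64_t) m * (v >> 16);      /* m times the integer part    */
        *lo = (int64_t) m * (v & 0xFFFF);   /* m times the 16-bit fraction */
    }

    /* Rounded 48.16 product, valid whenever the result fits in 64 bits. */
    static int64_t
    fixed_mul_round (int32_t m, int64_t v)
    {
        int64_t hi, lo;

        fixed_mul_split (m, v, &hi, &lo);
        return hi + ((lo + 0x8000) >> 16);
    }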
+     */ +    divint  = tmp[2][0] + (tmp[2][1] >> 16); +    divfrac = tmp[2][1] & 0xFFFF; + +    if (divint == pixman_fixed_1 && divfrac == 0) +    { +        /* +         * this is a simple affine transformation +         */ +        result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); +        result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); +        result->v[2] = pixman_fixed_1; +    } +    else if (divint == 0 && divfrac == 0) +    { +        /* +         * handle zero divisor (if the values are non-zero, set the +         * results to maximum positive or minimum negative) +         */ +        clampflag = TRUE; + +        result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); +        result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); + +        if (result->v[0] > 0) +            result->v[0] = INT64_MAX; +        else if (result->v[0] < 0) +            result->v[0] = INT64_MIN; + +        if (result->v[1] > 0) +            result->v[1] = INT64_MAX; +        else if (result->v[1] < 0) +            result->v[1] = INT64_MIN; +    } +    else +    { +        /* +         * projective transformation, analyze the top 32 bits of the divisor +         */ +        int32_t hi32divbits = divint >> 32; +        if (hi32divbits < 0) +            hi32divbits = ~hi32divbits; + +        if (hi32divbits == 0) +        { +            /* the divisor is small, we can actually keep all the bits */ +            int64_t hi, rhi, lo, rlo; +            int64_t div = ((uint64_t)divint << 16) + divfrac; + +            fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32); +            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); +            result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); + +            fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32); +            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); +            result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); +        } +        else +        { +            /* the divisor needs to be reduced to 48 bits */ +            int64_t hi, rhi, lo, rlo, div; +            int shift = 32 - count_leading_zeros (hi32divbits); +            fixed_64_16_to_int128 (divint, divfrac, &hi, &div, 16 - shift); + +            fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32 - shift); +            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); +            result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); + +            fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32 - shift); +            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi); +            result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag); +        } +    } +    result->v[2] = pixman_fixed_1; +    return !clampflag; +} + +PIXMAN_EXPORT void +pixman_transform_point_31_16_affine (const pixman_transform_t    *t, +                                     const pixman_vector_48_16_t *v, +                                     pixman_vector_48_16_t       *result) +{ +    int64_t hi0, lo0, hi1, lo1; + +    /* input vector values must have no more than 31 bits (including sign) +     * in the integer part */ +    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + +    hi0  = (int64_t)t->matrix[0][0] * (v->v[0] >> 16); +    lo0  = (int64_t)t->matrix[0][0] * (v->v[0] & 0xFFFF); +    hi0 
+= (int64_t)t->matrix[0][1] * (v->v[1] >> 16); +    lo0 += (int64_t)t->matrix[0][1] * (v->v[1] & 0xFFFF); +    hi0 += (int64_t)t->matrix[0][2]; + +    hi1  = (int64_t)t->matrix[1][0] * (v->v[0] >> 16); +    lo1  = (int64_t)t->matrix[1][0] * (v->v[0] & 0xFFFF); +    hi1 += (int64_t)t->matrix[1][1] * (v->v[1] >> 16); +    lo1 += (int64_t)t->matrix[1][1] * (v->v[1] & 0xFFFF); +    hi1 += (int64_t)t->matrix[1][2]; + +    result->v[0] = hi0 + ((lo0 + 0x8000) >> 16); +    result->v[1] = hi1 + ((lo1 + 0x8000) >> 16); +    result->v[2] = pixman_fixed_1; +} + +PIXMAN_EXPORT void +pixman_transform_point_31_16_3d (const pixman_transform_t    *t, +                                 const pixman_vector_48_16_t *v, +                                 pixman_vector_48_16_t       *result) +{ +    int i; +    int64_t tmp[3][2]; + +    /* input vector values must have no more than 31 bits (including sign) +     * in the integer part */ +    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16))); +    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16))); + +    for (i = 0; i < 3; i++) +    { +        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16); +        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF); +        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16); +        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF); +        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16); +        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF); +    } + +    result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16); +    result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16); +    result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16); +} + +PIXMAN_EXPORT void +pixman_transform_init_identity (struct pixman_transform *matrix) +{ +    int i; + +    memset (matrix, '\0', sizeof (struct pixman_transform)); +    for (i = 0; i < 3; i++) +	matrix->matrix[i][i] = F (1); +} + +typedef pixman_fixed_32_32_t pixman_fixed_34_30_t; + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_point_3d (const struct pixman_transform *transform, +                           struct pixman_vector *         vector) +{ +    pixman_vector_48_16_t tmp; +    tmp.v[0] = vector->vector[0]; +    tmp.v[1] = vector->vector[1]; +    tmp.v[2] = vector->vector[2]; + +    pixman_transform_point_31_16_3d (transform, &tmp, &tmp); + +    vector->vector[0] = tmp.v[0]; +    vector->vector[1] = tmp.v[1]; +    vector->vector[2] = tmp.v[2]; + +    return vector->vector[0] == tmp.v[0] && +           vector->vector[1] == tmp.v[1] && +           vector->vector[2] == tmp.v[2]; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_point (const struct pixman_transform *transform, +                        struct pixman_vector *         vector) +{ +    pixman_vector_48_16_t tmp; +    tmp.v[0] = vector->vector[0]; +    tmp.v[1] = vector->vector[1]; +    tmp.v[2] = vector->vector[2]; + +    if (!pixman_transform_point_31_16 (transform, &tmp, &tmp)) +        return FALSE; + +    vector->vector[0] = tmp.v[0]; +    vector->vector[1] = tmp.v[1]; +    vector->vector[2] = tmp.v[2]; + +    return vector->vector[0] == tmp.v[0] && +           vector->vector[1] == tmp.v[1] && +           vector->vector[2] == tmp.v[2]; +} + +PIXMAN_EXPORT pixman_bool_t 
+pixman_transform_multiply (struct pixman_transform *      dst, +                           const struct pixman_transform *l, +                           const struct pixman_transform *r) +{ +    struct pixman_transform d; +    int dx, dy; +    int o; + +    for (dy = 0; dy < 3; dy++) +    { +	for (dx = 0; dx < 3; dx++) +	{ +	    pixman_fixed_48_16_t v; +	    pixman_fixed_32_32_t partial; +	     +	    v = 0; +	    for (o = 0; o < 3; o++) +	    { +		partial = +		    (pixman_fixed_32_32_t) l->matrix[dy][o] * +		    (pixman_fixed_32_32_t) r->matrix[o][dx]; + +		v += (partial + 0x8000) >> 16; +	    } + +	    if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16) +		return FALSE; +	     +	    d.matrix[dy][dx] = (pixman_fixed_t) v; +	} +    } + +    *dst = d; +    return TRUE; +} + +PIXMAN_EXPORT void +pixman_transform_init_scale (struct pixman_transform *t, +                             pixman_fixed_t           sx, +                             pixman_fixed_t           sy) +{ +    memset (t, '\0', sizeof (struct pixman_transform)); + +    t->matrix[0][0] = sx; +    t->matrix[1][1] = sy; +    t->matrix[2][2] = F (1); +} + +static pixman_fixed_t +fixed_inverse (pixman_fixed_t x) +{ +    return (pixman_fixed_t) ((((pixman_fixed_48_16_t) F (1)) * F (1)) / x); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_scale (struct pixman_transform *forward, +                        struct pixman_transform *reverse, +                        pixman_fixed_t           sx, +                        pixman_fixed_t           sy) +{ +    struct pixman_transform t; + +    if (sx == 0 || sy == 0) +	return FALSE; + +    if (forward) +    { +	pixman_transform_init_scale (&t, sx, sy); +	if (!pixman_transform_multiply (forward, &t, forward)) +	    return FALSE; +    } +     +    if (reverse) +    { +	pixman_transform_init_scale (&t, fixed_inverse (sx), +	                             fixed_inverse (sy)); +	if (!pixman_transform_multiply (reverse, reverse, &t)) +	    return FALSE; +    } +     +    return TRUE; +} + +PIXMAN_EXPORT void +pixman_transform_init_rotate (struct pixman_transform *t, +                              pixman_fixed_t           c, +                              pixman_fixed_t           s) +{ +    memset (t, '\0', sizeof (struct pixman_transform)); + +    t->matrix[0][0] = c; +    t->matrix[0][1] = -s; +    t->matrix[1][0] = s; +    t->matrix[1][1] = c; +    t->matrix[2][2] = F (1); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_rotate (struct pixman_transform *forward, +                         struct pixman_transform *reverse, +                         pixman_fixed_t           c, +                         pixman_fixed_t           s) +{ +    struct pixman_transform t; + +    if (forward) +    { +	pixman_transform_init_rotate (&t, c, s); +	if (!pixman_transform_multiply (forward, &t, forward)) +	    return FALSE; +    } + +    if (reverse) +    { +	pixman_transform_init_rotate (&t, c, -s); +	if (!pixman_transform_multiply (reverse, reverse, &t)) +	    return FALSE; +    } +     +    return TRUE; +} + +PIXMAN_EXPORT void +pixman_transform_init_translate (struct pixman_transform *t, +                                 pixman_fixed_t           tx, +                                 pixman_fixed_t           ty) +{ +    memset (t, '\0', sizeof (struct pixman_transform)); + +    t->matrix[0][0] = F (1); +    t->matrix[0][2] = tx; +    t->matrix[1][1] = F (1); +    t->matrix[1][2] = ty; +    t->matrix[2][2] = F (1); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_translate (struct 
pixman_transform *forward, +                            struct pixman_transform *reverse, +                            pixman_fixed_t           tx, +                            pixman_fixed_t           ty) +{ +    struct pixman_transform t; + +    if (forward) +    { +	pixman_transform_init_translate (&t, tx, ty); + +	if (!pixman_transform_multiply (forward, &t, forward)) +	    return FALSE; +    } + +    if (reverse) +    { +	pixman_transform_init_translate (&t, -tx, -ty); + +	if (!pixman_transform_multiply (reverse, reverse, &t)) +	    return FALSE; +    } +    return TRUE; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_bounds (const struct pixman_transform *matrix, +                         struct pixman_box16 *          b) + +{ +    struct pixman_vector v[4]; +    int i; +    int x1, y1, x2, y2; + +    v[0].vector[0] = F (b->x1); +    v[0].vector[1] = F (b->y1); +    v[0].vector[2] = F (1); + +    v[1].vector[0] = F (b->x2); +    v[1].vector[1] = F (b->y1); +    v[1].vector[2] = F (1); + +    v[2].vector[0] = F (b->x2); +    v[2].vector[1] = F (b->y2); +    v[2].vector[2] = F (1); + +    v[3].vector[0] = F (b->x1); +    v[3].vector[1] = F (b->y2); +    v[3].vector[2] = F (1); + +    for (i = 0; i < 4; i++) +    { +	if (!pixman_transform_point (matrix, &v[i])) +	    return FALSE; + +	x1 = pixman_fixed_to_int (v[i].vector[0]); +	y1 = pixman_fixed_to_int (v[i].vector[1]); +	x2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[0])); +	y2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[1])); + +	if (i == 0) +	{ +	    b->x1 = x1; +	    b->y1 = y1; +	    b->x2 = x2; +	    b->y2 = y2; +	} +	else +	{ +	    if (x1 < b->x1) b->x1 = x1; +	    if (y1 < b->y1) b->y1 = y1; +	    if (x2 > b->x2) b->x2 = x2; +	    if (y2 > b->y2) b->y2 = y2; +	} +    } + +    return TRUE; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_invert (struct pixman_transform *      dst, +                         const struct pixman_transform *src) +{ +    struct pixman_f_transform m; + +    pixman_f_transform_from_pixman_transform (&m, src); + +    if (!pixman_f_transform_invert (&m, &m)) +	return FALSE; + +    if (!pixman_transform_from_pixman_f_transform (dst, &m)) +	return FALSE; + +    return TRUE; +} + +static pixman_bool_t +within_epsilon (pixman_fixed_t a, +                pixman_fixed_t b, +                pixman_fixed_t epsilon) +{ +    pixman_fixed_t t = a - b; + +    if (t < 0) +	t = -t; + +    return t <= epsilon; +} + +#define EPSILON (pixman_fixed_t) (2) + +#define IS_SAME(a, b) (within_epsilon (a, b, EPSILON)) +#define IS_ZERO(a)    (within_epsilon (a, 0, EPSILON)) +#define IS_ONE(a)     (within_epsilon (a, F (1), EPSILON)) +#define IS_UNIT(a)			    \ +    (within_epsilon (a, F (1), EPSILON) ||  \ +     within_epsilon (a, F (-1), EPSILON) || \ +     IS_ZERO (a)) +#define IS_INT(a)    (IS_ZERO (pixman_fixed_frac (a))) + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_is_identity (const struct pixman_transform *t) +{ +    return (IS_SAME (t->matrix[0][0], t->matrix[1][1]) && +	    IS_SAME (t->matrix[0][0], t->matrix[2][2]) && +	    !IS_ZERO (t->matrix[0][0]) && +	    IS_ZERO (t->matrix[0][1]) && +	    IS_ZERO (t->matrix[0][2]) && +	    IS_ZERO (t->matrix[1][0]) && +	    IS_ZERO (t->matrix[1][2]) && +	    IS_ZERO (t->matrix[2][0]) && +	    IS_ZERO (t->matrix[2][1])); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_is_scale (const struct pixman_transform *t) +{ +    return (!IS_ZERO (t->matrix[0][0]) && +            IS_ZERO (t->matrix[0][1]) && +            IS_ZERO (t->matrix[0][2]) && + +      
      IS_ZERO (t->matrix[1][0]) && +            !IS_ZERO (t->matrix[1][1]) && +            IS_ZERO (t->matrix[1][2]) && + +            IS_ZERO (t->matrix[2][0]) && +            IS_ZERO (t->matrix[2][1]) && +            !IS_ZERO (t->matrix[2][2])); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_is_int_translate (const struct pixman_transform *t) +{ +    return (IS_ONE (t->matrix[0][0]) && +            IS_ZERO (t->matrix[0][1]) && +            IS_INT (t->matrix[0][2]) && + +            IS_ZERO (t->matrix[1][0]) && +            IS_ONE (t->matrix[1][1]) && +            IS_INT (t->matrix[1][2]) && + +            IS_ZERO (t->matrix[2][0]) && +            IS_ZERO (t->matrix[2][1]) && +            IS_ONE (t->matrix[2][2])); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_is_inverse (const struct pixman_transform *a, +                             const struct pixman_transform *b) +{ +    struct pixman_transform t; + +    if (!pixman_transform_multiply (&t, a, b)) +	return FALSE; + +    return pixman_transform_is_identity (&t); +} + +PIXMAN_EXPORT void +pixman_f_transform_from_pixman_transform (struct pixman_f_transform *    ft, +                                          const struct pixman_transform *t) +{ +    int i, j; + +    for (j = 0; j < 3; j++) +    { +	for (i = 0; i < 3; i++) +	    ft->m[j][i] = pixman_fixed_to_double (t->matrix[j][i]); +    } +} + +PIXMAN_EXPORT pixman_bool_t +pixman_transform_from_pixman_f_transform (struct pixman_transform *        t, +                                          const struct pixman_f_transform *ft) +{ +    int i, j; + +    for (j = 0; j < 3; j++) +    { +	for (i = 0; i < 3; i++) +	{ +	    double d = ft->m[j][i]; +	    if (d < -32767.0 || d > 32767.0) +		return FALSE; +	    d = d * 65536.0 + 0.5; +	    t->matrix[j][i] = (pixman_fixed_t) floor (d); +	} +    } +     +    return TRUE; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_f_transform_invert (struct pixman_f_transform *      dst, +                           const struct pixman_f_transform *src) +{ +    static const int a[3] = { 2, 2, 1 }; +    static const int b[3] = { 1, 0, 0 }; +    pixman_f_transform_t d; +    double det; +    int i, j; + +    det = 0; +    for (i = 0; i < 3; i++) +    { +	double p; +	int ai = a[i]; +	int bi = b[i]; +	p = src->m[i][0] * (src->m[ai][2] * src->m[bi][1] - +	                    src->m[ai][1] * src->m[bi][2]); +	if (i == 1) +	    p = -p; +	det += p; +    } +     +    if (det == 0) +	return FALSE; +     +    det = 1 / det; +    for (j = 0; j < 3; j++) +    { +	for (i = 0; i < 3; i++) +	{ +	    double p; +	    int ai = a[i]; +	    int aj = a[j]; +	    int bi = b[i]; +	    int bj = b[j]; + +	    p = (src->m[ai][aj] * src->m[bi][bj] - +	         src->m[ai][bj] * src->m[bi][aj]); +	     +	    if (((i + j) & 1) != 0) +		p = -p; +	     +	    d.m[j][i] = det * p; +	} +    } + +    *dst = d; + +    return TRUE; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_f_transform_point (const struct pixman_f_transform *t, +                          struct pixman_f_vector *         v) +{ +    struct pixman_f_vector result; +    int i, j; +    double a; + +    for (j = 0; j < 3; j++) +    { +	a = 0; +	for (i = 0; i < 3; i++) +	    a += t->m[j][i] * v->v[i]; +	result.v[j] = a; +    } +     +    if (!result.v[2]) +	return FALSE; + +    for (j = 0; j < 2; j++) +	v->v[j] = result.v[j] / result.v[2]; + +    v->v[2] = 1; + +    return TRUE; +} + +PIXMAN_EXPORT void +pixman_f_transform_point_3d (const struct pixman_f_transform *t, +                             struct pixman_f_vector *         v) +{ + 
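pixman_f_transform_invert above builds the inverse from 2x2 cofactors scaled by 1/det (the a[] and b[] tables enumerate, for each index i, the two other rows and columns that form the minor), and pixman_f_transform_point applies the matrix and then divides by the homogeneous coordinate. A hedged usage sketch of those public entry points, assuming pixman.h is reachable on the include path; the function name and the concrete transform values are made up for illustration:

    #include <pixman.h>

    /* Illustrative round trip: scale, translate, invert, then map a point
     * forward and back.  p should end up close to (10, 20) again. */
    static void
    f_transform_roundtrip (void)
    {
        struct pixman_f_transform fwd, inv;
        struct pixman_f_vector p = { { 10.0, 20.0, 1.0 } };

        pixman_f_transform_init_scale (&fwd, 2.0, 4.0);
        pixman_f_transform_translate (&fwd, NULL, 5.0, -3.0);

        if (!pixman_f_transform_invert (&inv, &fwd))
            return;                           /* singular matrix */

        pixman_f_transform_point (&fwd, &p);  /* (10, 20) -> (25, 77) */
        pixman_f_transform_point (&inv, &p);  /* back to ~(10, 20)    */
    }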
   struct pixman_f_vector result; +    int i, j; +    double a; + +    for (j = 0; j < 3; j++) +    { +	a = 0; +	for (i = 0; i < 3; i++) +	    a += t->m[j][i] * v->v[i]; +	result.v[j] = a; +    } +     +    *v = result; +} + +PIXMAN_EXPORT void +pixman_f_transform_multiply (struct pixman_f_transform *      dst, +                             const struct pixman_f_transform *l, +                             const struct pixman_f_transform *r) +{ +    struct pixman_f_transform d; +    int dx, dy; +    int o; + +    for (dy = 0; dy < 3; dy++) +    { +	for (dx = 0; dx < 3; dx++) +	{ +	    double v = 0; +	    for (o = 0; o < 3; o++) +		v += l->m[dy][o] * r->m[o][dx]; +	    d.m[dy][dx] = v; +	} +    } +     +    *dst = d; +} + +PIXMAN_EXPORT void +pixman_f_transform_init_scale (struct pixman_f_transform *t, +                               double                     sx, +                               double                     sy) +{ +    t->m[0][0] = sx; +    t->m[0][1] = 0; +    t->m[0][2] = 0; +    t->m[1][0] = 0; +    t->m[1][1] = sy; +    t->m[1][2] = 0; +    t->m[2][0] = 0; +    t->m[2][1] = 0; +    t->m[2][2] = 1; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_f_transform_scale (struct pixman_f_transform *forward, +                          struct pixman_f_transform *reverse, +                          double                     sx, +                          double                     sy) +{ +    struct pixman_f_transform t; + +    if (sx == 0 || sy == 0) +	return FALSE; + +    if (forward) +    { +	pixman_f_transform_init_scale (&t, sx, sy); +	pixman_f_transform_multiply (forward, &t, forward); +    } +     +    if (reverse) +    { +	pixman_f_transform_init_scale (&t, 1 / sx, 1 / sy); +	pixman_f_transform_multiply (reverse, reverse, &t); +    } +     +    return TRUE; +} + +PIXMAN_EXPORT void +pixman_f_transform_init_rotate (struct pixman_f_transform *t, +                                double                     c, +                                double                     s) +{ +    t->m[0][0] = c; +    t->m[0][1] = -s; +    t->m[0][2] = 0; +    t->m[1][0] = s; +    t->m[1][1] = c; +    t->m[1][2] = 0; +    t->m[2][0] = 0; +    t->m[2][1] = 0; +    t->m[2][2] = 1; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_f_transform_rotate (struct pixman_f_transform *forward, +                           struct pixman_f_transform *reverse, +                           double                     c, +                           double                     s) +{ +    struct pixman_f_transform t; + +    if (forward) +    { +	pixman_f_transform_init_rotate (&t, c, s); +	pixman_f_transform_multiply (forward, &t, forward); +    } +     +    if (reverse) +    { +	pixman_f_transform_init_rotate (&t, c, -s); +	pixman_f_transform_multiply (reverse, reverse, &t); +    } + +    return TRUE; +} + +PIXMAN_EXPORT void +pixman_f_transform_init_translate (struct pixman_f_transform *t, +                                   double                     tx, +                                   double                     ty) +{ +    t->m[0][0] = 1; +    t->m[0][1] = 0; +    t->m[0][2] = tx; +    t->m[1][0] = 0; +    t->m[1][1] = 1; +    t->m[1][2] = ty; +    t->m[2][0] = 0; +    t->m[2][1] = 0; +    t->m[2][2] = 1; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_f_transform_translate (struct pixman_f_transform *forward, +                              struct pixman_f_transform *reverse, +                              double                     tx, +                              double                     ty) +{ +    struct pixman_f_transform 
t; + +    if (forward) +    { +	pixman_f_transform_init_translate (&t, tx, ty); +	pixman_f_transform_multiply (forward, &t, forward); +    } + +    if (reverse) +    { +	pixman_f_transform_init_translate (&t, -tx, -ty); +	pixman_f_transform_multiply (reverse, reverse, &t); +    } + +    return TRUE; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_f_transform_bounds (const struct pixman_f_transform *t, +                           struct pixman_box16 *            b) +{ +    struct pixman_f_vector v[4]; +    int i; +    int x1, y1, x2, y2; + +    v[0].v[0] = b->x1; +    v[0].v[1] = b->y1; +    v[0].v[2] = 1; +    v[1].v[0] = b->x2; +    v[1].v[1] = b->y1; +    v[1].v[2] = 1; +    v[2].v[0] = b->x2; +    v[2].v[1] = b->y2; +    v[2].v[2] = 1; +    v[3].v[0] = b->x1; +    v[3].v[1] = b->y2; +    v[3].v[2] = 1; + +    for (i = 0; i < 4; i++) +    { +	if (!pixman_f_transform_point (t, &v[i])) +	    return FALSE; + +	x1 = floor (v[i].v[0]); +	y1 = floor (v[i].v[1]); +	x2 = ceil (v[i].v[0]); +	y2 = ceil (v[i].v[1]); + +	if (i == 0) +	{ +	    b->x1 = x1; +	    b->y1 = y1; +	    b->x2 = x2; +	    b->y2 = y2; +	} +	else +	{ +	    if (x1 < b->x1) b->x1 = x1; +	    if (y1 < b->y1) b->y1 = y1; +	    if (x2 > b->x2) b->x2 = x2; +	    if (y2 > b->y2) b->y2 = y2; +	} +    } + +    return TRUE; +} + +PIXMAN_EXPORT void +pixman_f_transform_init_identity (struct pixman_f_transform *t) +{ +    int i, j; + +    for (j = 0; j < 3; j++) +    { +	for (i = 0; i < 3; i++) +	    t->m[j][i] = i == j ? 1 : 0; +    } +} diff --git a/libs/pixman-0.40.0/pixman/pixman-mips-dspr2-asm.S b/libs/pixman-0.40.0/pixman/pixman-mips-dspr2-asm.S new file mode 100644 index 0000000..9dad163 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-mips-dspr2-asm.S @@ -0,0 +1,4283 @@ +/* + * Copyright (c) 2012 + *      MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + *    contributors may be used to endorse or promote products derived from + *    this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Author:  Nemanja Lukic (nemanja.lukic@rt-rk.com) + */ + +#include "pixman-private.h" +#include "pixman-mips-dspr2-asm.h" + +LEAF_MIPS_DSPR2(pixman_fill_buff16_mips) +/* + * a0 - *dest + * a1 - count (bytes) + * a2 - value to fill buffer with + */ + +    beqz     a1, 3f +     andi    t1, a0, 0x0002 +    beqz     t1, 0f          /* check if address is 4-byte aligned */ +     nop +    sh       a2, 0(a0) +    addiu    a0, a0, 2 +    addiu    a1, a1, -2 +0: +    srl      t1, a1, 5       /* t1 how many multiples of 32 bytes */ +    replv.ph a2, a2          /* replicate fill value (16bit) in a2 */ +    beqz     t1, 2f +     nop +1: +    addiu    t1, t1, -1 +    beqz     t1, 11f +     addiu   a1, a1, -32 +    pref     30, 32(a0) +    sw       a2, 0(a0) +    sw       a2, 4(a0) +    sw       a2, 8(a0) +    sw       a2, 12(a0) +    sw       a2, 16(a0) +    sw       a2, 20(a0) +    sw       a2, 24(a0) +    sw       a2, 28(a0) +    b        1b +     addiu   a0, a0, 32 +11: +    sw       a2, 0(a0) +    sw       a2, 4(a0) +    sw       a2, 8(a0) +    sw       a2, 12(a0) +    sw       a2, 16(a0) +    sw       a2, 20(a0) +    sw       a2, 24(a0) +    sw       a2, 28(a0) +    addiu    a0, a0, 32 +2: +    blez     a1, 3f +     addiu   a1, a1, -2 +    sh       a2, 0(a0) +    b        2b +     addiu   a0, a0, 2 +3: +    jr       ra +     nop + +END(pixman_fill_buff16_mips) + +LEAF_MIPS32R2(pixman_fill_buff32_mips) +/* + * a0 - *dest + * a1 - count (bytes) + * a2 - value to fill buffer with + */ + +    beqz     a1, 3f +     nop +    srl      t1, a1, 5 /* t1 how many multiples of 32 bytes */ +    beqz     t1, 2f +     nop +1: +    addiu    t1, t1, -1 +    beqz     t1, 11f +     addiu   a1, a1, -32 +    pref     30, 32(a0) +    sw       a2, 0(a0) +    sw       a2, 4(a0) +    sw       a2, 8(a0) +    sw       a2, 12(a0) +    sw       a2, 16(a0) +    sw       a2, 20(a0) +    sw       a2, 24(a0) +    sw       a2, 28(a0) +    b        1b +     addiu   a0, a0, 32 +11: +    sw       a2, 0(a0) +    sw       a2, 4(a0) +    sw       a2, 8(a0) +    sw       a2, 12(a0) +    sw       a2, 16(a0) +    sw       a2, 20(a0) +    sw       a2, 24(a0) +    sw       a2, 28(a0) +    addiu    a0, a0, 32 +2: +    blez     a1, 3f +     addiu   a1, a1, -4 +    sw       a2, 0(a0) +    b        2b +     addiu   a0, a0, 4 +3: +    jr       ra +     nop + +END(pixman_fill_buff32_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_8888_0565_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (a8r8g8b8) + * a2 - w + */ + +    beqz     a2, 3f +     nop +    addiu    t1, a2, -1 +    beqz     t1, 2f +     nop +    li       t4, 0xf800f800 +    li       t5, 0x07e007e0 +    li       t6, 0x001f001f +1: +    lw       t0, 0(a1) +    lw       t1, 4(a1) +    addiu    a1, a1, 8 +    addiu    a2, a2, -2 + +    CONVERT_2x8888_TO_2x0565 t0, t1, t2, t3, t4, t5, t6, t7, t8 + +    sh       t2, 0(a0) +    sh       t3, 2(a0) + +    addiu    t2, a2, -1 +    bgtz     t2, 1b +     addiu   a0, a0, 4 +2: +    beqz     a2, 3f +     nop +    lw       t0, 0(a1) + +    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 + +    sh       t1, 0(a0) +3: +    j        ra +     nop + +END(pixman_composite_src_8888_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_0565_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (r5g6b5) + * a2 - w + */ + +    beqz     a2, 3f +     nop +    addiu    t1, a2, -1 +    beqz     t1, 2f +     nop +    li       t4, 0x07e007e0 +    li       t5, 0x001F001F +1: +    lhu      t0, 0(a1) +    lhu      t1, 2(a1) +    addiu    a1, a1, 4 +    addiu    a2, a2, 
-2 + +    CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9 + +    sw       t2, 0(a0) +    sw       t3, 4(a0) + +    addiu    t2, a2, -1 +    bgtz     t2, 1b +     addiu   a0, a0, 8 +2: +    beqz     a2, 3f +     nop +    lhu      t0, 0(a1) + +    CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3 + +    sw       t1, 0(a0) +3: +    j        ra +     nop + +END(pixman_composite_src_0565_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (x8r8g8b8) + * a2 - w + */ + +    beqz     a2, 4f +     nop +    li       t9, 0xff000000 +    srl      t8, a2, 3    /* t1 = how many multiples of 8 src pixels */ +    beqz     t8, 3f       /* branch if less than 8 src pixels */ +     nop +1: +    addiu    t8, t8, -1 +    beqz     t8, 2f +     addiu   a2, a2, -8 +    pref     0, 32(a1) +    lw       t0, 0(a1) +    lw       t1, 4(a1) +    lw       t2, 8(a1) +    lw       t3, 12(a1) +    lw       t4, 16(a1) +    lw       t5, 20(a1) +    lw       t6, 24(a1) +    lw       t7, 28(a1) +    addiu    a1, a1, 32 +    or       t0, t0, t9 +    or       t1, t1, t9 +    or       t2, t2, t9 +    or       t3, t3, t9 +    or       t4, t4, t9 +    or       t5, t5, t9 +    or       t6, t6, t9 +    or       t7, t7, t9 +    pref     30, 32(a0) +    sw       t0, 0(a0) +    sw       t1, 4(a0) +    sw       t2, 8(a0) +    sw       t3, 12(a0) +    sw       t4, 16(a0) +    sw       t5, 20(a0) +    sw       t6, 24(a0) +    sw       t7, 28(a0) +    b        1b +     addiu   a0, a0, 32 +2: +    lw       t0, 0(a1) +    lw       t1, 4(a1) +    lw       t2, 8(a1) +    lw       t3, 12(a1) +    lw       t4, 16(a1) +    lw       t5, 20(a1) +    lw       t6, 24(a1) +    lw       t7, 28(a1) +    addiu    a1, a1, 32 +    or       t0, t0, t9 +    or       t1, t1, t9 +    or       t2, t2, t9 +    or       t3, t3, t9 +    or       t4, t4, t9 +    or       t5, t5, t9 +    or       t6, t6, t9 +    or       t7, t7, t9 +    sw       t0, 0(a0) +    sw       t1, 4(a0) +    sw       t2, 8(a0) +    sw       t3, 12(a0) +    sw       t4, 16(a0) +    sw       t5, 20(a0) +    sw       t6, 24(a0) +    sw       t7, 28(a0) +    beqz     a2, 4f +     addiu   a0, a0, 32 +3: +    lw       t0, 0(a1) +    addiu    a1, a1, 4 +    addiu    a2, a2, -1 +    or       t1, t0, t9 +    sw       t1, 0(a0) +    bnez     a2, 3b +     addiu   a0, a0, 4 +4: +    jr       ra +     nop + +END(pixman_composite_src_x888_8888_asm_mips) + +#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) +LEAF_MIPS_DSPR2(pixman_composite_src_0888_8888_rev_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (b8g8r8) + * a2 - w + */ + +    beqz              a2, 6f +     nop + +    lui               t8, 0xff00; +    srl               t9, a2, 2   /* t9 = how many multiples of 4 src pixels */ +    beqz              t9, 4f      /* branch if less than 4 src pixels */ +     nop + +    li                t0, 0x1 +    li                t1, 0x2 +    li                t2, 0x3 +    andi              t3, a1, 0x3 +    beq               t3, t0, 1f +     nop +    beq               t3, t1, 2f +     nop +    beq               t3, t2, 3f +     nop + +0: +    beqz              t9, 4f +     addiu            t9, t9, -1 +    lw                t0, 0(a1)            /* t0 = R2 | B1 | G1 | R1 */ +    lw                t1, 4(a1)            /* t1 = G3 | R3 | B2 | G2 */ +    lw                t2, 8(a1)            /* t2 = B4 | G4 | R4 | B3 */ + +    addiu             a1, a1, 12 +    addiu             a2, a2, -4 + +    wsbh              
t0, t0               /* t0 = B1 | R2 | R1 | G1 */ +    wsbh              t1, t1               /* t1 = R3 | G3 | G2 | B2 */ +    wsbh              t2, t2               /* t2 = G4 | B4 | B3 | R4 */ + +    packrl.ph         t3, t1, t0           /* t3 = G2 | B2 | B1 | R2 */ +    packrl.ph         t4, t0, t0           /* t4 = R1 | G1 | B1 | R2 */ +    rotr              t3, t3, 16           /* t3 = B1 | R2 | G2 | B2 */ +    or                t3, t3, t8           /* t3 = FF | R2 | G2 | B2 */ +    srl               t4, t4, 8            /* t4 =  0 | R1 | G1 | B1 */ +    or                t4, t4, t8           /* t4 = FF | R1 | G1 | B1 */ +    packrl.ph         t5, t2, t1           /* t5 = B3 | R4 | R3 | G3 */ +    rotr              t5, t5, 24           /* t5 = R4 | R3 | G3 | B3 */ +    or                t5, t5, t8           /* t5 = FF | R3 | G3 | B3 */ +    rotr              t2, t2, 16           /* t2 = B3 | R4 | G4 | B4 */ +    or                t2, t2, t8           /* t5 = FF | R3 | G3 | B3 */ + +    sw                t4, 0(a0) +    sw                t3, 4(a0) +    sw                t5, 8(a0) +    sw                t2, 12(a0) +    b                 0b +     addiu            a0, a0, 16 + +1: +    lbu               t6, 0(a1)            /* t6 =  0 |  0 |  0 | R1 */ +    lhu               t7, 1(a1)            /* t7 =  0 |  0 | B1 | G1 */ +    sll               t6, t6, 16           /* t6 =  0 | R1 |  0 | 0  */ +    wsbh              t7, t7               /* t7 =  0 |  0 | G1 | B1 */ +    or                t7, t6, t7           /* t7 =  0 | R1 | G1 | B1 */ +11: +    beqz              t9, 4f +     addiu            t9, t9, -1 +    lw                t0, 3(a1)            /* t0 = R3 | B2 | G2 | R2 */ +    lw                t1, 7(a1)            /* t1 = G4 | R4 | B3 | G3 */ +    lw                t2, 11(a1)           /* t2 = B5 | G5 | R5 | B4 */ + +    addiu             a1, a1, 12 +    addiu             a2, a2, -4 + +    wsbh              t0, t0               /* t0 = B2 | R3 | R2 | G2 */ +    wsbh              t1, t1               /* t1 = R4 | G4 | G3 | B3 */ +    wsbh              t2, t2               /* t2 = G5 | B5 | B4 | R5 */ + +    packrl.ph         t3, t1, t0           /* t3 = G3 | B3 | B2 | R3 */ +    packrl.ph         t4, t2, t1           /* t4 = B4 | R5 | R4 | G4 */ +    rotr              t0, t0, 24           /* t0 = R3 | R2 | G2 | B2 */ +    rotr              t3, t3, 16           /* t3 = B2 | R3 | G3 | B3 */ +    rotr              t4, t4, 24           /* t4 = R5 | R4 | G4 | B4 */ +    or                t7, t7, t8           /* t7 = FF | R1 | G1 | B1 */ +    or                t0, t0, t8           /* t0 = FF | R2 | G2 | B2 */ +    or                t3, t3, t8           /* t1 = FF | R3 | G3 | B3 */ +    or                t4, t4, t8           /* t3 = FF | R4 | G4 | B4 */ + +    sw                t7, 0(a0) +    sw                t0, 4(a0) +    sw                t3, 8(a0) +    sw                t4, 12(a0) +    rotr              t7, t2, 16           /* t7 = xx | R5 | G5 | B5 */ +    b                 11b +     addiu            a0, a0, 16 + +2: +    lhu               t7, 0(a1)            /* t7 =  0 |  0 | G1 | R1 */ +    wsbh              t7, t7               /* t7 =  0 |  0 | R1 | G1 */ +21: +    beqz              t9, 4f +     addiu            t9, t9, -1 +    lw                t0, 2(a1)            /* t0 = B2 | G2 | R2 | B1 */ +    lw                t1, 6(a1)            /* t1 = R4 | B3 | G3 | R3 */ +    lw                t2, 10(a1)           /* t2 = G5 | R5 | B4 | G4 */ + +    addiu             a1, a1, 12 +    
addiu             a2, a2, -4 + +    wsbh              t0, t0               /* t0 = G2 | B2 | B1 | R2 */ +    wsbh              t1, t1               /* t1 = B3 | R4 | R3 | G3 */ +    wsbh              t2, t2               /* t2 = R5 | G5 | G4 | B4 */ + +    precr_sra.ph.w    t7, t0, 0            /* t7 = R1 | G1 | B1 | R2 */ +    rotr              t0, t0, 16           /* t0 = B1 | R2 | G2 | B2 */ +    packrl.ph         t3, t2, t1           /* t3 = G4 | B4 | B3 | R4 */ +    rotr              t1, t1, 24           /* t1 = R4 | R3 | G3 | B3 */ +    srl               t7, t7, 8            /* t7 =  0 | R1 | G1 | B1 */ +    rotr              t3, t3, 16           /* t3 = B3 | R4 | G4 | B4 */ +    or                t7, t7, t8           /* t7 = FF | R1 | G1 | B1 */ +    or                t0, t0, t8           /* t0 = FF | R2 | G2 | B2 */ +    or                t1, t1, t8           /* t1 = FF | R3 | G3 | B3 */ +    or                t3, t3, t8           /* t3 = FF | R4 | G4 | B4 */ + +    sw                t7, 0(a0) +    sw                t0, 4(a0) +    sw                t1, 8(a0) +    sw                t3, 12(a0) +    srl               t7, t2, 16           /* t7 =  0 |  0 | R5 | G5 */ +    b                 21b +     addiu            a0, a0, 16 + +3: +    lbu               t7, 0(a1)            /* t7 =  0 |  0 |  0 | R1 */ +31: +    beqz              t9, 4f +     addiu            t9, t9, -1 +    lw                t0, 1(a1)            /* t0 = G2 | R2 | B1 | G1 */ +    lw                t1, 5(a1)            /* t1 = B3 | G3 | R3 | B2 */ +    lw                t2, 9(a1)            /* t2 = R5 | B4 | G4 | R4 */ + +    addiu             a1, a1, 12 +    addiu             a2, a2, -4 + +    wsbh              t0, t0               /* t0 = R2 | G2 | G1 | B1 */ +    wsbh              t1, t1               /* t1 = G3 | B3 | B2 | R3 */ +    wsbh              t2, t2               /* t2 = B4 | R5 | R4 | G4 */ + +    precr_sra.ph.w    t7, t0, 0            /* t7 = xx | R1 | G1 | B1 */ +    packrl.ph         t3, t1, t0           /* t3 = B2 | R3 | R2 | G2 */ +    rotr              t1, t1, 16           /* t1 = B2 | R3 | G3 | B3 */ +    rotr              t4, t2, 24           /* t4 = R5 | R4 | G4 | B4 */ +    rotr              t3, t3, 24           /* t3 = R3 | R2 | G2 | B2 */ +    or                t7, t7, t8           /* t7 = FF | R1 | G1 | B1 */ +    or                t3, t3, t8           /* t3 = FF | R2 | G2 | B2 */ +    or                t1, t1, t8           /* t1 = FF | R3 | G3 | B3 */ +    or                t4, t4, t8           /* t4 = FF | R4 | G4 | B4 */ + +    sw                t7, 0(a0) +    sw                t3, 4(a0) +    sw                t1, 8(a0) +    sw                t4, 12(a0) +    srl               t7, t2, 16           /* t7 =  0 |  0 | xx | R5 */ +    b                 31b +     addiu            a0, a0, 16 + +4: +    beqz              a2, 6f +     nop +5: +    lbu               t0, 0(a1)            /* t0 =  0 | 0 | 0 | R */ +    lbu               t1, 1(a1)            /* t1 =  0 | 0 | 0 | G */ +    lbu               t2, 2(a1)            /* t2 =  0 | 0 | 0 | B */ +    addiu             a1, a1, 3 + +    sll               t0, t0, 16           /* t2 =  0 | R | 0 | 0 */ +    sll               t1, t1, 8            /* t1 =  0 | 0 | G | 0 */ + +    or                t2, t2, t1           /* t2 =  0 | 0 | G | B */ +    or                t2, t2, t0           /* t2 =  0 | R | G | B */ +    or                t2, t2, t8           /* t2 = FF | R | G | B */ + +    sw                t2, 0(a0) +    addiu             a2, a2, -1 + 
   bnez              a2, 5b +     addiu            a0, a0, 4 +6: +    j                 ra +     nop + +END(pixman_composite_src_0888_8888_rev_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_0888_0565_rev_asm_mips) +/* + * a0 - dst (r5g6b5) + * a1 - src (b8g8r8) + * a2 - w + */ + +    SAVE_REGS_ON_STACK 0, v0, v1 +    beqz              a2, 6f +     nop + +    li                t6, 0xf800f800 +    li                t7, 0x07e007e0 +    li                t8, 0x001F001F +    srl               t9, a2, 2   /* t9 = how many multiples of 4 src pixels */ +    beqz              t9, 4f      /* branch if less than 4 src pixels */ +     nop + +    li                t0, 0x1 +    li                t1, 0x2 +    li                t2, 0x3 +    andi              t3, a1, 0x3 +    beq               t3, t0, 1f +     nop +    beq               t3, t1, 2f +     nop +    beq               t3, t2, 3f +     nop + +0: +    beqz              t9, 4f +     addiu            t9, t9, -1 +    lw                t0, 0(a1)            /* t0 = R2 | B1 | G1 | R1 */ +    lw                t1, 4(a1)            /* t1 = G3 | R3 | B2 | G2 */ +    lw                t2, 8(a1)            /* t2 = B4 | G4 | R4 | B3 */ + +    addiu             a1, a1, 12 +    addiu             a2, a2, -4 + +    wsbh              t0, t0               /* t0 = B1 | R2 | R1 | G1 */ +    wsbh              t1, t1               /* t1 = R3 | G3 | G2 | B2 */ +    wsbh              t2, t2               /* t2 = G4 | B4 | B3 | R4 */ + +    packrl.ph         t3, t1, t0           /* t3 = G2 | B2 | B1 | R2 */ +    packrl.ph         t4, t0, t0           /* t4 = R1 | G1 | B1 | R2 */ +    rotr              t3, t3, 16           /* t3 = B1 | R2 | G2 | B2 */ +    srl               t4, t4, 8            /* t4 =  0 | R1 | G1 | B1 */ +    packrl.ph         t5, t2, t1           /* t5 = B3 | R4 | R3 | G3 */ +    rotr              t5, t5, 24           /* t5 = R4 | R3 | G3 | B3 */ +    rotr              t2, t2, 16           /* t2 = B3 | R4 | G4 | B4 */ + +    CONVERT_2x8888_TO_2x0565 t4, t3, t4, t3, t6, t7, t8, v0, v1 +    CONVERT_2x8888_TO_2x0565 t5, t2, t5, t2, t6, t7, t8, v0, v1 + +    sh                t4, 0(a0) +    sh                t3, 2(a0) +    sh                t5, 4(a0) +    sh                t2, 6(a0) +    b                 0b +     addiu            a0, a0, 8 + +1: +    lbu               t4, 0(a1)            /* t4 =  0 |  0 |  0 | R1 */ +    lhu               t5, 1(a1)            /* t5 =  0 |  0 | B1 | G1 */ +    sll               t4, t4, 16           /* t4 =  0 | R1 |  0 | 0  */ +    wsbh              t5, t5               /* t5 =  0 |  0 | G1 | B1 */ +    or                t5, t4, t5           /* t5 =  0 | R1 | G1 | B1 */ +11: +    beqz              t9, 4f +     addiu            t9, t9, -1 +    lw                t0, 3(a1)            /* t0 = R3 | B2 | G2 | R2 */ +    lw                t1, 7(a1)            /* t1 = G4 | R4 | B3 | G3 */ +    lw                t2, 11(a1)           /* t2 = B5 | G5 | R5 | B4 */ + +    addiu             a1, a1, 12 +    addiu             a2, a2, -4 + +    wsbh              t0, t0               /* t0 = B2 | R3 | R2 | G2 */ +    wsbh              t1, t1               /* t1 = R4 | G4 | G3 | B3 */ +    wsbh              t2, t2               /* t2 = G5 | B5 | B4 | R5 */ + +    packrl.ph         t3, t1, t0           /* t3 = G3 | B3 | B2 | R3 */ +    packrl.ph         t4, t2, t1           /* t4 = B4 | R5 | R4 | G4 */ +    rotr              t0, t0, 24           /* t0 = R3 | R2 | G2 | B2 */ +    rotr              t3, t3, 16           /* t3 = B2 | R3 | G3 
| B3 */ +    rotr              t4, t4, 24           /* t4 = R5 | R4 | G4 | B4 */ + +    CONVERT_2x8888_TO_2x0565 t5, t0, t5, t0, t6, t7, t8, v0, v1 +    CONVERT_2x8888_TO_2x0565 t3, t4, t3, t4, t6, t7, t8, v0, v1 + +    sh                t5, 0(a0) +    sh                t0, 2(a0) +    sh                t3, 4(a0) +    sh                t4, 6(a0) +    rotr              t5, t2, 16           /* t5 = xx | R5 | G5 | B5 */ +    b                 11b +     addiu            a0, a0, 8 + +2: +    lhu               t5, 0(a1)            /* t5 =  0 |  0 | G1 | R1 */ +    wsbh              t5, t5               /* t5 =  0 |  0 | R1 | G1 */ +21: +    beqz              t9, 4f +     addiu            t9, t9, -1 +    lw                t0, 2(a1)            /* t0 = B2 | G2 | R2 | B1 */ +    lw                t1, 6(a1)            /* t1 = R4 | B3 | G3 | R3 */ +    lw                t2, 10(a1)           /* t2 = G5 | R5 | B4 | G4 */ + +    addiu             a1, a1, 12 +    addiu             a2, a2, -4 + +    wsbh              t0, t0               /* t0 = G2 | B2 | B1 | R2 */ +    wsbh              t1, t1               /* t1 = B3 | R4 | R3 | G3 */ +    wsbh              t2, t2               /* t2 = R5 | G5 | G4 | B4 */ + +    precr_sra.ph.w    t5, t0, 0            /* t5 = R1 | G1 | B1 | R2 */ +    rotr              t0, t0, 16           /* t0 = B1 | R2 | G2 | B2 */ +    packrl.ph         t3, t2, t1           /* t3 = G4 | B4 | B3 | R4 */ +    rotr              t1, t1, 24           /* t1 = R4 | R3 | G3 | B3 */ +    srl               t5, t5, 8            /* t5 =  0 | R1 | G1 | B1 */ +    rotr              t3, t3, 16           /* t3 = B3 | R4 | G4 | B4 */ + +    CONVERT_2x8888_TO_2x0565 t5, t0, t5, t0, t6, t7, t8, v0, v1 +    CONVERT_2x8888_TO_2x0565 t1, t3, t1, t3, t6, t7, t8, v0, v1 + +    sh                t5, 0(a0) +    sh                t0, 2(a0) +    sh                t1, 4(a0) +    sh                t3, 6(a0) +    srl               t5, t2, 16           /* t5 =  0 |  0 | R5 | G5 */ +    b                 21b +     addiu            a0, a0, 8 + +3: +    lbu               t5, 0(a1)            /* t5 =  0 |  0 |  0 | R1 */ +31: +    beqz              t9, 4f +     addiu            t9, t9, -1 +    lw                t0, 1(a1)            /* t0 = G2 | R2 | B1 | G1 */ +    lw                t1, 5(a1)            /* t1 = B3 | G3 | R3 | B2 */ +    lw                t2, 9(a1)            /* t2 = R5 | B4 | G4 | R4 */ + +    addiu             a1, a1, 12 +    addiu             a2, a2, -4 + +    wsbh              t0, t0               /* t0 = R2 | G2 | G1 | B1 */ +    wsbh              t1, t1               /* t1 = G3 | B3 | B2 | R3 */ +    wsbh              t2, t2               /* t2 = B4 | R5 | R4 | G4 */ + +    precr_sra.ph.w    t5, t0, 0            /* t5 = xx | R1 | G1 | B1 */ +    packrl.ph         t3, t1, t0           /* t3 = B2 | R3 | R2 | G2 */ +    rotr              t1, t1, 16           /* t1 = B2 | R3 | G3 | B3 */ +    rotr              t4, t2, 24           /* t4 = R5 | R4 | G4 | B4 */ +    rotr              t3, t3, 24           /* t3 = R3 | R2 | G2 | B2 */ + +    CONVERT_2x8888_TO_2x0565 t5, t3, t5, t3, t6, t7, t8, v0, v1 +    CONVERT_2x8888_TO_2x0565 t1, t4, t1, t4, t6, t7, t8, v0, v1 + +    sh                t5, 0(a0) +    sh                t3, 2(a0) +    sh                t1, 4(a0) +    sh                t4, 6(a0) +    srl               t5, t2, 16           /* t5 =  0 |  0 | xx | R5 */ +    b                 31b +     addiu            a0, a0, 8 + +4: +    beqz              a2, 6f +     nop +5: +    lbu               t0, 
0(a1)            /* t0 =  0 | 0 | 0 | R */ +    lbu               t1, 1(a1)            /* t1 =  0 | 0 | 0 | G */ +    lbu               t2, 2(a1)            /* t2 =  0 | 0 | 0 | B */ +    addiu             a1, a1, 3 + +    sll               t0, t0, 16           /* t2 =  0 | R | 0 | 0 */ +    sll               t1, t1, 8            /* t1 =  0 | 0 | G | 0 */ + +    or                t2, t2, t1           /* t2 =  0 | 0 | G | B */ +    or                t2, t2, t0           /* t2 =  0 | R | G | B */ + +    CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5 + +    sh                t3, 0(a0) +    addiu             a2, a2, -1 +    bnez              a2, 5b +     addiu            a0, a0, 2 +6: +    RESTORE_REGS_FROM_STACK 0, v0, v1 +    j                 ra +     nop + +END(pixman_composite_src_0888_0565_rev_asm_mips) +#endif + +LEAF_MIPS_DSPR2(pixman_composite_src_pixbuf_8888_asm_mips) +/* + * a0 - dst  (a8b8g8r8) + * a1 - src  (a8r8g8b8) + * a2 - w + */ + +    SAVE_REGS_ON_STACK 0, v0 +    li       v0, 0x00ff00ff + +    beqz     a2, 3f +     nop +    addiu    t1, a2, -1 +    beqz     t1, 2f +     nop +1: +    lw       t0, 0(a1) +    lw       t1, 4(a1) +    addiu    a1, a1, 8 +    addiu    a2, a2, -2 +    srl      t2, t0, 24 +    srl      t3, t1, 24 + +    MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t2, t3, t0, t1, v0, t4, t5, t6, t7, t8, t9 + +    sll      t0, t0, 8 +    sll      t1, t1, 8 +    andi     t2, t2, 0xff +    andi     t3, t3, 0xff +    or       t0, t0, t2 +    or       t1, t1, t3 +    wsbh     t0, t0 +    wsbh     t1, t1 +    rotr     t0, t0, 16 +    rotr     t1, t1, 16 +    sw       t0, 0(a0) +    sw       t1, 4(a0) + +    addiu    t2, a2, -1 +    bgtz     t2, 1b +     addiu   a0, a0, 8 +2: +    beqz     a2, 3f +     nop +    lw       t0, 0(a1) +    srl      t1, t0, 24 + +    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t3, t4, t5 + +    sll      t0, t0, 8 +    andi     t1, t1, 0xff +    or       t0, t0, t1 +    wsbh     t0, t0 +    rotr     t0, t0, 16 +    sw       t0, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, v0 +    j        ra +     nop + +END(pixman_composite_src_pixbuf_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_rpixbuf_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (a8r8g8b8) + * a2 - w + */ + +    SAVE_REGS_ON_STACK 0, v0 +    li       v0, 0x00ff00ff + +    beqz     a2, 3f +     nop +    addiu    t1, a2, -1 +    beqz     t1, 2f +     nop +1: +    lw       t0, 0(a1) +    lw       t1, 4(a1) +    addiu    a1, a1, 8 +    addiu    a2, a2, -2 +    srl      t2, t0, 24 +    srl      t3, t1, 24 + +    MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t2, t3, t0, t1, v0, t4, t5, t6, t7, t8, t9 + +    sll      t0, t0, 8 +    sll      t1, t1, 8 +    andi     t2, t2, 0xff +    andi     t3, t3, 0xff +    or       t0, t0, t2 +    or       t1, t1, t3 +    rotr     t0, t0, 8 +    rotr     t1, t1, 8 +    sw       t0, 0(a0) +    sw       t1, 4(a0) + +    addiu    t2, a2, -1 +    bgtz     t2, 1b +     addiu   a0, a0, 8 +2: +    beqz     a2, 3f +     nop +    lw       t0, 0(a1) +    srl      t1, t0, 24 + +    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t3, t4, t5 + +    sll      t0, t0, 8 +    andi     t1, t1, 0xff +    or       t0, t0, t1 +    rotr     t0, t0, 8 +    sw       t0, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, v0 +    j        ra +     nop + +END(pixman_composite_src_rpixbuf_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (32bit constant) + * a2 - mask (a8) + * a3 - w + */ + + +    SAVE_REGS_ON_STACK 0, v0 +    li       v0, 0x00ff00ff + +    beqz     a3, 
3f +     nop +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop + +1: +                       /* a1 = source      (32bit constant) */ +    lbu      t0, 0(a2) /* t2 = mask        (a8) */ +    lbu      t1, 1(a2) /* t3 = mask        (a8) */ +    addiu    a2, a2, 2 + +    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, t2, t3, v0, t4, t5, t6, t7, t8, t9 + +    sw       t2, 0(a0) +    sw       t3, 4(a0) +    addiu    a3, a3, -2 +    addiu    t2, a3, -1 +    bgtz     t2, 1b +     addiu   a0, a0, 8 + +    beqz     a3, 3f +     nop + +2: +    lbu      t0, 0(a2) +    addiu    a2, a2, 1 + +    MIPS_UN8x4_MUL_UN8 a1, t0, t1, v0, t3, t4, t5 + +    sw       t1, 0(a0) +    addiu    a3, a3, -1 +    addiu    a0, a0, 4 + +3: +    RESTORE_REGS_FROM_STACK 0, v0 +    j        ra +     nop + +END(pixman_composite_src_n_8_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8_asm_mips) +/* + * a0 - dst  (a8) + * a1 - src  (32bit constant) + * a2 - mask (a8) + * a3 - w + */ + +    li                t9, 0x00ff00ff +    beqz              a3, 3f +     nop +    srl               t7, a3, 2   /* t7 = how many multiples of 4 dst pixels */ +    beqz              t7, 1f      /* branch if less than 4 src pixels */ +     nop + +    srl               t8, a1, 24 +    replv.ph          t8, t8 + +0: +    beqz              t7, 1f +     addiu            t7, t7, -1 +    lbu               t0, 0(a2) +    lbu               t1, 1(a2) +    lbu               t2, 2(a2) +    lbu               t3, 3(a2) + +    addiu             a2, a2, 4 + +    precr_sra.ph.w    t1, t0, 0 +    precr_sra.ph.w    t3, t2, 0 +    precr.qb.ph       t0, t3, t1 + +    muleu_s.ph.qbl    t2, t0, t8 +    muleu_s.ph.qbr    t3, t0, t8 +    shra_r.ph         t4, t2, 8 +    shra_r.ph         t5, t3, 8 +    and               t4, t4, t9 +    and               t5, t5, t9 +    addq.ph           t2, t2, t4 +    addq.ph           t3, t3, t5 +    shra_r.ph         t2, t2, 8 +    shra_r.ph         t3, t3, 8 +    precr.qb.ph       t2, t2, t3 + +    sb                t2, 0(a0) +    srl               t2, t2, 8 +    sb                t2, 1(a0) +    srl               t2, t2, 8 +    sb                t2, 2(a0) +    srl               t2, t2, 8 +    sb                t2, 3(a0) +    addiu             a3, a3, -4 +    b                 0b +     addiu            a0, a0, 4 + +1: +    beqz              a3, 3f +     nop +    srl               t8, a1, 24 +2: +    lbu               t0, 0(a2) +    addiu             a2, a2, 1 + +    mul               t2, t0, t8 +    shra_r.ph         t3, t2, 8 +    andi              t3, t3, 0x00ff +    addq.ph           t2, t2, t3 +    shra_r.ph         t2, t2, 8 + +    sb                t2, 0(a0) +    addiu             a3, a3, -1 +    bnez              a3, 2b +     addiu            a0, a0, 1 + +3: +    j                 ra +     nop + +END(pixman_composite_src_n_8_8_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (32bit constant) + * a2 - mask (a8r8g8b8) + * a3 - w + */ + +    beqz         a3, 8f +     nop +    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5 + +    li           t6, 0xff +    addiu        t7, zero, -1 /* t7 = 0xffffffff */ +    srl          t8, a1, 24   /* t8 = srca */ +    li           t9, 0x00ff00ff + +    addiu        t1, a3, -1 +    beqz         t1, 4f       /* last pixel */ +     nop + +0: +    lw           t0, 0(a2)    /* t0 = mask */ +    lw           t1, 4(a2)    /* t1 = mask */ +    addiu        a3, a3, -2   /* w = w - 2 */ +    or           t2, t0, t1 +    beqz       
  t2, 3f      /* if (t0 == 0) && (t1 == 0) */ +     addiu       a2, a2, 8 +    and          t2, t0, t1 +    beq          t2, t7, 1f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */ +     nop + +//if(ma) +    lw           t2, 0(a0)    /* t2 = dst */ +    lw           t3, 4(a0)    /* t3 = dst */ +    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5 +    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5 +    not          t0, t0 +    not          t1, t1 +    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5 +    addu_s.qb    t2, t4, t2 +    addu_s.qb    t3, t5, t3 +    sw           t2, 0(a0) +    sw           t3, 4(a0) +    addiu        t1, a3, -1 +    bgtz         t1, 0b +     addiu       a0, a0, 8 +    b            4f +     nop +1: +//if (t0 == 0xffffffff) && (t1 == 0xffffffff): +    beq          t8, t6, 2f   /* if (srca == 0xff) */ +     nop +    lw           t2, 0(a0)    /* t2 = dst */ +    lw           t3, 4(a0)    /* t3 = dst */ +    not          t0, a1 +    not          t1, a1 +    srl          t0, t0, 24 +    srl          t1, t1, 24 +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5 +    addu_s.qb    t2, a1, t2 +    addu_s.qb    t3, a1, t3 +    sw           t2, 0(a0) +    sw           t3, 4(a0) +    addiu        t1, a3, -1 +    bgtz         t1, 0b +     addiu       a0, a0, 8 +    b            4f +     nop +2: +    sw           a1, 0(a0) +    sw           a1, 4(a0) +3: +    addiu        t1, a3, -1 +    bgtz         t1, 0b +     addiu       a0, a0, 8 + +4: +    beqz         a3, 7f +     nop +                              /* a1 = src */ +    lw           t0, 0(a2)    /* t0 = mask */ +    beqz         t0, 7f       /* if (t0 == 0) */ +     nop +    beq          t0, t7, 5f  /* if (t0 == 0xffffffff) */ +     nop +//if(ma) +    lw           t1, 0(a0)    /* t1 = dst */ +    MIPS_UN8x4_MUL_UN8x4  a1, t0, t2, t9, t3, t4, t5, s0 +    MIPS_UN8x4_MUL_UN8    t0, t8, t0, t9, t3, t4, t5 +    not          t0, t0 +    MIPS_UN8x4_MUL_UN8x4  t1, t0, t1, t9, t3, t4, t5, s0 +    addu_s.qb    t1, t2, t1 +    sw           t1, 0(a0) +    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5 +    j            ra +     nop +5: +//if (t0 == 0xffffffff) +    beq          t8, t6, 6f   /* if (srca == 0xff) */ +     nop +    lw           t1, 0(a0)    /* t1 = dst */ +    not          t0, a1 +    srl          t0, t0, 24 +    MIPS_UN8x4_MUL_UN8 t1, t0, t1, t9, t2, t3, t4 +    addu_s.qb    t1, a1, t1 +    sw           t1, 0(a0) +    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5 +    j            ra +     nop +6: +    sw           a1, 0(a0) +7: +    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5 +8: +    j            ra +     nop + +END(pixman_composite_over_n_8888_8888_ca_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips) +/* + * a0 - dst  (r5g6b5) + * a1 - src  (32bit constant) + * a2 - mask (a8r8g8b8) + * a3 - w + */ + +    beqz         a3, 8f +     nop +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8 + +    li           t6, 0xff +    addiu        t7, zero, -1 /* t7 = 0xffffffff */ +    srl          t8, a1, 24   /* t8 = srca */ +    li           t9, 0x00ff00ff +    li           s6, 0xf800f800 +    li           s7, 0x07e007e0 +    li           s8, 0x001F001F + +    addiu        t1, a3, -1 +    beqz         t1, 4f       /* last pixel */ +     nop + +0: +    lw           t0, 0(a2)    /* t0 = mask */ +    lw           t1, 4(a2)    /* t1 = mask */ +    addiu        a3, a3, -2  
 /* w = w - 2 */ +    or           t2, t0, t1 +    beqz         t2, 3f      /* if (t0 == 0) && (t1 == 0) */ +     addiu       a2, a2, 8 +    and          t2, t0, t1 +    beq          t2, t7, 1f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */ +     nop + +//if(ma) +    lhu          t2, 0(a0)    /* t2 = dst */ +    lhu          t3, 2(a0)    /* t3 = dst */ +    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5 +    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5 +    not          t0, t0 +    not          t1, t1 +    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, s7, s8, s0, s1, s2, s3 +    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5 +    addu_s.qb    t2, t4, t2 +    addu_s.qb    t3, t5, t3 +    CONVERT_2x8888_TO_2x0565 t2, t3, t2, t3, s6, s7, s8, s0, s1 +    sh           t2, 0(a0) +    sh           t3, 2(a0) +    addiu        t1, a3, -1 +    bgtz         t1, 0b +     addiu       a0, a0, 4 +    b            4f +     nop +1: +//if (t0 == 0xffffffff) && (t1 == 0xffffffff): +    beq          t8, t6, 2f   /* if (srca == 0xff) */ +     nop +    lhu          t2, 0(a0)    /* t2 = dst */ +    lhu          t3, 2(a0)    /* t3 = dst */ +    not          t0, a1 +    not          t1, a1 +    srl          t0, t0, 24 +    srl          t1, t1, 24 +    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, s7, s8, s0, s1, s2, s3 +    MIPS_2xUN8x4_MUL_2xUN8   t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5 +    addu_s.qb    t2, a1, t2 +    addu_s.qb    t3, a1, t3 +    CONVERT_2x8888_TO_2x0565 t2, t3, t2, t3, s6, s7, s8, s0, s1 +    sh           t2, 0(a0) +    sh           t3, 2(a0) +    addiu        t1, a3, -1 +    bgtz         t1, 0b +     addiu       a0, a0, 4 +    b            4f +     nop +2: +    CONVERT_1x8888_TO_1x0565 a1, t2, s0, s1 +    sh           t2, 0(a0) +    sh           t2, 2(a0) +3: +    addiu        t1, a3, -1 +    bgtz         t1, 0b +     addiu       a0, a0, 4 + +4: +    beqz         a3, 7f +     nop +                              /* a1 = src */ +    lw           t0, 0(a2)    /* t0 = mask */ +    beqz         t0, 7f       /* if (t0 == 0) */ +     nop +    beq          t0, t7, 5f  /* if (t0 == 0xffffffff) */ +     nop +//if(ma) +    lhu          t1, 0(a0)    /* t1 = dst */ +    MIPS_UN8x4_MUL_UN8x4     a1, t0, t2, t9, t3, t4, t5, s0 +    MIPS_UN8x4_MUL_UN8       t0, t8, t0, t9, t3, t4, t5 +    not          t0, t0 +    CONVERT_1x0565_TO_1x8888 t1, s1, s2, s3 +    MIPS_UN8x4_MUL_UN8x4     s1, t0, s1, t9, t3, t4, t5, s0 +    addu_s.qb    s1, t2, s1 +    CONVERT_1x8888_TO_1x0565 s1, t1, s0, s2 +    sh           t1, 0(a0) +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8 +    j            ra +     nop +5: +//if (t0 == 0xffffffff) +    beq          t8, t6, 6f   /* if (srca == 0xff) */ +     nop +    lhu          t1, 0(a0)    /* t1 = dst */ +    not          t0, a1 +    srl          t0, t0, 24 +    CONVERT_1x0565_TO_1x8888 t1, s1, s2, s3 +    MIPS_UN8x4_MUL_UN8       s1, t0, s1, t9, t2, t3, t4 +    addu_s.qb    s1, a1, s1 +    CONVERT_1x8888_TO_1x0565 s1, t1, s0, s2 +    sh           t1, 0(a0) +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8 +    j            ra +     nop +6: +    CONVERT_1x8888_TO_1x0565 a1, t1, s0, s2 +    sh           t1, 0(a0) +7: +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8 +8: +    j            ra +     nop + +END(pixman_composite_over_n_8888_0565_ca_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8_asm_mips) +/* + * a0 - dst  (a8) + * a1 - 
src  (32bit constant) + * a2 - mask (a8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, v0 +    li                t9, 0x00ff00ff +    beqz              a3, 3f +     nop +    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */ +    beqz              v0, 1f      /* branch if less than 4 src pixels */ +     nop + +    srl               t8, a1, 24 +    replv.ph          t8, t8 + +0: +    beqz              v0, 1f +     addiu            v0, v0, -1 +    lbu               t0, 0(a2) +    lbu               t1, 1(a2) +    lbu               t2, 2(a2) +    lbu               t3, 3(a2) +    lbu               t4, 0(a0) +    lbu               t5, 1(a0) +    lbu               t6, 2(a0) +    lbu               t7, 3(a0) + +    addiu             a2, a2, 4 + +    precr_sra.ph.w    t1, t0, 0 +    precr_sra.ph.w    t3, t2, 0 +    precr_sra.ph.w    t5, t4, 0 +    precr_sra.ph.w    t7, t6, 0 + +    precr.qb.ph       t0, t3, t1 +    precr.qb.ph       t1, t7, t5 + +    muleu_s.ph.qbl    t2, t0, t8 +    muleu_s.ph.qbr    t3, t0, t8 +    shra_r.ph         t4, t2, 8 +    shra_r.ph         t5, t3, 8 +    and               t4, t4, t9 +    and               t5, t5, t9 +    addq.ph           t2, t2, t4 +    addq.ph           t3, t3, t5 +    shra_r.ph         t2, t2, 8 +    shra_r.ph         t3, t3, 8 +    precr.qb.ph       t0, t2, t3 +    not               t6, t0 + +    preceu.ph.qbl     t7, t6 +    preceu.ph.qbr     t6, t6 + +    muleu_s.ph.qbl    t2, t1, t7 +    muleu_s.ph.qbr    t3, t1, t6 +    shra_r.ph         t4, t2, 8 +    shra_r.ph         t5, t3, 8 +    and               t4, t4, t9 +    and               t5, t5, t9 +    addq.ph           t2, t2, t4 +    addq.ph           t3, t3, t5 +    shra_r.ph         t2, t2, 8 +    shra_r.ph         t3, t3, 8 +    precr.qb.ph       t1, t2, t3 + +    addu_s.qb         t2, t0, t1 + +    sb                t2, 0(a0) +    srl               t2, t2, 8 +    sb                t2, 1(a0) +    srl               t2, t2, 8 +    sb                t2, 2(a0) +    srl               t2, t2, 8 +    sb                t2, 3(a0) +    addiu             a3, a3, -4 +    b                 0b +     addiu            a0, a0, 4 + +1: +    beqz              a3, 3f +     nop +    srl               t8, a1, 24 +2: +    lbu               t0, 0(a2) +    lbu               t1, 0(a0) +    addiu             a2, a2, 1 + +    mul               t2, t0, t8 +    shra_r.ph         t3, t2, 8 +    andi              t3, t3, 0x00ff +    addq.ph           t2, t2, t3 +    shra_r.ph         t2, t2, 8 +    not               t3, t2 +    andi              t3, t3, 0x00ff + + +    mul               t4, t1, t3 +    shra_r.ph         t5, t4, 8 +    andi              t5, t5, 0x00ff +    addq.ph           t4, t4, t5 +    shra_r.ph         t4, t4, 8 +    andi              t4, t4, 0x00ff + +    addu_s.qb         t2, t2, t4 +    sb                t2, 0(a0) +    addiu             a3, a3, -1 +    bnez              a3, 2b +     addiu            a0, a0, 1 + +3: +    RESTORE_REGS_FROM_STACK 0, v0 +    j                 ra +     nop + +END(pixman_composite_over_n_8_8_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (32bit constant) + * a2 - mask (a8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 4, s0, s1, s2, s3, s4 +    beqz      a3, 4f +     nop +    li        t4, 0x00ff00ff +    li        t5, 0xff +    addiu     t0, a3, -1 +    beqz      t0, 3f         /* last pixel */ +     srl      t6, a1, 24     /* t6 = srca */ +    not       s4, a1 +    beq       t5, t6, 2f    
 /* if (srca == 0xff) */ +     srl      s4, s4, 24 +1: +                             /* a1 = src */ +    lbu       t0, 0(a2)      /* t0 = mask */ +    lbu       t1, 1(a2)      /* t1 = mask */ +    or        t2, t0, t1 +    beqz      t2, 111f       /* if (t0 == 0) && (t1 == 0) */ +     addiu    a2, a2, 2 +    and       t3, t0, t1 + +    lw        t2, 0(a0)      /* t2 = dst */ +    beq       t3, t5, 11f    /* if (t0 == 0xff) && (t1 == 0xff) */ +     lw       t3, 4(a0)      /* t3 = dst */ + +    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s0, s1, t4, t6, t7, t8, t9, s2, s3 +    not       s2, s0 +    not       s3, s1 +    srl       s2, s2, 24 +    srl       s3, s3, 24 +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s2, s3, t2, t3, t4, t0, t1, t6, t7, t8, t9 +    addu_s.qb s2, t2, s0 +    addu_s.qb s3, t3, s1 +    sw        s2, 0(a0) +    b         111f +     sw       s3, 4(a0) +11: +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s4, s4, t2, t3, t4, t0, t1, t6, t7, t8, t9 +    addu_s.qb s2, t2, a1 +    addu_s.qb s3, t3, a1 +    sw        s2, 0(a0) +    sw        s3, 4(a0) + +111: +    addiu     a3, a3, -2 +    addiu     t0, a3, -1 +    bgtz      t0, 1b +     addiu    a0, a0, 8 +    b         3f +     nop +2: +                             /* a1 = src */ +    lbu       t0, 0(a2)      /* t0 = mask */ +    lbu       t1, 1(a2)      /* t1 = mask */ +    or        t2, t0, t1 +    beqz      t2, 222f       /* if (t0 == 0) && (t1 == 0) */ +     addiu    a2, a2, 2 +    and       t3, t0, t1 +    beq       t3, t5, 22f    /* if (t0 == 0xff) && (t1 == 0xff) */ +     nop +    lw        t2, 0(a0)      /* t2 = dst */ +    lw        t3, 4(a0)      /* t3 = dst */ + +    OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, t2, t3, \ +                           t6, t7, t4, t8, t9, s0, s1, s2, s3 +    sw        t6, 0(a0) +    b         222f +     sw        t7, 4(a0) +22: +    sw        a1, 0(a0) +    sw        a1, 4(a0) +222: +    addiu     a3, a3, -2 +    addiu     t0, a3, -1 +    bgtz      t0, 2b +     addiu    a0, a0, 8 +3: +    blez      a3, 4f +     nop +                             /* a1 = src */ +    lbu       t0, 0(a2)      /* t0 = mask */ +    beqz      t0, 4f         /* if (t0 == 0) */ +     addiu    a2, a2, 1 +    move      t3, a1 +    beq       t0, t5, 31f    /* if (t0 == 0xff) */ +     lw       t1, 0(a0)      /* t1 = dst */ + +    MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t6, t7, t8 +31: +    not       t2, t3 +    srl       t2, t2, 24 +    MIPS_UN8x4_MUL_UN8 t1, t2, t1, t4, t6, t7, t8 +    addu_s.qb t2, t1, t3 +    sw        t2, 0(a0) +4: +    RESTORE_REGS_FROM_STACK 4, s0, s1, s2, s3, s4 +    j         ra +     nop + +END(pixman_composite_over_n_8_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8_0565_asm_mips) +/* + * a0 - dst  (r5g6b5) + * a1 - src  (32bit constant) + * a2 - mask (a8) + * a3 - w + */ +    SAVE_REGS_ON_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8 +    beqz     a3, 4f +     nop +    li       t4, 0x00ff00ff +    li       t5, 0xff +    li       t6, 0xf800f800 +    li       t7, 0x07e007e0 +    li       t8, 0x001F001F +    addiu    t1, a3, -1 +    beqz     t1, 3f         /* last pixel */ +     srl     t0, a1, 24     /* t0 = srca */ +    not      v0, a1 +    beq      t0, t5, 2f     /* if (srca == 0xff) */ +     srl     v0, v0, 24 +1: +                            /* a1 = src */ +    lbu      t0, 0(a2)      /* t0 = mask */ +    lbu      t1, 1(a2)      /* t1 = mask */ +    or       t2, t0, t1 +    beqz     t2, 111f       /* if (t0 == 0) && (t1 == 0) */ +     addiu   a2, a2, 2 +    lhu      t2, 0(a0)      /* t2 = dst */ +    lhu   
   t3, 2(a0)      /* t3 = dst */ +    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t7, t8, t9, s2, s3, s4 +    and      t9, t0, t1 +    beq      t9, t5, 11f    /* if (t0 == 0xff) && (t1 == 0xff) */ +     nop + +    MIPS_2xUN8x4_MUL_2xUN8   a1, a1, t0, t1, s2, s3, t4, t9, s4, s5, s6, s7, s8 +    not      s4, s2 +    not      s5, s3 +    srl      s4, s4, 24 +    srl      s5, s5, 24 +    MIPS_2xUN8x4_MUL_2xUN8   s0, s1, s4, s5, s0, s1, t4, t9, t0, t1, s6, s7, s8 +    addu_s.qb                s4, s2, s0 +    addu_s.qb                s5, s3, s1 +    CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1 +    sh       t2, 0(a0) +    b        111f +     sh      t3, 2(a0) +11: +    MIPS_2xUN8x4_MUL_2xUN8   s0, s1, v0, v0, s0, s1, t4, t9, t0, t1, s6, s7, s8 +    addu_s.qb                s4, a1, s0 +    addu_s.qb                s5, a1, s1 +    CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1 +    sh       t2, 0(a0) +    sh       t3, 2(a0) +111: +    addiu    a3, a3, -2 +    addiu    t0, a3, -1 +    bgtz     t0, 1b +     addiu   a0, a0, 4 +    b        3f +     nop +2: +    CONVERT_1x8888_TO_1x0565 a1, s0, s1, s2 +21: +                            /* a1 = src */ +    lbu      t0, 0(a2)      /* t0 = mask */ +    lbu      t1, 1(a2)      /* t1 = mask */ +    or       t2, t0, t1 +    beqz     t2, 222f       /* if (t0 == 0) && (t1 == 0) */ +     addiu   a2, a2, 2 +    and      t9, t0, t1 +    move     s2, s0 +    beq      t9, t5, 22f    /* if (t0 == 0xff) && (t2 == 0xff) */ +     move    s3, s0 +    lhu      t2, 0(a0)      /* t2 = dst */ +    lhu      t3, 2(a0)      /* t3 = dst */ + +    CONVERT_2x0565_TO_2x8888 t2, t3, s2, s3, t7, t8, s4, s5, s6, s7 +    OVER_2x8888_2x8_2x8888   a1, a1, t0, t1, s2, s3, \ +                             t2, t3, t4, t9, s4, s5, s6, s7, s8 +    CONVERT_2x8888_TO_2x0565 t2, t3, s2, s3, t6, t7, t8, s4, s5 +22: +    sh       s2, 0(a0) +    sh       s3, 2(a0) +222: +    addiu    a3, a3, -2 +    addiu    t0, a3, -1 +    bgtz     t0, 21b +     addiu   a0, a0, 4 +3: +    blez      a3, 4f +     nop +                            /* a1 = src */ +    lbu      t0, 0(a2)      /* t0 = mask */ +    beqz     t0, 4f         /* if (t0 == 0) */ +     nop +    lhu      t1, 0(a0)      /* t1 = dst */ +    CONVERT_1x0565_TO_1x8888 t1, t2, t3, t7 +    beq      t0, t5, 31f    /* if (t0 == 0xff) */ +     move    t3, a1 + +    MIPS_UN8x4_MUL_UN8       a1, t0, t3, t4, t7, t8, t9 +31: +    not      t6, t3 +    srl      t6, t6, 24 +    MIPS_UN8x4_MUL_UN8       t2, t6, t2, t4, t7, t8, t9 +    addu_s.qb                t1, t2, t3 +    CONVERT_1x8888_TO_1x0565 t1, t2, t3, t7 +    sh       t2, 0(a0) +4: +    RESTORE_REGS_FROM_STACK  24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8 +    j        ra +     nop + +END(pixman_composite_over_n_8_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_8888_n_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (a8r8g8b8) + * a2 - mask (32bit constant) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, s0 +    li       t4, 0x00ff00ff +    beqz     a3, 3f +     nop +    addiu    t1, a3, -1 +    srl      a2, a2, 24 +    beqz     t1, 2f +     nop + +1: +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */ +                       /* a2 = mask        (32bit constant) */ +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */ +    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */ +    addiu    a1, a1, 8 + +    OVER_2x8888_2x8_2x8888 t0, t1, a2, a2, t2, t3, \ +                           t5, t6, t4, t7, t8, 
t9, t0, t1, s0 + +    sw       t5, 0(a0) +    sw       t6, 4(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 8 +2: +    beqz     a3, 3f +     nop +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +                       /* a2 = mask        (32bit constant) */ +    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */ + +    OVER_8888_8_8888 t0, a2, t1, t3, t4, t5, t6, t7, t8 + +    sw       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, s0 +    j        ra +     nop + +END(pixman_composite_over_8888_n_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_8888_n_0565_asm_mips) +/* + * a0 - dst  (r5g6b5) + * a1 - src  (a8r8g8b8) + * a2 - mask (32bit constant) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3 +    li       t6, 0x00ff00ff +    li       t7, 0xf800f800 +    li       t8, 0x07e007e0 +    li       t9, 0x001F001F +    beqz     a3, 3f +     nop +    srl      a2, a2, 24 +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */ +                       /* a2 = mask        (32bit constant) */ +    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */ +    lhu      t3, 2(a0) /* t2 = destination (r5g6b5) */ +    addiu    a1, a1, 8 + +    CONVERT_2x0565_TO_2x8888 t2, t3, t4, t5, t8, t9, s0, s1, t2, t3 +    OVER_2x8888_2x8_2x8888   t0, t1, a2, a2, t4, t5, \ +                             t2, t3, t6, t0, t1, s0, s1, s2, s3 +    CONVERT_2x8888_TO_2x0565 t2, t3, t4, t5, t7, t8, t9, s0, s1 + +    sh       t4, 0(a0) +    sh       t5, 2(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 4 +2: +    beqz     a3, 3f +     nop +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +                       /* a2 = mask        (32bit constant) */ +    lhu      t1, 0(a0) /* t1 = destination (r5g6b5) */ + +    CONVERT_1x0565_TO_1x8888 t1, t2, t4, t5 +    OVER_8888_8_8888         t0, a2, t2, t1, t6, t3, t4, t5, t7 +    CONVERT_1x8888_TO_1x0565 t1, t3, t4, t5 + +    sh       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 +    j                 ra +     nop + +END(pixman_composite_over_8888_n_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_0565_n_0565_asm_mips) +/* + * a0 - dst  (r5g6b5) + * a1 - src  (r5g6b5) + * a2 - mask (32bit constant) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5 +    li       t6, 0x00ff00ff +    li       t7, 0xf800f800 +    li       t8, 0x07e007e0 +    li       t9, 0x001F001F +    beqz     a3, 3f +     nop +    srl      a2, a2, 24 +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */ +    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */ +                       /* a2 = mask        (32bit constant) */ +    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */ +    lhu      t3, 2(a0) /* t3 = destination (r5g6b5) */ +    addiu    a1, a1, 4 + +    CONVERT_2x0565_TO_2x8888 t0, t1, t4, t5, t8, t9, s0, s1, s2, s3 +    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t8, t9, s2, s3, s4, s5 +    OVER_2x8888_2x8_2x8888   t4, t5, a2, a2, s0, s1, \ +                             t0, t1, t6, s2, s3, s4, s5, t4, t5 +    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t7, t8, t9, s2, s3 + +    sh       s0, 0(a0) +    sh       s1, 2(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 4 +2: +    beqz     a3, 3f +     nop +    lhu      t0, 0(a1) 
/* t0 = source      (r5g6b5) */ +                       /* a2 = mask        (32bit constant) */ +    lhu      t1, 0(a0) /* t1 = destination (r5g6b5) */ + +    CONVERT_1x0565_TO_1x8888 t0, t2, t4, t5 +    CONVERT_1x0565_TO_1x8888 t1, t3, t4, t5 +    OVER_8888_8_8888         t2, a2, t3, t0, t6, t1, t4, t5, t7 +    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5 + +    sh       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5 +    j        ra +     nop + +END(pixman_composite_over_0565_n_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (a8r8g8b8) + * a2 - mask (a8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, s0, s1 +    li       t4, 0x00ff00ff +    beqz     a3, 3f +     nop +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */ +    lbu      t2, 0(a2) /* t2 = mask        (a8) */ +    lbu      t3, 1(a2) /* t3 = mask        (a8) */ +    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */ +    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */ +    addiu    a1, a1, 8 +    addiu    a2, a2, 2 + +    OVER_2x8888_2x8_2x8888 t0, t1, t2, t3, t5, t6, \ +                           t7, t8, t4, t9, s0, s1, t0, t1, t2 + +    sw       t7, 0(a0) +    sw       t8, 4(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 8 +2: +    beqz     a3, 3f +     nop +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lbu      t1, 0(a2) /* t1 = mask        (a8) */ +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */ + +    OVER_8888_8_8888 t0, t1, t2, t3, t4, t5, t6, t7, t8 + +    sw       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, s0, s1 +    j        ra +     nop + +END(pixman_composite_over_8888_8_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_0565_asm_mips) +/* + * a0 - dst  (r5g6b5) + * a1 - src  (a8r8g8b8) + * a2 - mask (a8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5 +    li       t6, 0x00ff00ff +    li       t7, 0xf800f800 +    li       t8, 0x07e007e0 +    li       t9, 0x001F001F +    beqz     a3, 3f +     nop +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */ +    lbu      t2, 0(a2) /* t2 = mask        (a8) */ +    lbu      t3, 1(a2) /* t3 = mask        (a8) */ +    lhu      t4, 0(a0) /* t4 = destination (r5g6b5) */ +    lhu      t5, 2(a0) /* t5 = destination (r5g6b5) */ +    addiu    a1, a1, 8 +    addiu    a2, a2, 2 + +    CONVERT_2x0565_TO_2x8888 t4, t5, s0, s1, t8, t9, s2, s3, s4, s5 +    OVER_2x8888_2x8_2x8888   t0, t1, t2, t3, s0, s1, \ +                             t4, t5, t6, s2, s3, s4, s5, t0, t1 +    CONVERT_2x8888_TO_2x0565 t4, t5, s0, s1, t7, t8, t9, s2, s3 + +    sh       s0, 0(a0) +    sh       s1, 2(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 4 +2: +    beqz     a3, 3f +     nop +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lbu      t1, 0(a2) /* t1 = mask        (a8) */ +    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */ + +    CONVERT_1x0565_TO_1x8888 t2, t3, t4, t5 +    OVER_8888_8_8888         t0, t1, t3, t2, t6, t4, t5, t7, t8 +    CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5 + +    sh       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5 +    j        ra +     nop + 
+END(pixman_composite_over_8888_8_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_0565_8_0565_asm_mips) +/* + * a0 - dst  (r5g6b5) + * a1 - src  (r5g6b5) + * a2 - mask (a8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5 +    li       t4, 0xf800f800 +    li       t5, 0x07e007e0 +    li       t6, 0x001F001F +    li       t7, 0x00ff00ff +    beqz     a3, 3f +     nop +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */ +    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */ +    lbu      t2, 0(a2) /* t2 = mask        (a8) */ +    lbu      t3, 1(a2) /* t3 = mask        (a8) */ +    lhu      t8, 0(a0) /* t8 = destination (r5g6b5) */ +    lhu      t9, 2(a0) /* t9 = destination (r5g6b5) */ +    addiu    a1, a1, 4 +    addiu    a2, a2, 2 + +    CONVERT_2x0565_TO_2x8888 t0, t1, s0, s1, t5, t6, s2, s3, s4, s5 +    CONVERT_2x0565_TO_2x8888 t8, t9, s2, s3, t5, t6, s4, s5, t0, t1 +    OVER_2x8888_2x8_2x8888   s0, s1, t2, t3, s2, s3, \ +                             t0, t1, t7, s4, s5, t8, t9, s0, s1 +    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t4, t5, t6, s2, s3 + +    sh       s0, 0(a0) +    sh       s1, 2(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 4 +2: +    beqz     a3, 3f +     nop +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */ +    lbu      t1, 0(a2) /* t1 = mask        (a8) */ +    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */ + +    CONVERT_1x0565_TO_1x8888 t0, t3, t4, t5 +    CONVERT_1x0565_TO_1x8888 t2, t4, t5, t6 +    OVER_8888_8_8888         t3, t1, t4, t0, t7, t2, t5, t6, t8 +    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5 + +    sh       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5 +    j        ra +     nop + +END(pixman_composite_over_0565_8_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (a8r8g8b8) + * a2 - mask (a8r8g8b8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, s0, s1, s2 +    li       t4, 0x00ff00ff +    beqz     a3, 3f +     nop +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */ +    lw       t2, 0(a2) /* t2 = mask        (a8r8g8b8) */ +    lw       t3, 4(a2) /* t3 = mask        (a8r8g8b8) */ +    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */ +    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */ +    addiu    a1, a1, 8 +    addiu    a2, a2, 8 +    srl      t2, t2, 24 +    srl      t3, t3, 24 + +    OVER_2x8888_2x8_2x8888 t0, t1, t2, t3, t5, t6, t7, t8, t4, t9, s0, s1, s2, t0, t1 + +    sw       t7, 0(a0) +    sw       t8, 4(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 8 +2: +    beqz     a3, 3f +     nop +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw       t1, 0(a2) /* t1 = mask        (a8r8g8b8) */ +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */ +    srl      t1, t1, 24 + +    OVER_8888_8_8888 t0, t1, t2, t3, t4, t5, t6, t7, t8 + +    sw       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2 +    j        ra +     nop + +END(pixman_composite_over_8888_8888_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (a8r8g8b8) + * a2 - w + */ + +    SAVE_REGS_ON_STACK 0, s0, s1, s2 +    li           t4, 0x00ff00ff +    beqz         a2, 3f +     
nop +    addiu        t1, a2, -1 +    beqz         t1, 2f +     nop +1: +    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw           t1, 4(a1) /* t1 = source      (a8r8g8b8) */ +    lw           t2, 0(a0) /* t2 = destination (a8r8g8b8) */ +    lw           t3, 4(a0) /* t3 = destination (a8r8g8b8) */ +    addiu        a1, a1, 8 + +    not          t5, t0 +    srl          t5, t5, 24 +    not          t6, t1 +    srl          t6, t6, 24 + +    or           t7, t5, t6 +    beqz         t7, 11f +     or          t8, t0, t1 +    beqz         t8, 12f + +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t5, t6, t7, t8, t4, t9, s0, s1, s2, t2, t3 + +    addu_s.qb    t0, t7, t0 +    addu_s.qb    t1, t8, t1 +11: +    sw           t0, 0(a0) +    sw           t1, 4(a0) +12: +    addiu        a2, a2, -2 +    addiu        t1, a2, -1 +    bgtz         t1, 1b +     addiu       a0, a0, 8 +2: +    beqz         a2, 3f +     nop + +    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw           t1, 0(a0) /* t1 = destination (a8r8g8b8) */ +    addiu        a1, a1, 4 + +    not          t2, t0 +    srl          t2, t2, 24 + +    beqz         t2, 21f +     nop +    beqz         t0, 3f + +    MIPS_UN8x4_MUL_UN8 t1, t2, t3, t4, t5, t6, t7 + +    addu_s.qb    t0, t3, t0 +21: +    sw           t0, 0(a0) + +3: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2 +    j            ra +     nop + +END(pixman_composite_over_8888_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_8888_0565_asm_mips) +/* + * a0 - dst  (r5g6b5) + * a1 - src  (a8r8g8b8) + * a2 - w + */ + +    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5 +    li           t4, 0x00ff00ff +    li           s3, 0xf800f800 +    li           s4, 0x07e007e0 +    li           s5, 0x001F001F +    beqz         a2, 3f +     nop +    addiu        t1, a2, -1 +    beqz         t1, 2f +     nop +1: +    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw           t1, 4(a1) /* t1 = source      (a8r8g8b8) */ +    lhu          t2, 0(a0) /* t2 = destination (r5g6b5) */ +    lhu          t3, 2(a0) /* t3 = destination (r5g6b5) */ +    addiu        a1, a1, 8 + +    not          t5, t0 +    srl          t5, t5, 24 +    not          t6, t1 +    srl          t6, t6, 24 + +    or           t7, t5, t6 +    beqz         t7, 11f +     or          t8, t0, t1 +    beqz         t8, 12f + +    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, s4, s5, t7, t8, t9, s2 +    MIPS_2xUN8x4_MUL_2xUN8   s0, s1, t5, t6, t7, t8, t4, t9, t2, t3, s2, s0, s1 + +    addu_s.qb    t0, t7, t0 +    addu_s.qb    t1, t8, t1 +11: +    CONVERT_2x8888_TO_2x0565 t0, t1, t7, t8, s3, s4, s5, t2, t3 +    sh           t7, 0(a0) +    sh           t8, 2(a0) +12: +    addiu        a2, a2, -2 +    addiu        t1, a2, -1 +    bgtz         t1, 1b +     addiu       a0, a0, 4 +2: +    beqz         a2, 3f +     nop + +    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lhu          t1, 0(a0) /* t1 = destination (r5g6b5) */ +    addiu        a1, a1, 4 + +    not          t2, t0 +    srl          t2, t2, 24 + +    beqz         t2, 21f +     nop +    beqz         t0, 3f + +    CONVERT_1x0565_TO_1x8888 t1, s0, t8, t9 +    MIPS_UN8x4_MUL_UN8       s0, t2, t3, t4, t5, t6, t7 + +    addu_s.qb    t0, t3, t0 +21: +    CONVERT_1x8888_TO_1x0565 t0, s0, t8, t9 +    sh           s0, 0(a0) + +3: +    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5 +    j            ra +     nop + +END(pixman_composite_over_8888_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_0565_asm_mips) +/* + * a0 - dst  
(r5g6b5) + * a1 - src  (32bit constant) + * a2 - w + */ + +    beqz         a2, 5f +     nop + +    not          t0, a1 +    srl          t0, t0, 24 +    bgtz         t0, 1f +     nop +    CONVERT_1x8888_TO_1x0565 a1, t1, t2, t3 +0: +    sh           t1, 0(a0) +    addiu        a2, a2, -1 +    bgtz         a2, 0b +     addiu       a0, a0, 2 +    j            ra +     nop + +1: +    SAVE_REGS_ON_STACK 0, s0, s1, s2 +    li           t4, 0x00ff00ff +    li           t5, 0xf800f800 +    li           t6, 0x07e007e0 +    li           t7, 0x001F001F +    addiu        t1, a2, -1 +    beqz         t1, 3f +     nop +2: +    lhu          t1, 0(a0) /* t1 = destination (r5g6b5) */ +    lhu          t2, 2(a0) /* t2 = destination (r5g6b5) */ + +    CONVERT_2x0565_TO_2x8888 t1, t2, t3, t8, t6, t7, t9, s0, s1, s2 +    MIPS_2xUN8x4_MUL_2xUN8   t3, t8, t0, t0, t1, t2, t4, t9, s0, s1, s2, t3, t8 +    addu_s.qb                t1, t1, a1 +    addu_s.qb                t2, t2, a1 +    CONVERT_2x8888_TO_2x0565 t1, t2, t3, t8, t5, t6, t7, s0, s1 + +    sh           t3, 0(a0) +    sh           t8, 2(a0) + +    addiu        a2, a2, -2 +    addiu        t1, a2, -1 +    bgtz         t1, 2b +     addiu       a0, a0, 4 +3: +    beqz         a2, 4f +     nop + +    lhu          t1, 0(a0) /* t1 = destination (r5g6b5) */ + +    CONVERT_1x0565_TO_1x8888 t1, t2, s0, s1 +    MIPS_UN8x4_MUL_UN8       t2, t0, t1, t4, s0, s1, s2 +    addu_s.qb                t1, t1, a1 +    CONVERT_1x8888_TO_1x0565 t1, t2, s0, s1 + +    sh           t2, 0(a0) + +4: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2 +5: +    j            ra +     nop + +END(pixman_composite_over_n_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (32bit constant) + * a2 - w + */ + +    beqz         a2, 5f +     nop + +    not          t0, a1 +    srl          t0, t0, 24 +    bgtz         t0, 1f +     nop +0: +    sw           a1, 0(a0) +    addiu        a2, a2, -1 +    bgtz         a2, 0b +     addiu       a0, a0, 4 +    j            ra +     nop + +1: +    SAVE_REGS_ON_STACK 0, s0, s1, s2 +    li           t4, 0x00ff00ff +    addiu        t1, a2, -1 +    beqz         t1, 3f +     nop +2: +    lw           t2, 0(a0) /* t2 = destination (a8r8g8b8) */ +    lw           t3, 4(a0) /* t3 = destination (a8r8g8b8) */ + +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t0, t7, t8, t4, t9, s0, s1, s2, t2, t3 + +    addu_s.qb    t7, t7, a1 +    addu_s.qb    t8, t8, a1 + +    sw           t7, 0(a0) +    sw           t8, 4(a0) + +    addiu        a2, a2, -2 +    addiu        t1, a2, -1 +    bgtz         t1, 2b +     addiu       a0, a0, 8 +3: +    beqz         a2, 4f +     nop + +    lw           t1, 0(a0) /* t1 = destination (a8r8g8b8) */ + +    MIPS_UN8x4_MUL_UN8 t1, t0, t3, t4, t5, t6, t7 + +    addu_s.qb    t3, t3, a1 + +    sw           t3, 0(a0) + +4: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2 +5: +    j            ra +     nop + +END(pixman_composite_over_n_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_add_8_8_8_asm_mips) +/* + * a0 - dst  (a8) + * a1 - src  (a8) + * a2 - mask (a8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, v0, v1 +    li                t9, 0x00ff00ff +    beqz              a3, 3f +     nop + +    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */ +    beqz              v0, 1f      /* branch if less than 4 src pixels */ +     nop + +0: +    beqz              v0, 1f +     addiu            v0, v0, -1 +    lbu               t0, 0(a2) +    lbu               t1, 1(a2) +    lbu   
            t2, 2(a2) +    lbu               t3, 3(a2) +    lbu               t4, 0(a0) +    lbu               t5, 1(a0) +    lbu               t6, 2(a0) +    lbu               t7, 3(a0) + +    addiu             a2, a2, 4 + +    precr_sra.ph.w    t1, t0, 0 +    precr_sra.ph.w    t3, t2, 0 +    precr_sra.ph.w    t5, t4, 0 +    precr_sra.ph.w    t7, t6, 0 + +    precr.qb.ph       t0, t3, t1 +    precr.qb.ph       t1, t7, t5 + +    lbu               t4, 0(a1) +    lbu               v1, 1(a1) +    lbu               t7, 2(a1) +    lbu               t8, 3(a1) + +    addiu             a1, a1, 4 + +    precr_sra.ph.w    v1, t4, 0 +    precr_sra.ph.w    t8, t7, 0 + +    muleu_s.ph.qbl    t2, t0, t8 +    muleu_s.ph.qbr    t3, t0, v1 +    shra_r.ph         t4, t2, 8 +    shra_r.ph         t5, t3, 8 +    and               t4, t4, t9 +    and               t5, t5, t9 +    addq.ph           t2, t2, t4 +    addq.ph           t3, t3, t5 +    shra_r.ph         t2, t2, 8 +    shra_r.ph         t3, t3, 8 +    precr.qb.ph       t0, t2, t3 + +    addu_s.qb         t2, t0, t1 + +    sb                t2, 0(a0) +    srl               t2, t2, 8 +    sb                t2, 1(a0) +    srl               t2, t2, 8 +    sb                t2, 2(a0) +    srl               t2, t2, 8 +    sb                t2, 3(a0) +    addiu             a3, a3, -4 +    b                 0b +     addiu            a0, a0, 4 + +1: +    beqz              a3, 3f +     nop +2: +    lbu               t8, 0(a1) +    lbu               t0, 0(a2) +    lbu               t1, 0(a0) +    addiu             a1, a1, 1 +    addiu             a2, a2, 1 + +    mul               t2, t0, t8 +    shra_r.ph         t3, t2, 8 +    andi              t3, t3, 0xff +    addq.ph           t2, t2, t3 +    shra_r.ph         t2, t2, 8 +    andi              t2, t2, 0xff + +    addu_s.qb         t2, t2, t1 +    sb                t2, 0(a0) +    addiu             a3, a3, -1 +    bnez              a3, 2b +     addiu            a0, a0, 1 + +3: +    RESTORE_REGS_FROM_STACK 0, v0, v1 +    j                 ra +     nop + +END(pixman_composite_add_8_8_8_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8_asm_mips) +/* + * a0 - dst  (a8) + * a1 - src  (32bit constant) + * a2 - mask (a8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, v0 +    li                t9, 0x00ff00ff +    beqz              a3, 3f +     nop + +    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */ +    beqz              v0, 1f      /* branch if less than 4 src pixels */ +     nop + +    srl               t8, a1, 24 +    replv.ph          t8, t8 + +0: +    beqz              v0, 1f +     addiu            v0, v0, -1 +    lbu               t0, 0(a2) +    lbu               t1, 1(a2) +    lbu               t2, 2(a2) +    lbu               t3, 3(a2) +    lbu               t4, 0(a0) +    lbu               t5, 1(a0) +    lbu               t6, 2(a0) +    lbu               t7, 3(a0) + +    addiu             a2, a2, 4 + +    precr_sra.ph.w    t1, t0, 0 +    precr_sra.ph.w    t3, t2, 0 +    precr_sra.ph.w    t5, t4, 0 +    precr_sra.ph.w    t7, t6, 0 + +    precr.qb.ph       t0, t3, t1 +    precr.qb.ph       t1, t7, t5 + +    muleu_s.ph.qbl    t2, t0, t8 +    muleu_s.ph.qbr    t3, t0, t8 +    shra_r.ph         t4, t2, 8 +    shra_r.ph         t5, t3, 8 +    and               t4, t4, t9 +    and               t5, t5, t9 +    addq.ph           t2, t2, t4 +    addq.ph           t3, t3, t5 +    shra_r.ph         t2, t2, 8 +    shra_r.ph         t3, t3, 8 +    precr.qb.ph       t0, t2, t3 + +    
addu_s.qb         t2, t0, t1 + +    sb                t2, 0(a0) +    srl               t2, t2, 8 +    sb                t2, 1(a0) +    srl               t2, t2, 8 +    sb                t2, 2(a0) +    srl               t2, t2, 8 +    sb                t2, 3(a0) +    addiu             a3, a3, -4 +    b                 0b +     addiu            a0, a0, 4 + +1: +    beqz              a3, 3f +     nop +    srl               t8, a1, 24 +2: +    lbu               t0, 0(a2) +    lbu               t1, 0(a0) +    addiu             a2, a2, 1 + +    mul               t2, t0, t8 +    shra_r.ph         t3, t2, 8 +    andi              t3, t3, 0xff +    addq.ph           t2, t2, t3 +    shra_r.ph         t2, t2, 8 +    andi              t2, t2, 0xff + +    addu_s.qb         t2, t2, t1 +    sb                t2, 0(a0) +    addiu             a3, a3, -1 +    bnez              a3, 2b +     addiu            a0, a0, 1 + +3: +    RESTORE_REGS_FROM_STACK 0, v0 +    j                 ra +     nop + +END(pixman_composite_add_n_8_8_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (32bit constant) + * a2 - mask (a8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, s0, s1, s2 +    li       t4, 0x00ff00ff +    beqz     a3, 3f +     nop +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +                       /* a1 = source      (32bit constant) */ +    lbu      t0, 0(a2) /* t0 = mask        (a8) */ +    lbu      t1, 1(a2) /* t1 = mask        (a8) */ +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */ +    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */ +    addiu    a2, a2, 2 + +    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 a1, a1, \ +                                       t0, t1, \ +                                       t2, t3, \ +                                       t5, t6, \ +                                       t4, t7, t8, t9, s0, s1, s2 + +    sw       t5, 0(a0) +    sw       t6, 4(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 8 +2: +    beqz     a3, 3f +     nop +                       /* a1 = source      (32bit constant) */ +    lbu      t0, 0(a2) /* t0 = mask        (a8) */ +    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */ + +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 a1, t0, t1, t2, t4, t3, t5, t6 + +    sw       t2, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2 +    j        ra +     nop + +END(pixman_composite_add_n_8_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_add_0565_8_0565_asm_mips) +/* + * a0 - dst  (r5g6b5) + * a1 - src  (r5g6b5) + * a2 - mask (a8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7 +    li       t4, 0xf800f800 +    li       t5, 0x07e007e0 +    li       t6, 0x001F001F +    li       t7, 0x00ff00ff +    beqz     a3, 3f +     nop +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */ +    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */ +    lbu      t2, 0(a2) /* t2 = mask        (a8) */ +    lbu      t3, 1(a2) /* t3 = mask        (a8) */ +    lhu      t8, 0(a0) /* t8 = destination (r5g6b5) */ +    lhu      t9, 2(a0) /* t9 = destination (r5g6b5) */ +    addiu    a1, a1, 4 +    addiu    a2, a2, 2 + +    CONVERT_2x0565_TO_2x8888  t0, t1, s0, s1, t5, t6, s2, s3, s4, s5 +    CONVERT_2x0565_TO_2x8888  t8, t9, s2, s3, t5, t6, s4, s5, s6, s7 +    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4  s0, s1, \ +                                        t2, t3, \ +                           
             s2, s3, \ +                                        t0, t1, \ +                                        t7, s4, s5, s6, s7, t8, t9 +    CONVERT_2x8888_TO_2x0565  t0, t1, s0, s1, t4, t5, t6, s2, s3 + +    sh       s0, 0(a0) +    sh       s1, 2(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 4 +2: +    beqz     a3, 3f +     nop +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */ +    lbu      t1, 0(a2) /* t1 = mask        (a8) */ +    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */ + +    CONVERT_1x0565_TO_1x8888  t0, t3, t4, t5 +    CONVERT_1x0565_TO_1x8888  t2, t4, t5, t6 +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4  t3, t1, t4, t0, t7, t2, t5, t6 +    CONVERT_1x8888_TO_1x0565  t0, t3, t4, t5 + +    sh       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7 +    j        ra +     nop + +END(pixman_composite_add_0565_8_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_add_8888_8_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (a8r8g8b8) + * a2 - mask (a8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, s0, s1, s2 +    li       t4, 0x00ff00ff +    beqz     a3, 3f +     nop +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */ +    lbu      t2, 0(a2) /* t2 = mask        (a8) */ +    lbu      t3, 1(a2) /* t3 = mask        (a8) */ +    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */ +    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */ +    addiu    a1, a1, 8 +    addiu    a2, a2, 2 + +    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \ +                                       t2, t3, \ +                                       t5, t6, \ +                                       t7, t8, \ +                                       t4, t9, s0, s1, s2, t0, t1 + +    sw       t7, 0(a0) +    sw       t8, 4(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 8 +2: +    beqz     a3, 3f +     nop +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lbu      t1, 0(a2) /* t1 = mask        (a8) */ +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */ + +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t3, t4, t5, t6, t7 + +    sw       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2 +    j        ra +     nop + +END(pixman_composite_add_8888_8_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_add_8888_n_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (a8r8g8b8) + * a2 - mask (32bit constant) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, s0, s1, s2 +    li       t4, 0x00ff00ff +    beqz     a3, 3f +     nop +    srl      a2, a2, 24 +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */ +                       /* a2 = mask        (32bit constant) */ +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */ +    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */ +    addiu    a1, a1, 8 + +    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \ +                                       a2, a2, \ +                                       t2, t3, \ +                                       t5, t6, \ +                                       t4, t7, t8, t9, s0, s1, s2 + +    sw       t5, 0(a0) +    sw       t6, 4(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 8 +2: +    beqz  
   a3, 3f +     nop +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +                       /* a2 = mask        (32bit constant) */ +    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */ + +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, a2, t1, t3, t4, t5, t6, t7 + +    sw       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2 +    j        ra +     nop + +END(pixman_composite_add_8888_n_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (a8r8g8b8) + * a2 - mask (a8r8g8b8) + * a3 - w + */ + +    SAVE_REGS_ON_STACK 0, s0, s1, s2 +    li       t4, 0x00ff00ff +    beqz     a3, 3f +     nop +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */ +    lw       t2, 0(a2) /* t2 = mask        (a8r8g8b8) */ +    lw       t3, 4(a2) /* t3 = mask        (a8r8g8b8) */ +    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */ +    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */ +    addiu    a1, a1, 8 +    addiu    a2, a2, 8 +    srl      t2, t2, 24 +    srl      t3, t3, 24 + +    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \ +                                       t2, t3, \ +                                       t5, t6, \ +                                       t7, t8, \ +                                       t4, t9, s0, s1, s2, t0, t1 + +    sw       t7, 0(a0) +    sw       t8, 4(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 8 +2: +    beqz     a3, 3f +     nop +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */ +    lw       t1, 0(a2) /* t1 = mask        (a8r8g8b8) */ +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */ +    srl      t1, t1, 24 + +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t3, t4, t5, t6, t7 + +    sw       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2 +    j        ra +     nop + +END(pixman_composite_add_8888_8888_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_add_8_8_asm_mips) +/* + * a0 - dst  (a8) + * a1 - src  (a8) + * a2 - w + */ + +    beqz              a2, 3f +     nop +    srl               t9, a2, 2   /* t9 = how many multiples of 4 dst pixels */ +    beqz              t9, 1f      /* branch if less than 4 src pixels */ +     nop + +0: +    beqz              t9, 1f +     addiu            t9, t9, -1 +    lbu               t0, 0(a1) +    lbu               t1, 1(a1) +    lbu               t2, 2(a1) +    lbu               t3, 3(a1) +    lbu               t4, 0(a0) +    lbu               t5, 1(a0) +    lbu               t6, 2(a0) +    lbu               t7, 3(a0) + +    addiu             a1, a1, 4 + +    precr_sra.ph.w    t1, t0, 0 +    precr_sra.ph.w    t3, t2, 0 +    precr_sra.ph.w    t5, t4, 0 +    precr_sra.ph.w    t7, t6, 0 + +    precr.qb.ph       t0, t3, t1 +    precr.qb.ph       t1, t7, t5 + +    addu_s.qb         t2, t0, t1 + +    sb                t2, 0(a0) +    srl               t2, t2, 8 +    sb                t2, 1(a0) +    srl               t2, t2, 8 +    sb                t2, 2(a0) +    srl               t2, t2, 8 +    sb                t2, 3(a0) +    addiu             a2, a2, -4 +    b                 0b +     addiu            a0, a0, 4 + +1: +    beqz              a2, 3f +     nop +2: +    lbu               t0, 0(a1) +    lbu               t1, 0(a0) +    addiu             a1, a1, 1 + +    addu_s.qb         t2, t0, t1 +    sb                t2, 0(a0) +    addiu             a2, a2, 
-1 +    bnez              a2, 2b +     addiu            a0, a0, 1 + +3: +    j                 ra +     nop + +END(pixman_composite_add_8_8_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_asm_mips) +/* + * a0 - dst (a8r8g8b8) + * a1 - src (a8r8g8b8) + * a2 - w + */ + +    beqz         a2, 4f +     nop + +    srl          t9, a2, 2      /* t1 = how many multiples of 4 src pixels */ +    beqz         t9, 3f         /* branch if less than 4 src pixels */ +     nop +1: +    addiu        t9, t9, -1 +    beqz         t9, 2f +     addiu       a2, a2, -4 + +    lw           t0, 0(a1) +    lw           t1, 4(a1) +    lw           t2, 8(a1) +    lw           t3, 12(a1) +    lw           t4, 0(a0) +    lw           t5, 4(a0) +    lw           t6, 8(a0) +    lw           t7, 12(a0) +    addiu        a1, a1, 16 + +    addu_s.qb    t4, t4, t0 +    addu_s.qb    t5, t5, t1 +    addu_s.qb    t6, t6, t2 +    addu_s.qb    t7, t7, t3 + +    sw           t4, 0(a0) +    sw           t5, 4(a0) +    sw           t6, 8(a0) +    sw           t7, 12(a0) +    b            1b +     addiu       a0, a0, 16 +2: +    lw           t0, 0(a1) +    lw           t1, 4(a1) +    lw           t2, 8(a1) +    lw           t3, 12(a1) +    lw           t4, 0(a0) +    lw           t5, 4(a0) +    lw           t6, 8(a0) +    lw           t7, 12(a0) +    addiu        a1, a1, 16 + +    addu_s.qb    t4, t4, t0 +    addu_s.qb    t5, t5, t1 +    addu_s.qb    t6, t6, t2 +    addu_s.qb    t7, t7, t3 + +    sw           t4, 0(a0) +    sw           t5, 4(a0) +    sw           t6, 8(a0) +    sw           t7, 12(a0) + +    beqz         a2, 4f +     addiu       a0, a0, 16 +3: +    lw           t0, 0(a1) +    lw           t1, 0(a0) +    addiu        a1, a1, 4 +    addiu        a2, a2, -1 +    addu_s.qb    t1, t1, t0 +    sw           t1, 0(a0) +    bnez         a2, 3b +     addiu       a0, a0, 4 +4: +    jr           ra +     nop + +END(pixman_composite_add_8888_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_0565_asm_mips) +/* + * a0 - dst  (r5g6b5) + * a1 - src  (a8) + * a2 - w + */ + +    beqz     a2, 4f +     nop + +    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3 +    li       t2, 0xf800f800 +    li       t3, 0x07e007e0 +    li       t4, 0x001F001F +    li       t5, 0x00ff00ff + +    addiu    t1, a2, -1 +    beqz     t1, 2f +     nop +1: +    lbu      t0, 0(a1) /* t0 = source      (a8) */ +    lbu      t1, 1(a1) /* t1 = source      (a8) */ +    lhu      t6, 0(a0) /* t6 = destination (r5g6b5) */ +    lhu      t7, 2(a0) /* t7 = destination (r5g6b5) */ +    addiu    a1, a1, 2 + +    not      t0, t0 +    not      t1, t1 +    andi     t0, 0xff  /* t0 = neg source1 */ +    andi     t1, 0xff  /* t1 = neg source2 */ +    CONVERT_2x0565_TO_2x8888 t6, t7, t8, t9, t3, t4, s0, s1, s2, s3 +    MIPS_2xUN8x4_MUL_2xUN8   t8, t9, t0, t1, t6, t7, t5, s0, s1, s2, s3, t8, t9 +    CONVERT_2x8888_TO_2x0565 t6, t7, t8, t9, t2, t3, t4, s0, s1 + +    sh       t8, 0(a0) +    sh       t9, 2(a0) +    addiu    a2, a2, -2 +    addiu    t1, a2, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 4 +2: +    beqz     a2, 3f +     nop +    lbu      t0, 0(a1) /* t0 = source      (a8) */ +    lhu      t1, 0(a0) /* t1 = destination (r5g6b5) */ + +    not      t0, t0 +    andi     t0, 0xff  /* t0 = neg source */ +    CONVERT_1x0565_TO_1x8888 t1, t2, t3, t4 +    MIPS_UN8x4_MUL_UN8        t2, t0, t1, t5, t3, t4, t6 +    CONVERT_1x8888_TO_1x0565 t1, t2, t3, t4 + +    sh       t2, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 +4: +    j        ra +     nop + 
+END(pixman_composite_out_reverse_8_0565_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (a8) + * a2 - w + */ + +    beqz     a2, 3f +     nop +    li       t4, 0x00ff00ff +    addiu    t1, a2, -1 +    beqz     t1, 2f +     nop +1: +    lbu      t0, 0(a1) /* t0 = source      (a8) */ +    lbu      t1, 1(a1) /* t1 = source      (a8) */ +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */ +    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */ +    addiu    a1, a1, 2 +    not      t0, t0 +    not      t1, t1 +    andi     t0, 0xff  /* t0 = neg source */ +    andi     t1, 0xff  /* t1 = neg source */ + +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t1, t5, t6, t4, t7, t8, t9, t2, t3, t0 + +    sw       t5, 0(a0) +    sw       t6, 4(a0) +    addiu    a2, a2, -2 +    addiu    t1, a2, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 8 +2: +    beqz     a2, 3f +     nop +    lbu      t0, 0(a1) /* t0 = source      (a8) */ +    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */ +    not      t0, t0 +    andi     t0, 0xff  /* t0 = neg source */ + +    MIPS_UN8x4_MUL_UN8 t1, t0, t2, t4, t3, t5, t6 + +    sw       t2, 0(a0) +3: +    j        ra +     nop + +END(pixman_composite_out_reverse_8_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_over_reverse_n_8888_asm_mips) +/* + * a0 - dst  (a8r8g8b8) + * a1 - src  (32bit constant) + * a2 - w + */ + +    beqz              a2, 5f +     nop + +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7 +    li                t0, 0x00ff00ff +    srl               t9, a2, 2   /* t9 = how many multiples of 4 src pixels */ +    beqz              t9, 2f      /* branch if less than 4 src pixels */ +     nop +1: +    beqz              t9, 2f +     addiu            t9, t9, -1 + +    lw                t1, 0(a0) +    lw                t2, 4(a0) +    lw                t3, 8(a0) +    lw                t4, 12(a0) + +    addiu             a2, a2, -4 + +    not               t5, t1 +    not               t6, t2 +    not               t7, t3 +    not               t8, t4 +    srl               t5, t5, 24 +    srl               t6, t6, 24 +    srl               t7, t7, 24 +    srl               t8, t8, 24 +    replv.ph          t5, t5 +    replv.ph          t6, t6 +    replv.ph          t7, t7 +    replv.ph          t8, t8 +    muleu_s.ph.qbl    s0, a1, t5 +    muleu_s.ph.qbr    s1, a1, t5 +    muleu_s.ph.qbl    s2, a1, t6 +    muleu_s.ph.qbr    s3, a1, t6 +    muleu_s.ph.qbl    s4, a1, t7 +    muleu_s.ph.qbr    s5, a1, t7 +    muleu_s.ph.qbl    s6, a1, t8 +    muleu_s.ph.qbr    s7, a1, t8 + +    shra_r.ph         t5, s0, 8 +    shra_r.ph         t6, s1, 8 +    shra_r.ph         t7, s2, 8 +    shra_r.ph         t8, s3, 8 +    and               t5, t5, t0 +    and               t6, t6, t0 +    and               t7, t7, t0 +    and               t8, t8, t0 +    addq.ph           s0, s0, t5 +    addq.ph           s1, s1, t6 +    addq.ph           s2, s2, t7 +    addq.ph           s3, s3, t8 +    shra_r.ph         s0, s0, 8 +    shra_r.ph         s1, s1, 8 +    shra_r.ph         s2, s2, 8 +    shra_r.ph         s3, s3, 8 +    shra_r.ph         t5, s4, 8 +    shra_r.ph         t6, s5, 8 +    shra_r.ph         t7, s6, 8 +    shra_r.ph         t8, s7, 8 +    and               t5, t5, t0 +    and               t6, t6, t0 +    and               t7, t7, t0 +    and               t8, t8, t0 +    addq.ph           s4, s4, t5 +    addq.ph           s5, s5, t6 +    addq.ph           s6, s6, t7 +    addq.ph           s7, 
s7, t8 +    shra_r.ph         s4, s4, 8 +    shra_r.ph         s5, s5, 8 +    shra_r.ph         s6, s6, 8 +    shra_r.ph         s7, s7, 8 + +    precr.qb.ph       t5, s0, s1 +    precr.qb.ph       t6, s2, s3 +    precr.qb.ph       t7, s4, s5 +    precr.qb.ph       t8, s6, s7 +    addu_s.qb         t5, t1, t5 +    addu_s.qb         t6, t2, t6 +    addu_s.qb         t7, t3, t7 +    addu_s.qb         t8, t4, t8 + +    sw                t5, 0(a0) +    sw                t6, 4(a0) +    sw                t7, 8(a0) +    sw                t8, 12(a0) +    b                 1b +     addiu            a0, a0, 16 + +2: +    beqz              a2, 4f +     nop +3: +    lw                t1, 0(a0) + +    not               t2, t1 +    srl               t2, t2, 24 +    replv.ph          t2, t2 + +    muleu_s.ph.qbl    t4, a1, t2 +    muleu_s.ph.qbr    t5, a1, t2 +    shra_r.ph         t6, t4, 8 +    shra_r.ph         t7, t5, 8 + +    and               t6,t6,t0 +    and               t7,t7,t0 + +    addq.ph           t8, t4, t6 +    addq.ph           t9, t5, t7 + +    shra_r.ph         t8, t8, 8 +    shra_r.ph         t9, t9, 8 + +    precr.qb.ph       t9, t8, t9 + +    addu_s.qb         t9, t1, t9 +    sw                t9, 0(a0) + +    addiu             a2, a2, -1 +    bnez              a2, 3b +     addiu            a0, a0, 4 +4: +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7 +5: +    j                 ra +     nop + +END(pixman_composite_over_reverse_n_8888_asm_mips) + +LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips) +/* + * a0 - dst  (a8) + * a1 - src  (32bit constant) + * a2 - w + */ + +    li                t9, 0x00ff00ff +    beqz              a2, 3f +     nop +    srl               t7, a2, 2   /* t7 = how many multiples of 4 dst pixels */ +    beqz              t7, 1f      /* branch if less than 4 src pixels */ +     nop + +    srl               t8, a1, 24 +    replv.ph          t8, t8 + +0: +    beqz              t7, 1f +     addiu            t7, t7, -1 +    lbu               t0, 0(a0) +    lbu               t1, 1(a0) +    lbu               t2, 2(a0) +    lbu               t3, 3(a0) + +    precr_sra.ph.w    t1, t0, 0 +    precr_sra.ph.w    t3, t2, 0 +    precr.qb.ph       t0, t3, t1 + +    muleu_s.ph.qbl    t2, t0, t8 +    muleu_s.ph.qbr    t3, t0, t8 +    shra_r.ph         t4, t2, 8 +    shra_r.ph         t5, t3, 8 +    and               t4, t4, t9 +    and               t5, t5, t9 +    addq.ph           t2, t2, t4 +    addq.ph           t3, t3, t5 +    shra_r.ph         t2, t2, 8 +    shra_r.ph         t3, t3, 8 +    precr.qb.ph       t2, t2, t3 + +    sb                t2, 0(a0) +    srl               t2, t2, 8 +    sb                t2, 1(a0) +    srl               t2, t2, 8 +    sb                t2, 2(a0) +    srl               t2, t2, 8 +    sb                t2, 3(a0) +    addiu             a2, a2, -4 +    b                 0b +     addiu            a0, a0, 4 + +1: +    beqz              a2, 3f +     nop +    srl               t8, a1, 24 +2: +    lbu               t0, 0(a0) + +    mul               t2, t0, t8 +    shra_r.ph         t3, t2, 8 +    andi              t3, t3, 0x00ff +    addq.ph           t2, t2, t3 +    shra_r.ph         t2, t2, 8 + +    sb                t2, 0(a0) +    addiu             a2, a2, -1 +    bnez              a2, 2b +     addiu            a0, a0, 1 + +3: +    j                 ra +     nop + +END(pixman_composite_in_n_8_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips) +/* + * a0     - dst  (a8r8g8b8) + * a1     
- src  (a8r8g8b8) + * a2     - w + * a3     - vx + * 16(sp) - unit_x + */ + +    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3 +    lw       t8, 16(sp) /* t8 = unit_x */ +    li       t6, 0x00ff00ff +    beqz     a2, 3f +     nop +    addiu    t1, a2, -1 +    beqz     t1, 2f +     nop +1: +    sra      t0, a3, 16 /* t0 = vx >> 16 */ +    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */ +    addu     t0, a1, t0 +    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */ +    addu     a3, a3, t8 /* a3 = vx + unit_x */ + +    sra      t1, a3, 16 /* t0 = vx >> 16 */ +    sll      t1, t1, 2  /* t0 = t0 * 4 (a8r8g8b8) */ +    addu     t1, a1, t1 +    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */ +    addu     a3, a3, t8 /* a3 = vx + unit_x */ + +    lw       t2, 0(a0)  /* t2 = destination (a8r8g8b8) */ +    lw       t3, 4(a0)  /* t3 = destination (a8r8g8b8) */ + +    OVER_2x8888_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t9, s0, s1, s2, s3 + +    sw       t4, 0(a0) +    sw       t5, 4(a0) +    addiu    a2, a2, -2 +    addiu    t1, a2, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 8 +2: +    beqz     a2, 3f +     nop +    sra      t0, a3, 16 /* t0 = vx >> 16 */ +    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */ +    addu     t0, a1, t0 +    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */ +    lw       t1, 0(a0)  /* t1 = destination (a8r8g8b8) */ +    addu     a3, a3, t8 /* a3 = vx + unit_x */ + +    OVER_8888_8888 t0, t1, t2, t6, t4, t5, t3, t7 + +    sw       t2, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3 +    j        ra +     nop + +END(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips) +/* + * a0     - dst  (r5g6b5) + * a1     - src  (a8r8g8b8) + * a2     - w + * a3     - vx + * 16(sp) - unit_x + */ + +    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, v0, v1 +    lw       t8, 40(sp) /* t8 = unit_x */ +    li       t4, 0x00ff00ff +    li       t5, 0xf800f800 +    li       t6, 0x07e007e0 +    li       t7, 0x001F001F +    beqz     a2, 3f +     nop +    addiu    t1, a2, -1 +    beqz     t1, 2f +     nop +1: +    sra      t0, a3, 16 /* t0 = vx >> 16 */ +    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */ +    addu     t0, a1, t0 +    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */ +    addu     a3, a3, t8 /* a3 = vx + unit_x */ +    sra      t1, a3, 16 /* t0 = vx >> 16 */ +    sll      t1, t1, 2  /* t0 = t0 * 4 (a8r8g8b8) */ +    addu     t1, a1, t1 +    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */ +    addu     a3, a3, t8 /* a3 = vx + unit_x */ +    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */ +    lhu      t3, 2(a0)  /* t3 = destination (r5g6b5) */ + +    CONVERT_2x0565_TO_2x8888 t2, t3, v0, v1, t6, t7, s0, s1, s2, s3 +    OVER_2x8888_2x8888       t0, t1, v0, v1, t2, t3, t4, t9, s0, s1, s2, s3, s4 +    CONVERT_2x8888_TO_2x0565 t2, t3, v0, v1, t5, t6, t7, t9, s2 + +    sh       v0, 0(a0) +    sh       v1, 2(a0) +    addiu    a2, a2, -2 +    addiu    t1, a2, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 4 +2: +    beqz     a2, 3f +     nop +    sra      t0, a3, 16 /* t0 = vx >> 16 */ +    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */ +    addu     t0, a1, t0 +    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */ +    lhu      t1, 0(a0)  /* t1 = destination (r5g6b5) */ +    addu     a3, a3, t8 /* a3 = vx + unit_x */ + +    CONVERT_1x0565_TO_1x8888 t1, t2, t5, t6 +    OVER_8888_8888           t0, t2, t1, t4, t3, t5, t6, t7 +    CONVERT_1x8888_TO_1x0565 t1, t2, t5, t6 + +    sh  
     t2, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, v0, v1 +    j        ra +     nop + +END(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips) +/* + * a0     - dst (a8r8g8b8) + * a1     - src (r5g6b5) + * a2     - w + * a3     - vx + * 16(sp) - unit_x + */ + +    SAVE_REGS_ON_STACK 0, v0 +    beqz     a2, 3f +     nop + +    lw       v0, 16(sp) /* v0 = unit_x */ +    addiu    t1, a2, -1 +    beqz     t1, 2f +     nop + +    li       t4, 0x07e007e0 +    li       t5, 0x001F001F +1: +    sra      t0, a3, 16 /* t0 = vx >> 16 */ +    sll      t0, t0, 1  /* t0 = t0 * 2 ((r5g6b5)) */ +    addu     t0, a1, t0 +    lhu      t0, 0(t0)  /* t0 = source ((r5g6b5)) */ +    addu     a3, a3, v0 /* a3 = vx + unit_x */ +    sra      t1, a3, 16 /* t1 = vx >> 16 */ +    sll      t1, t1, 1  /* t1 = t1 * 2 ((r5g6b5)) */ +    addu     t1, a1, t1 +    lhu      t1, 0(t1)  /* t1 = source ((r5g6b5)) */ +    addu     a3, a3, v0 /* a3 = vx + unit_x */ +    addiu    a2, a2, -2 + +    CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9 + +    sw       t2, 0(a0) +    sw       t3, 4(a0) + +    addiu    t2, a2, -1 +    bgtz     t2, 1b +     addiu   a0, a0, 8 +2: +    beqz     a2, 3f +     nop +    sra      t0, a3, 16 /* t0 = vx >> 16 */ +    sll      t0, t0, 1  /* t0 = t0 * 2 ((r5g6b5)) */ +    addu     t0, a1, t0 +    lhu      t0, 0(t0)  /* t0 = source ((r5g6b5)) */ + +    CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3 + +    sw       t1, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 0, v0 +    j        ra +     nop + +END(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips) +/* + * a0     - dst  (r5g6b5) + * a1     - src  (a8r8g8b8) + * a2     - mask (a8) + * a3     - w + * 16(sp) - vx + * 20(sp) - unit_x + */ +    beqz     a3, 4f +     nop + +    SAVE_REGS_ON_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5 +    lw       v0, 36(sp) /* v0 = vx */ +    lw       v1, 40(sp) /* v1 = unit_x */ +    li       t6, 0x00ff00ff +    li       t7, 0xf800f800 +    li       t8, 0x07e007e0 +    li       t9, 0x001F001F + +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    sra      t0, v0, 16 /* t0 = vx >> 16 */ +    sll      t0, t0, 2  /* t0 = t0 * 4      (a8r8g8b8) */ +    addu     t0, a1, t0 +    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */ +    addu     v0, v0, v1 /* v0 = vx + unit_x */ +    sra      t1, v0, 16 /* t1 = vx >> 16 */ +    sll      t1, t1, 2  /* t1 = t1 * 4      (a8r8g8b8) */ +    addu     t1, a1, t1 +    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */ +    addu     v0, v0, v1 /* v0 = vx + unit_x */ +    lbu      t2, 0(a2)  /* t2 = mask        (a8) */ +    lbu      t3, 1(a2)  /* t3 = mask        (a8) */ +    lhu      t4, 0(a0)  /* t4 = destination (r5g6b5) */ +    lhu      t5, 2(a0)  /* t5 = destination (r5g6b5) */ +    addiu    a2, a2, 2 + +    CONVERT_2x0565_TO_2x8888 t4, t5, s0, s1, t8, t9, s2, s3, s4, s5 +    OVER_2x8888_2x8_2x8888   t0, t1, \ +                             t2, t3, \ +                             s0, s1, \ +                             t4, t5, \ +                             t6, s2, s3, s4, s5, t2, t3 +    CONVERT_2x8888_TO_2x0565 t4, t5, s0, s1, t7, t8, t9, s2, s3 + +    sh       s0, 0(a0) +    sh       s1, 2(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 4 +2: +    beqz     a3, 3f +     nop +    sra      t0, v0, 16 /* t0 = vx >> 16 */ +    sll      t0, t0, 2 
 /* t0 = t0 * 4      (a8r8g8b8) */ +    addu     t0, a1, t0 +    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */ +    lbu      t1, 0(a2)  /* t1 = mask        (a8) */ +    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */ + +    CONVERT_1x0565_TO_1x8888 t2, t3, t4, t5 +    OVER_8888_8_8888         t0, t1, t3, t2, t6, t4, t5, t7, t8 +    CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5 + +    sh       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5 +4: +    j        ra +     nop + +END(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_mips) +/* + * a0     - dst  (r5g6b5) + * a1     - src  (r5g6b5) + * a2     - mask (a8) + * a3     - w + * 16(sp) - vx + * 20(sp) - unit_x + */ + +    beqz     a3, 4f +     nop +    SAVE_REGS_ON_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5 +    lw       v0, 36(sp) /* v0 = vx */ +    lw       v1, 40(sp) /* v1 = unit_x */ +    li       t4, 0xf800f800 +    li       t5, 0x07e007e0 +    li       t6, 0x001F001F +    li       t7, 0x00ff00ff + +    addiu    t1, a3, -1 +    beqz     t1, 2f +     nop +1: +    sra      t0, v0, 16 /* t0 = vx >> 16 */ +    sll      t0, t0, 1  /* t0 = t0 * 2      (r5g6b5) */ +    addu     t0, a1, t0 +    lhu      t0, 0(t0)  /* t0 = source      (r5g6b5) */ +    addu     v0, v0, v1 /* v0 = vx + unit_x */ +    sra      t1, v0, 16 /* t1 = vx >> 16 */ +    sll      t1, t1, 1  /* t1 = t1 * 2      (r5g6b5) */ +    addu     t1, a1, t1 +    lhu      t1, 0(t1)  /* t1 = source      (r5g6b5) */ +    addu     v0, v0, v1 /* v0 = vx + unit_x */ +    lbu      t2, 0(a2)  /* t2 = mask        (a8) */ +    lbu      t3, 1(a2)  /* t3 = mask        (a8) */ +    lhu      t8, 0(a0)  /* t8 = destination (r5g6b5) */ +    lhu      t9, 2(a0)  /* t9 = destination (r5g6b5) */ +    addiu    a2, a2, 2 + +    CONVERT_2x0565_TO_2x8888 t0, t1, s0, s1, t5, t6, s2, s3, s4, s5 +    CONVERT_2x0565_TO_2x8888 t8, t9, s2, s3, t5, t6, s4, s5, t0, t1 +    OVER_2x8888_2x8_2x8888   s0, s1, \ +                             t2, t3, \ +                             s2, s3, \ +                             t0, t1, \ +                             t7, t8, t9, s4, s5, s0, s1 +    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t4, t5, t6, s2, s3 + +    sh       s0, 0(a0) +    sh       s1, 2(a0) +    addiu    a3, a3, -2 +    addiu    t1, a3, -1 +    bgtz     t1, 1b +     addiu   a0, a0, 4 +2: +    beqz     a3, 3f +     nop +    sra      t0, v0, 16 /* t0 = vx >> 16 */ +    sll      t0, t0, 1  /* t0 = t0 * 2      (r5g6b5) */ +    addu     t0, a1, t0 + +    lhu      t0, 0(t0)  /* t0 = source      (r5g6b5) */ +    lbu      t1, 0(a2)  /* t1 = mask        (a8) */ +    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */ + +    CONVERT_1x0565_TO_1x8888 t0, t3, t4, t5 +    CONVERT_1x0565_TO_1x8888 t2, t4, t5, t6 +    OVER_8888_8_8888         t3, t1, t4, t0, t7, t2, t5, t6, t8 +    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5 + +    sh       t3, 0(a0) +3: +    RESTORE_REGS_FROM_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5 +4: +    j        ra +     nop + +END(pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips) +/* + * a0     - *dst + * a1     - *src_top + * a2     - *src_bottom + * a3     - w + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + */ + +    beqz     a3, 1f +     nop + +    SAVE_REGS_ON_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7 + +    lw       s0, 36(sp)     /* s0 = wt */ +    lw       s1, 40(sp)     /* s1 = wb 
*/ +    lw       s2, 44(sp)     /* s2 = vx */ +    lw       s3, 48(sp)     /* s3 = unit_x */ +    li       v0, BILINEAR_INTERPOLATION_RANGE + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi     t4, s2, 0xffff /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    subu     t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4     /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4     /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 2 +    addiu    t8, t9, 4 +    lwx      t0, t9(a1)     /* t0 = tl */ +    lwx      t1, t8(a1)     /* t1 = tr */ +    addiu    a3, a3, -1 +    lwx      t2, t9(a2)     /* t2 = bl */ +    lwx      t3, t8(a2)     /* t3 = br */ + +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 + +    addu     s2, s2, s3     /* vx += unit_x; */ +    sw       t0, 0(a0) +    bnez     a3, 0b +     addiu   a0, a0, 4 + +    RESTORE_REGS_FROM_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7 +1: +    j        ra +     nop + +END(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_mips) +/* + * a0     - *dst + * a1     - *src_top + * a2     - *src_bottom + * a3     - w + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + */ + +    beqz     a3, 1f +     nop + +    SAVE_REGS_ON_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7 + +    lw       s0, 36(sp)     /* s0 = wt */ +    lw       s1, 40(sp)     /* s1 = wb */ +    lw       s2, 44(sp)     /* s2 = vx */ +    lw       s3, 48(sp)     /* s3 = unit_x */ +    li       v0, BILINEAR_INTERPOLATION_RANGE + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi     t4, s2, 0xffff /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    subu     t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4     /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4     /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 2 +    addiu    t8, t9, 4 +    lwx      t0, t9(a1)     /* t0 = tl */ +    lwx      t1, t8(a1)     /* t1 = tr */ +    addiu    a3, a3, -1 +    lwx      t2, t9(a2)     /* t2 = bl */ +    lwx      t3, t8(a2)     /* t3 = br */ + +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 +    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 + +    addu     s2, s2, s3     /* vx += unit_x; */ +    sh       t1, 0(a0) +    bnez     a3, 0b +     addiu   a0, a0, 2 + +    RESTORE_REGS_FROM_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7 +1: +    j        ra +     nop + +END(pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8888_SRC_asm_mips) +/* + * a0     - *dst + * a1     - *src_top + * a2     - *src_bottom + * a3     - w + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + */ + +    beqz     a3, 1f +     nop + +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + +    lw       s0, 44(sp)     /* s0 = wt */ +    lw       s1, 48(sp)     /* s1 = wb */ +    lw       s2, 
52(sp)     /* s2 = vx */ +    lw       s3, 56(sp)     /* s3 = unit_x */ +    li       v0, BILINEAR_INTERPOLATION_RANGE +    li       v1, 0x07e007e0 +    li       s8, 0x001f001f + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi     t4, s2, 0xffff /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    subu     t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4     /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4     /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 1 +    addiu    t8, t9, 2 +    lhx      t0, t9(a1)     /* t0 = tl */ +    lhx      t1, t8(a1)     /* t1 = tr */ +    andi     t1, t1, 0xffff +    addiu    a3, a3, -1 +    lhx      t2, t9(a2)     /* t2 = bl */ +    lhx      t3, t8(a2)     /* t3 = br */ +    andi     t3, t3, 0xffff + +    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7 +    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 + +    addu     s2, s2, s3     /* vx += unit_x; */ +    sw       t0, 0(a0) +    bnez     a3, 0b +     addiu   a0, a0, 4 + +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +1: +    j        ra +     nop + +END(pixman_scaled_bilinear_scanline_0565_8888_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_mips) +/* + * a0     - *dst + * a1     - *src_top + * a2     - *src_bottom + * a3     - w + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + */ + +    beqz     a3, 1f +     nop + +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + +    lw       s0, 44(sp)     /* s0 = wt */ +    lw       s1, 48(sp)     /* s1 = wb */ +    lw       s2, 52(sp)     /* s2 = vx */ +    lw       s3, 56(sp)     /* s3 = unit_x */ +    li       v0, BILINEAR_INTERPOLATION_RANGE +    li       v1, 0x07e007e0 +    li       s8, 0x001f001f + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi     t4, s2, 0xffff /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    subu     t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4     /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4     /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 1 +    addiu    t8, t9, 2 +    lhx      t0, t9(a1)     /* t0 = tl */ +    lhx      t1, t8(a1)     /* t1 = tr */ +    andi     t1, t1, 0xffff +    addiu    a3, a3, -1 +    lhx      t2, t9(a2)     /* t2 = bl */ +    lhx      t3, t8(a2)     /* t3 = br */ +    andi     t3, t3, 0xffff + +    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7 +    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 +    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 + +    addu     s2, s2, s3     /* vx += unit_x; */ +    sh       t1, 0(a0) +    bnez     a3, 0b +     addiu   a0, a0, 2 + +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +1: +    j        ra +     nop 
+ +END(pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_mips) +/* + * a0     - *dst + * a1     - *src_top + * a2     - *src_bottom + * a3     - w + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + */ + +    beqz     a3, 1f +     nop + +    SAVE_REGS_ON_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8 + +    lw       s0, 40(sp)     /* s0 = wt */ +    lw       s1, 44(sp)     /* s1 = wb */ +    lw       s2, 48(sp)     /* s2 = vx */ +    lw       s3, 52(sp)     /* s3 = unit_x */ +    li       v0, BILINEAR_INTERPOLATION_RANGE +    li       s8, 0x00ff00ff + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi     t4, s2, 0xffff /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    subu     t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4     /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4     /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 2 +    addiu    t8, t9, 4 +    lwx      t0, t9(a1)     /* t0 = tl */ +    lwx      t1, t8(a1)     /* t1 = tr */ +    addiu    a3, a3, -1 +    lwx      t2, t9(a2)     /* t2 = bl */ +    lwx      t3, t8(a2)     /* t3 = br */ + +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 +    lw       t1, 0(a0)      /* t1 = dest */ +    OVER_8888_8888 t0, t1, t2, s8, t3, t4, t5, t6 + +    addu     s2, s2, s3     /* vx += unit_x; */ +    sw       t2, 0(a0) +    bnez     a3, 0b +     addiu   a0, a0, 4 + +    RESTORE_REGS_FROM_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8 +1: +    j        ra +     nop + +END(pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_mips) +/* + * a0     - *dst + * a1     - *src_top + * a2     - *src_bottom + * a3     - w + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + */ + +    beqz         a3, 1f +     nop + +    SAVE_REGS_ON_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7 + +    lw           s0, 36(sp)     /* s0 = wt */ +    lw           s1, 40(sp)     /* s1 = wb */ +    lw           s2, 44(sp)     /* s2 = vx */ +    lw           s3, 48(sp)     /* s3 = unit_x */ +    li           v0, BILINEAR_INTERPOLATION_RANGE + +    sll          s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll          s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi         t4, s2, 0xffff /* t4 = (short)vx */ +    srl          t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    subu         t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */ + +    mul          s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */ +    mul          s5, s0, t4     /* s5 = wt*(vx>>8) */ +    mul          s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */ +    mul          s7, s1, t4     /* s7 = wb*(vx>>8) */ + +    sra          t9, s2, 16 +    sll          t9, t9, 2 +    addiu        t8, t9, 4 +    lwx          t0, t9(a1)     /* t0 = tl */ +    lwx          t1, t8(a1)     /* t1 = tr */ +    addiu        a3, a3, -1 +    lwx          t2, t9(a2)     /* t2 = bl */ +    lwx          t3, t8(a2)     /* t3 = br */ + +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 +    lw           t1, 0(a0) +    addu_s.qb    t2, t0, t1 + +    addu     
    s2, s2, s3     /* vx += unit_x; */ +    sw           t2, 0(a0) +    bnez         a3, 0b +     addiu       a0, a0, 4 + +    RESTORE_REGS_FROM_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7 +1: +    j            ra +     nop + +END(pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips) +/* + * a0     - *dst + * a1     - *mask + * a2     - *src_top + * a3     - *src_bottom + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + +    lw       v1, 32(sp) +    beqz     v1, 1f +     nop + +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + +    lw       s0, 44(sp)        /* s0 = wt */ +    lw       s1, 48(sp)        /* s1 = wb */ +    lw       s2, 52(sp)        /* s2 = vx */ +    lw       s3, 56(sp)        /* s3 = unit_x */ +    li       v0, BILINEAR_INTERPOLATION_RANGE +    li       s8, 0x00ff00ff + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi     t4, s2, 0xffff    /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    subu     t5, v0, t4        /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 2 +    addiu    t8, t9, 4 +    lwx      t0, t9(a2)        /* t0 = tl */ +    lwx      t1, t8(a2)        /* t1 = tr */ +    addiu    v1, v1, -1 +    lwx      t2, t9(a3)        /* t2 = bl */ +    lwx      t3, t8(a3)        /* t3 = br */ + +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 +    lbu      t1, 0(a1)         /* t1 = mask */ +    addiu    a1, a1, 1 +    MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4 + +    addu     s2, s2, s3        /* vx += unit_x; */ +    sw       t0, 0(a0) +    bnez     v1, 0b +     addiu   a0, a0, 4 + +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +1: +    j        ra +     nop + +END(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips) +/* + * a0     - *dst + * a1     - *mask + * a2     - *src_top + * a3     - *src_bottom + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + +    lw       v1, 32(sp) +    beqz     v1, 1f +     nop + +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + +    lw       s0, 44(sp)        /* s0 = wt */ +    lw       s1, 48(sp)        /* s1 = wb */ +    lw       s2, 52(sp)        /* s2 = vx */ +    lw       s3, 56(sp)        /* s3 = unit_x */ +    li       v0, BILINEAR_INTERPOLATION_RANGE +    li       s8, 0x00ff00ff + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi     t4, s2, 0xffff    /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    subu     t5, v0, t4        /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 2 +    addiu    t8, 
t9, 4 +    lwx      t0, t9(a2)        /* t0 = tl */ +    lwx      t1, t8(a2)        /* t1 = tr */ +    addiu    v1, v1, -1 +    lwx      t2, t9(a3)        /* t2 = bl */ +    lwx      t3, t8(a3)        /* t3 = br */ + +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 +    lbu      t1, 0(a1)         /* t1 = mask */ +    addiu    a1, a1, 1 +    MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4 +    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 + +    addu     s2, s2, s3        /* vx += unit_x; */ +    sh       t1, 0(a0) +    bnez     v1, 0b +     addiu   a0, a0, 2 + +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +1: +    j        ra +     nop + +END(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips) +/* + * a0     - *dst + * a1     - *mask + * a2     - *src_top + * a3     - *src_bottom + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + +    lw       t0, 32(sp) +    beqz     t0, 1f +     nop + +    SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra + +    lw       s0, 48(sp)        /* s0 = wt */ +    lw       s1, 52(sp)        /* s1 = wb */ +    lw       s2, 56(sp)        /* s2 = vx */ +    lw       s3, 60(sp)        /* s3 = unit_x */ +    lw       ra, 64(sp)        /* ra = w */ +    li       v0, 0x00ff00ff +    li       v1, 0x07e007e0 +    li       s8, 0x001f001f + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi     t4, s2, 0xffff    /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    li       t5, BILINEAR_INTERPOLATION_RANGE +    subu     t5, t5, t4        /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 1 +    addiu    t8, t9, 2 +    lhx      t0, t9(a2)        /* t0 = tl */ +    lhx      t1, t8(a2)        /* t1 = tr */ +    andi     t1, t1, 0xffff +    addiu    ra, ra, -1 +    lhx      t2, t9(a3)        /* t2 = bl */ +    lhx      t3, t8(a3)        /* t3 = br */ +    andi     t3, t3, 0xffff + +    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7 +    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 +    lbu      t1, 0(a1)         /* t1 = mask */ +    addiu    a1, a1, 1 +    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4 + +    addu     s2, s2, s3        /* vx += unit_x; */ +    sw       t0, 0(a0) +    bnez     ra, 0b +     addiu   a0, a0, 4 + +    RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra +1: +    j        ra +     nop + +END(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips) +/* + * a0     - *dst + * a1     - *mask + * a2     - *src_top + * a3     - *src_bottom + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + +    lw       t0, 32(sp) +    beqz     t0, 1f +     nop + +    SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra + +    lw       s0, 48(sp)        /* s0 = wt */ +    lw       s1, 52(sp)        /* 
s1 = wb */ +    lw       s2, 56(sp)        /* s2 = vx */ +    lw       s3, 60(sp)        /* s3 = unit_x */ +    lw       ra, 64(sp)        /* ra = w */ +    li       v0, 0x00ff00ff +    li       v1, 0x07e007e0 +    li       s8, 0x001f001f + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi     t4, s2, 0xffff    /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    li       t5, BILINEAR_INTERPOLATION_RANGE +    subu     t5, t5, t4        /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 1 +    addiu    t8, t9, 2 +    lhx      t0, t9(a2)        /* t0 = tl */ +    lhx      t1, t8(a2)        /* t1 = tr */ +    andi     t1, t1, 0xffff +    addiu    ra, ra, -1 +    lhx      t2, t9(a3)        /* t2 = bl */ +    lhx      t3, t8(a3)        /* t3 = br */ +    andi     t3, t3, 0xffff + +    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7 +    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 +    lbu      t1, 0(a1)         /* t1 = mask */ +    addiu    a1, a1, 1 +    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4 +    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3 + +    addu     s2, s2, s3        /* vx += unit_x; */ +    sh       t1, 0(a0) +    bnez     ra, 0b +     addiu   a0, a0, 2 + +    RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra +1: +    j        ra +     nop + +END(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips) +/* + * a0     - dst        (a8r8g8b8) + * a1     - mask       (a8) + * a2     - src_top    (a8r8g8b8) + * a3     - src_bottom (a8r8g8b8) + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + +    lw       v1, 60(sp)        /* v1 = w(sp + 32 + 28 save regs stack offset)*/ +    beqz     v1, 1f +     nop + +    lw       s0, 44(sp)        /* s0 = wt */ +    lw       s1, 48(sp)        /* s1 = wb */ +    lw       s2, 52(sp)        /* s2 = vx */ +    lw       s3, 56(sp)        /* s3 = unit_x */ +    li       v0, BILINEAR_INTERPOLATION_RANGE +    li       s8, 0x00ff00ff + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) + +0: +    andi     t4, s2, 0xffff    /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    subu     t5, v0, t4        /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 2 +    addiu    t8, t9, 4 +    lwx      t0, t9(a2)        /* t0 = tl */ +    lwx      t1, t8(a2)        /* t1 = tr */ +    addiu    v1, v1, -1 +    lwx      t2, t9(a3)        /* t2 = bl */ +    lwx      t3, t8(a3)        /* t3 = br */ + +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, \ +                       
               t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 +    lbu      t1, 0(a1)         /* t1 = mask */ +    lw       t2, 0(a0)         /* t2 = dst */ +    addiu    a1, a1, 1 +    OVER_8888_8_8888 t0, t1, t2, t0, s8, t3, t4, t5, t6 + +    addu     s2, s2, s3        /* vx += unit_x; */ +    sw       t0, 0(a0) +    bnez     v1, 0b +     addiu   a0, a0, 4 + +1: +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +    j        ra +     nop + +END(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips) + +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips) +/* + * a0     - *dst + * a1     - *mask + * a2     - *src_top + * a3     - *src_bottom + * 16(sp) - wt + * 20(sp) - wb + * 24(sp) - vx + * 28(sp) - unit_x + * 32(sp) - w + */ + +    lw       v1, 32(sp) +    beqz     v1, 1f +     nop + +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + +    lw       s0, 44(sp)        /* s0 = wt */ +    lw       s1, 48(sp)        /* s1 = wb */ +    lw       s2, 52(sp)        /* s2 = vx */ +    lw       s3, 56(sp)        /* s3 = unit_x */ +    li       v0, BILINEAR_INTERPOLATION_RANGE +    li       s8, 0x00ff00ff + +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) +0: +    andi     t4, s2, 0xffff    /* t4 = (short)vx */ +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */ +    subu     t5, v0, t4        /* t5 = ( 256 - (vx>>8)) */ + +    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */ +    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */ +    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */ +    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */ + +    sra      t9, s2, 16 +    sll      t9, t9, 2 +    addiu    t8, t9, 4 +    lwx      t0, t9(a2)        /* t0 = tl */ +    lwx      t1, t8(a2)        /* t1 = tr */ +    addiu    v1, v1, -1 +    lwx      t2, t9(a3)        /* t2 = bl */ +    lwx      t3, t8(a3)        /* t3 = br */ + +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7 +    lbu      t1, 0(a1)         /* t1 = mask */ +    lw       t2, 0(a0)         /* t2 = dst */ +    addiu    a1, a1, 1 +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t0, s8, t3, t4, t5 + +    addu     s2, s2, s3        /* vx += unit_x; */ +    sw       t0, 0(a0) +    bnez     v1, 0b +     addiu   a0, a0, 4 + +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8 +1: +    j        ra +     nop + +END(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips) diff --git a/libs/pixman-0.40.0/pixman/pixman-mips-dspr2-asm.h b/libs/pixman-0.40.0/pixman/pixman-mips-dspr2-asm.h new file mode 100644 index 0000000..e238566 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-mips-dspr2-asm.h @@ -0,0 +1,711 @@ +/* + * Copyright (c) 2012 + *      MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the MIPS Technologies, Inc., nor the names of its + *    contributors may be used to endorse or promote products derived from + *    this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author:  Nemanja Lukic (nemanja.lukic@rt-rk.com) + */ + +#ifndef PIXMAN_MIPS_DSPR2_ASM_H +#define PIXMAN_MIPS_DSPR2_ASM_H + +#define zero $0 +#define AT   $1 +#define v0   $2 +#define v1   $3 +#define a0   $4 +#define a1   $5 +#define a2   $6 +#define a3   $7 +#define t0   $8 +#define t1   $9 +#define t2   $10 +#define t3   $11 +#define t4   $12 +#define t5   $13 +#define t6   $14 +#define t7   $15 +#define s0   $16 +#define s1   $17 +#define s2   $18 +#define s3   $19 +#define s4   $20 +#define s5   $21 +#define s6   $22 +#define s7   $23 +#define t8   $24 +#define t9   $25 +#define k0   $26 +#define k1   $27 +#define gp   $28 +#define sp   $29 +#define fp   $30 +#define s8   $30 +#define ra   $31 + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 + */ +#define LEAF_MIPS32R2(symbol)                           \ +                .globl  symbol;                         \ +                .align  2;                              \ +                .hidden symbol;                         \ +                .type   symbol, @function;              \ +                .ent    symbol, 0;                      \ +symbol:         .frame  sp, 0, ra;                      \ +                .set    push;                           \ +                .set    arch=mips32r2;                  \ +                .set    noreorder;                      \ +                .set    noat; + +/* + * LEAF_MIPS32R2 - declare leaf routine for MIPS DSPr2 + */ +#define LEAF_MIPS_DSPR2(symbol)                         \ +LEAF_MIPS32R2(symbol)                                   \ +                .set    dspr2; + +/* + * END - mark end of function + */ +#define END(function)                                   \ +                .set    pop;                            \ +                .end    function;                       \ +                .size   function,.-function + +/* + * Checks if stack offset is big enough for storing/restoring regs_num + * number of register to/from stack. Stack offset must be greater than + * or equal to the number of bytes needed for storing registers (regs_num*4). + * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is + * preserved for input arguments of the functions, already stored in a0-a3), + * stack size can be further optimized by utilizing this space. + */ +.macro CHECK_STACK_OFFSET regs_num, stack_offset +.if \stack_offset < \regs_num * 4 - 16 +.error "Stack offset too small." +.endif +.endm + +/* + * Saves set of registers on stack. 
Maximum number of registers that
+ * can be saved on stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is the number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be a multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+                          r2  = 0, r3  = 0, r4  = 0, \
+                          r5  = 0, r6  = 0, r7  = 0, \
+                          r8  = 0, r9  = 0, r10 = 0, \
+                          r11 = 0, r12 = 0, r13 = 0, \
+                          r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be positive and a multiple of 4."
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, -\stack_offset
+    .endif
+    sw              \r1, 0(sp)
+    .if \r2 != 0
+    sw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    sw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    sw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    sw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    sw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    sw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    sw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    sw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    sw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    sw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    sw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    sw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    sw              \r14, 52(sp)
+    .endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is the number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be a multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with SAVE_REGS_ON_STACK macro.
+ * Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+                               r2  = 0, r3  = 0, r4  = 0, \
+                               r5  = 0, r6  = 0, r7  = 0, \
+                               r8  = 0, r9  = 0, r10 = 0, \
+                               r11 = 0, r12 = 0, r13 = 0, \
+                               r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
+    .error "Stack offset must be positive and a multiple of 4."
+    .endif +    lw              \r1, 0(sp) +    .if \r2 != 0 +    lw              \r2, 4(sp) +    .endif +    .if \r3 != 0 +    lw              \r3, 8(sp) +    .endif +    .if \r4 != 0 +    lw              \r4, 12(sp) +    .endif +    .if \r5 != 0 +    CHECK_STACK_OFFSET 5, \stack_offset +    lw              \r5, 16(sp) +    .endif +    .if \r6 != 0 +    CHECK_STACK_OFFSET 6, \stack_offset +    lw              \r6, 20(sp) +    .endif +    .if \r7 != 0 +    CHECK_STACK_OFFSET 7, \stack_offset +    lw              \r7, 24(sp) +    .endif +    .if \r8 != 0 +    CHECK_STACK_OFFSET 8, \stack_offset +    lw              \r8, 28(sp) +    .endif +    .if \r9 != 0 +    CHECK_STACK_OFFSET 9, \stack_offset +    lw              \r9, 32(sp) +    .endif +    .if \r10 != 0 +    CHECK_STACK_OFFSET 10, \stack_offset +    lw              \r10, 36(sp) +    .endif +    .if \r11 != 0 +    CHECK_STACK_OFFSET 11, \stack_offset +    lw              \r11, 40(sp) +    .endif +    .if \r12 != 0 +    CHECK_STACK_OFFSET 12, \stack_offset +    lw              \r12, 44(sp) +    .endif +    .if \r13 != 0 +    CHECK_STACK_OFFSET 13, \stack_offset +    lw              \r13, 48(sp) +    .endif +    .if \r14 != 0 +    CHECK_STACK_OFFSET 14, \stack_offset +    lw              \r14, 52(sp) +    .endif +    .if \stack_offset != 0 +    addiu           sp, sp, \stack_offset +    .endif +.endm + +/* + * Conversion of single r5g6b5 pixel (in_565) to single a8r8g8b8 pixel + * returned in (out_8888) register. Requires two temporary registers + * (scratch1 and scratch2). + */ +.macro CONVERT_1x0565_TO_1x8888 in_565,   \ +                                out_8888, \ +                                scratch1, scratch2 +    lui     \out_8888, 0xff00 +    sll     \scratch1, \in_565,   0x3 +    andi    \scratch2, \scratch1, 0xff +    ext     \scratch1, \in_565,   0x2, 0x3 +    or      \scratch1, \scratch2, \scratch1 +    or      \out_8888, \out_8888, \scratch1 + +    sll     \scratch1, \in_565,   0x5 +    andi    \scratch1, \scratch1, 0xfc00 +    srl     \scratch2, \in_565,   0x1 +    andi    \scratch2, \scratch2, 0x300 +    or      \scratch2, \scratch1, \scratch2 +    or      \out_8888, \out_8888, \scratch2 + +    andi    \scratch1, \in_565,   0xf800 +    srl     \scratch2, \scratch1, 0x5 +    andi    \scratch2, \scratch2, 0xff00 +    or      \scratch1, \scratch1, \scratch2 +    sll     \scratch1, \scratch1, 0x8 +    or      \out_8888, \out_8888, \scratch1 +.endm + +/* + * Conversion of two r5g6b5 pixels (in1_565 and in2_565) to two a8r8g8b8 pixels + * returned in (out1_8888 and out2_8888) registers. Requires four scratch + * registers (scratch1 ... scratch4). It also requires maskG and maskB for + * color component extractions. 
These masks must have following values: + *   li       maskG, 0x07e007e0 + *   li       maskB, 0x001F001F + */ +.macro CONVERT_2x0565_TO_2x8888 in1_565, in2_565,     \ +                                out1_8888, out2_8888, \ +                                maskG, maskB,         \ +                                scratch1, scratch2, scratch3, scratch4 +    sll               \scratch1,  \in1_565,   16 +    or                \scratch1,  \scratch1,  \in2_565 +    lui               \out2_8888, 0xff00 +    ori               \out2_8888, \out2_8888, 0xff00 +    shrl.ph           \scratch2,  \scratch1,  11 +    and               \scratch3,  \scratch1,  \maskG +    shra.ph           \scratch4,  \scratch2,  2 +    shll.ph           \scratch2,  \scratch2,  3 +    shll.ph           \scratch3,  \scratch3,  5 +    or                \scratch2,  \scratch2,  \scratch4 +    shrl.qb           \scratch4,  \scratch3,  6 +    or                \out2_8888, \out2_8888, \scratch2 +    or                \scratch3,  \scratch3,  \scratch4 +    and               \scratch1,  \scratch1,  \maskB +    shll.ph           \scratch2,  \scratch1,  3 +    shra.ph           \scratch4,  \scratch1,  2 +    or                \scratch2,  \scratch2,  \scratch4 +    or                \scratch3,  \scratch2,  \scratch3 +    precrq.ph.w       \out1_8888, \out2_8888, \scratch3 +    precr_sra.ph.w    \out2_8888, \scratch3,  0 +.endm + +/* + * Conversion of single a8r8g8b8 pixel (in_8888) to single r5g6b5 pixel + * returned in (out_565) register. Requires two temporary registers + * (scratch1 and scratch2). + */ +.macro CONVERT_1x8888_TO_1x0565 in_8888, \ +                                out_565, \ +                                scratch1, scratch2 +    ext     \out_565,  \in_8888,  0x3, 0x5 +    srl     \scratch1, \in_8888,  0x5 +    andi    \scratch1, \scratch1, 0x07e0 +    srl     \scratch2, \in_8888,  0x8 +    andi    \scratch2, \scratch2, 0xf800 +    or      \out_565,  \out_565,  \scratch1 +    or      \out_565,  \out_565,  \scratch2 +.endm + +/* + * Conversion of two a8r8g8b8 pixels (in1_8888 and in2_8888) to two r5g6b5 + * pixels returned in (out1_565 and out2_565) registers. Requires two temporary + * registers (scratch1 and scratch2). It also requires maskR, maskG and maskB + * for color component extractions. These masks must have following values: + *   li       maskR, 0xf800f800 + *   li       maskG, 0x07e007e0 + *   li       maskB, 0x001F001F + * Value of input register in2_8888 is lost. + */ +.macro CONVERT_2x8888_TO_2x0565 in1_8888, in2_8888,  \ +                                out1_565, out2_565,  \ +                                maskR, maskG, maskB, \ +                                scratch1, scratch2 +    precr.qb.ph    \scratch1, \in2_8888, \in1_8888 +    precrq.qb.ph   \in2_8888, \in2_8888, \in1_8888 +    and            \out1_565, \scratch1, \maskR +    shrl.ph        \scratch1, \scratch1, 3 +    shll.ph        \in2_8888, \in2_8888, 3 +    and            \scratch1, \scratch1, \maskB +    or             \out1_565, \out1_565, \scratch1 +    and            \in2_8888, \in2_8888, \maskG +    or             \out1_565, \out1_565, \in2_8888 +    srl            \out2_565, \out1_565, 16 +.endm + +/* + * Multiply pixel (a8) with single pixel (a8r8g8b8). It requires maskLSR needed + * for rounding process. 
maskLSR must have following value: + *   li       maskLSR, 0x00ff00ff + */ +.macro MIPS_UN8x4_MUL_UN8 s_8888,  \ +                          m_8,     \ +                          d_8888,  \ +                          maskLSR, \ +                          scratch1, scratch2, scratch3 +    replv.ph          \m_8,      \m_8                 /*   0 | M | 0 | M */ +    muleu_s.ph.qbl    \scratch1, \s_8888,   \m_8      /*    A*M  |  R*M */ +    muleu_s.ph.qbr    \scratch2, \s_8888,   \m_8      /*    G*M  |  B*M */ +    shra_r.ph         \scratch3, \scratch1, 8 +    shra_r.ph         \d_8888,   \scratch2, 8 +    and               \scratch3, \scratch3, \maskLSR  /*   0 |A*M| 0 |R*M */ +    and               \d_8888,   \d_8888,   \maskLSR  /*   0 |G*M| 0 |B*M */ +    addq.ph           \scratch1, \scratch1, \scratch3 /* A*M+A*M | R*M+R*M */ +    addq.ph           \scratch2, \scratch2, \d_8888   /* G*M+G*M | B*M+B*M */ +    shra_r.ph         \scratch1, \scratch1, 8 +    shra_r.ph         \scratch2, \scratch2, 8 +    precr.qb.ph       \d_8888,   \scratch1, \scratch2 +.endm + +/* + * Multiply two pixels (a8) with two pixels (a8r8g8b8). It requires maskLSR + * needed for rounding process. maskLSR must have following value: + *   li       maskLSR, 0x00ff00ff + */ +.macro MIPS_2xUN8x4_MUL_2xUN8 s1_8888, \ +                              s2_8888, \ +                              m1_8,    \ +                              m2_8,    \ +                              d1_8888, \ +                              d2_8888, \ +                              maskLSR, \ +                              scratch1, scratch2, scratch3, \ +                              scratch4, scratch5, scratch6 +    replv.ph          \m1_8,     \m1_8                /*  0 | M1 | 0 | M1 */ +    replv.ph          \m2_8,     \m2_8                /*  0 | M2 | 0 | M2 */ +    muleu_s.ph.qbl    \scratch1, \s1_8888,  \m1_8     /*  A1*M1  |  R1*M1 */ +    muleu_s.ph.qbr    \scratch2, \s1_8888,  \m1_8     /*  G1*M1  |  B1*M1 */ +    muleu_s.ph.qbl    \scratch3, \s2_8888,  \m2_8     /*  A2*M2  |  R2*M2 */ +    muleu_s.ph.qbr    \scratch4, \s2_8888,  \m2_8     /*  G2*M2  |  B2*M2 */ +    shra_r.ph         \scratch5, \scratch1, 8 +    shra_r.ph         \d1_8888,  \scratch2, 8 +    shra_r.ph         \scratch6, \scratch3, 8 +    shra_r.ph         \d2_8888,  \scratch4, 8 +    and               \scratch5, \scratch5, \maskLSR  /* 0 |A1*M1| 0 |R1*M1 */ +    and               \d1_8888,  \d1_8888,  \maskLSR  /* 0 |G1*M1| 0 |B1*M1 */ +    and               \scratch6, \scratch6, \maskLSR  /* 0 |A2*M2| 0 |R2*M2 */ +    and               \d2_8888,  \d2_8888,  \maskLSR  /* 0 |G2*M2| 0 |B2*M2 */ +    addq.ph           \scratch1, \scratch1, \scratch5 +    addq.ph           \scratch2, \scratch2, \d1_8888 +    addq.ph           \scratch3, \scratch3, \scratch6 +    addq.ph           \scratch4, \scratch4, \d2_8888 +    shra_r.ph         \scratch1, \scratch1, 8 +    shra_r.ph         \scratch2, \scratch2, 8 +    shra_r.ph         \scratch3, \scratch3, 8 +    shra_r.ph         \scratch4, \scratch4, 8 +    precr.qb.ph       \d1_8888,  \scratch1, \scratch2 +    precr.qb.ph       \d2_8888,  \scratch3, \scratch4 +.endm + +/* + * Multiply pixel (a8r8g8b8) with single pixel (a8r8g8b8). It requires maskLSR + * needed for rounding process. 
maskLSR must have following value: + *   li       maskLSR, 0x00ff00ff + */ +.macro MIPS_UN8x4_MUL_UN8x4 s_8888,  \ +                            m_8888,  \ +                            d_8888,  \ +                            maskLSR, \ +                            scratch1, scratch2, scratch3, scratch4 +    preceu.ph.qbl     \scratch1, \m_8888              /*   0 | A | 0 | R */ +    preceu.ph.qbr     \scratch2, \m_8888              /*   0 | G | 0 | B */ +    muleu_s.ph.qbl    \scratch3, \s_8888,   \scratch1 /*    A*A  |  R*R */ +    muleu_s.ph.qbr    \scratch4, \s_8888,   \scratch2 /*    G*G  |  B*B */ +    shra_r.ph         \scratch1, \scratch3, 8 +    shra_r.ph         \scratch2, \scratch4, 8 +    and               \scratch1, \scratch1, \maskLSR  /*   0 |A*A| 0 |R*R */ +    and               \scratch2, \scratch2, \maskLSR  /*   0 |G*G| 0 |B*B */ +    addq.ph           \scratch1, \scratch1, \scratch3 +    addq.ph           \scratch2, \scratch2, \scratch4 +    shra_r.ph         \scratch1, \scratch1, 8 +    shra_r.ph         \scratch2, \scratch2, 8 +    precr.qb.ph       \d_8888,   \scratch1, \scratch2 +.endm + +/* + * Multiply two pixels (a8r8g8b8) with two pixels (a8r8g8b8). It requires + * maskLSR needed for rounding process. maskLSR must have following value: + *   li       maskLSR, 0x00ff00ff + */ + +.macro MIPS_2xUN8x4_MUL_2xUN8x4 s1_8888,  \ +                                s2_8888,  \ +                                m1_8888,  \ +                                m2_8888,  \ +                                d1_8888,  \ +                                d2_8888,  \ +                                maskLSR,  \ +                                scratch1, scratch2, scratch3, \ +                                scratch4, scratch5, scratch6 +    preceu.ph.qbl     \scratch1, \m1_8888             /*   0 | A | 0 | R */ +    preceu.ph.qbr     \scratch2, \m1_8888             /*   0 | G | 0 | B */ +    preceu.ph.qbl     \scratch3, \m2_8888             /*   0 | A | 0 | R */ +    preceu.ph.qbr     \scratch4, \m2_8888             /*   0 | G | 0 | B */ +    muleu_s.ph.qbl    \scratch5, \s1_8888,  \scratch1 /*    A*A  |  R*R */ +    muleu_s.ph.qbr    \scratch6, \s1_8888,  \scratch2 /*    G*G  |  B*B */ +    muleu_s.ph.qbl    \scratch1, \s2_8888,  \scratch3 /*    A*A  |  R*R */ +    muleu_s.ph.qbr    \scratch2, \s2_8888,  \scratch4 /*    G*G  |  B*B */ +    shra_r.ph         \scratch3, \scratch5, 8 +    shra_r.ph         \scratch4, \scratch6, 8 +    shra_r.ph         \d1_8888,  \scratch1, 8 +    shra_r.ph         \d2_8888,  \scratch2, 8 +    and               \scratch3, \scratch3, \maskLSR  /*   0 |A*A| 0 |R*R */ +    and               \scratch4, \scratch4, \maskLSR  /*   0 |G*G| 0 |B*B */ +    and               \d1_8888,  \d1_8888,  \maskLSR  /*   0 |A*A| 0 |R*R */ +    and               \d2_8888,  \d2_8888,  \maskLSR  /*   0 |G*G| 0 |B*B */ +    addq.ph           \scratch3, \scratch3, \scratch5 +    addq.ph           \scratch4, \scratch4, \scratch6 +    addq.ph           \d1_8888,  \d1_8888,  \scratch1 +    addq.ph           \d2_8888,  \d2_8888,  \scratch2 +    shra_r.ph         \scratch3, \scratch3, 8 +    shra_r.ph         \scratch4, \scratch4, 8 +    shra_r.ph         \scratch5, \d1_8888,  8 +    shra_r.ph         \scratch6, \d2_8888,  8 +    precr.qb.ph       \d1_8888,  \scratch3, \scratch4 +    precr.qb.ph       \d2_8888,  \scratch5, \scratch6 +.endm + +/* + * OVER operation on single a8r8g8b8 source pixel (s_8888) and single a8r8g8b8 + * destination pixel (d_8888) using a8 mask (m_8). 
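+ * In scalar terms: s' = s * m / 255 per channel (rounded), then
+ * out = s' + d * (255 - alpha(s')) / 255, with the final addition saturating
+ * per byte.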
It also requires maskLSR + * needed for rounding process. maskLSR must have following value: + *   li       maskLSR, 0x00ff00ff + */ +.macro OVER_8888_8_8888 s_8888,   \ +                        m_8,      \ +                        d_8888,   \ +                        out_8888, \ +                        maskLSR,  \ +                        scratch1, scratch2, scratch3, scratch4 +    MIPS_UN8x4_MUL_UN8 \s_8888,   \m_8, \ +                       \scratch1, \maskLSR, \ +                       \scratch2, \scratch3, \scratch4 + +    not                \scratch2, \scratch1 +    srl                \scratch2, \scratch2, 24 + +    MIPS_UN8x4_MUL_UN8 \d_8888,   \scratch2, \ +                       \d_8888,   \maskLSR,  \ +                       \scratch3, \scratch4, \out_8888 + +    addu_s.qb          \out_8888, \d_8888,   \scratch1 +.endm + +/* + * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two + * a8r8g8b8 destination pixels (d1_8888 and d2_8888) using a8 masks (m1_8 and + * m2_8). It also requires maskLSR needed for rounding process. maskLSR must + * have following value: + *   li       maskLSR, 0x00ff00ff + */ +.macro OVER_2x8888_2x8_2x8888 s1_8888,   \ +                              s2_8888,   \ +                              m1_8,      \ +                              m2_8,      \ +                              d1_8888,   \ +                              d2_8888,   \ +                              out1_8888, \ +                              out2_8888, \ +                              maskLSR,   \ +                              scratch1, scratch2, scratch3, \ +                              scratch4, scratch5, scratch6 +    MIPS_2xUN8x4_MUL_2xUN8 \s1_8888,   \s2_8888, \ +                           \m1_8,      \m2_8, \ +                           \scratch1,  \scratch2, \ +                           \maskLSR, \ +                           \scratch3,  \scratch4, \out1_8888, \ +                           \out2_8888, \scratch5, \scratch6 + +    not                    \scratch3,  \scratch1 +    srl                    \scratch3,  \scratch3, 24 +    not                    \scratch4,  \scratch2 +    srl                    \scratch4,  \scratch4, 24 + +    MIPS_2xUN8x4_MUL_2xUN8 \d1_8888,   \d2_8888, \ +                           \scratch3,  \scratch4, \ +                           \d1_8888,   \d2_8888, \ +                           \maskLSR, \ +                           \scratch5,  \scratch6, \out1_8888, \ +                           \out2_8888, \scratch3, \scratch4 + +    addu_s.qb              \out1_8888, \d1_8888,  \scratch1 +    addu_s.qb              \out2_8888, \d2_8888,  \scratch2 +.endm + +/* + * OVER operation on single a8r8g8b8 source pixel (s_8888) and single a8r8g8b8 + * destination pixel (d_8888). It also requires maskLSR needed for rounding + * process. 
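+ * Scalar equivalent of premultiplied OVER: out = s + d * (255 - s.alpha) / 255
+ * (rounded), with the addition saturating per byte.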
maskLSR must have following value: + *   li       maskLSR, 0x00ff00ff + */ +.macro OVER_8888_8888 s_8888,   \ +                      d_8888,   \ +                      out_8888, \ +                      maskLSR,  \ +                      scratch1, scratch2, scratch3, scratch4 +    not                \scratch1, \s_8888 +    srl                \scratch1, \scratch1, 24 + +    MIPS_UN8x4_MUL_UN8 \d_8888,   \scratch1, \ +                       \out_8888, \maskLSR, \ +                       \scratch2, \scratch3, \scratch4 + +    addu_s.qb          \out_8888, \out_8888, \s_8888 +.endm + +/* + * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two + * a8r8g8b8 destination pixels (d1_8888 and d2_8888). It also requires maskLSR + * needed for rounding process. maskLSR must have following value: + *   li       maskLSR, 0x00ff00ff + */ +.macro OVER_2x8888_2x8888 s1_8888,   \ +                          s2_8888,   \ +                          d1_8888,   \ +                          d2_8888,   \ +                          out1_8888, \ +                          out2_8888, \ +                          maskLSR,   \ +                          scratch1, scratch2, scratch3, \ +                          scratch4, scratch5, scratch6 +    not                    \scratch1,  \s1_8888 +    srl                    \scratch1,  \scratch1,  24 +    not                    \scratch2,  \s2_8888 +    srl                    \scratch2,  \scratch2,  24 +    MIPS_2xUN8x4_MUL_2xUN8 \d1_8888,   \d2_8888, \ +                           \scratch1,  \scratch2,  \ +                           \out1_8888, \out2_8888, \ +                           \maskLSR, \ +                           \scratch3,  \scratch4, \scratch5, \ +                           \scratch6,  \d1_8888,  \d2_8888 + +    addu_s.qb              \out1_8888, \out1_8888, \s1_8888 +    addu_s.qb              \out2_8888, \out2_8888, \s2_8888 +.endm + +.macro MIPS_UN8x4_MUL_UN8_ADD_UN8x4 s_8888,   \ +                                    m_8,      \ +                                    d_8888,   \ +                                    out_8888, \ +                                    maskLSR,  \ +                                    scratch1, scratch2, scratch3 +    MIPS_UN8x4_MUL_UN8 \s_8888, \m_8, \ +                       \out_8888, \maskLSR, \ +                       \scratch1, \scratch2, \scratch3 + +    addu_s.qb          \out_8888, \out_8888, \d_8888 +.endm + +.macro MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 s1_8888,   \ +                             s2_8888,   \ +                             m1_8,      \ +                             m2_8,      \ +                             d1_8888,   \ +                             d2_8888,   \ +                             out1_8888, \ +                             out2_8888, \ +                             maskLSR,   \ +                             scratch1,  scratch2, scratch3, \ +                             scratch4, scratch5, scratch6 +    MIPS_2xUN8x4_MUL_2xUN8 \s1_8888,   \s2_8888, \ +                           \m1_8,      \m2_8, \ +                           \out1_8888, \out2_8888, \ +                           \maskLSR, \ +                           \scratch1,  \scratch2, \scratch3, \ +                           \scratch4,  \scratch5, \scratch6 + +    addu_s.qb             \out1_8888, \out1_8888, \d1_8888 +    addu_s.qb             \out2_8888, \out2_8888, \d2_8888 +.endm + +.macro BILINEAR_INTERPOLATE_SINGLE_PIXEL tl, tr, bl, br,         \ +                                         scratch1, scratch2,     \ +       
                                  alpha, red, green, blue \ +                                         wt1, wt2, wb1, wb2 +    andi            \scratch1, \tl,  0xff +    andi            \scratch2, \tr,  0xff +    andi            \alpha,    \bl,  0xff +    andi            \red,      \br,  0xff + +    multu           $ac0,      \wt1, \scratch1 +    maddu           $ac0,      \wt2, \scratch2 +    maddu           $ac0,      \wb1, \alpha +    maddu           $ac0,      \wb2, \red + +    ext             \scratch1, \tl,  8, 8 +    ext             \scratch2, \tr,  8, 8 +    ext             \alpha,    \bl,  8, 8 +    ext             \red,      \br,  8, 8 + +    multu           $ac1,      \wt1, \scratch1 +    maddu           $ac1,      \wt2, \scratch2 +    maddu           $ac1,      \wb1, \alpha +    maddu           $ac1,      \wb2, \red + +    ext             \scratch1, \tl,  16, 8 +    ext             \scratch2, \tr,  16, 8 +    ext             \alpha,    \bl,  16, 8 +    ext             \red,      \br,  16, 8 + +    mflo            \blue,     $ac0 + +    multu           $ac2,      \wt1, \scratch1 +    maddu           $ac2,      \wt2, \scratch2 +    maddu           $ac2,      \wb1, \alpha +    maddu           $ac2,      \wb2, \red + +    ext             \scratch1, \tl,  24, 8 +    ext             \scratch2, \tr,  24, 8 +    ext             \alpha,    \bl,  24, 8 +    ext             \red,      \br,  24, 8 + +    mflo            \green,    $ac1 + +    multu           $ac3,      \wt1, \scratch1 +    maddu           $ac3,      \wt2, \scratch2 +    maddu           $ac3,      \wb1, \alpha +    maddu           $ac3,      \wb2, \red + +    mflo            \red,      $ac2 +    mflo            \alpha,    $ac3 + +    precr.qb.ph     \alpha,    \alpha, \red +    precr.qb.ph     \scratch1, \green, \blue +    precrq.qb.ph    \tl,       \alpha, \scratch1 +.endm + +#endif //PIXMAN_MIPS_DSPR2_ASM_H diff --git a/libs/pixman-0.40.0/pixman/pixman-mips-dspr2.c b/libs/pixman-0.40.0/pixman/pixman-mips-dspr2.c new file mode 100644 index 0000000..87969ae --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-mips-dspr2.c @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2012 + *      MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + *    contributors may be used to endorse or promote products derived from + *    this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. 
BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author:  Nemanja Lukic (nemanja.lukic@rt-rk.com) + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "pixman-private.h" +#include "pixman-mips-dspr2.h" + +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_x888_8888, +                                    uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_8888_0565, +                                    uint32_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0565_8888, +                                    uint16_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0565_0565, +                                    uint16_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888, +                                    uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888, +                                    uint8_t, 3, uint8_t, 3) +#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_8888_rev, +                                    uint8_t, 3, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_0565_rev, +                                    uint8_t, 3, uint16_t, 1) +#endif +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_pixbuf_8888, +                                    uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_rpixbuf_8888, +                                    uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888, +                                    uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565, +                                    uint32_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8, +                                    uint8_t, 1, uint8_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888, +                                    uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, out_reverse_8_0565, +                                    uint8_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, out_reverse_8_8888, +                                    uint8_t,  1, uint32_t, 1) + +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8888, +                                       uint8_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8, +                                       uint8_t, 1, uint8_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca, +                                       uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca, +                                       uint32_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8, +                                       uint8_t, 1, uint8_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888, +                                       uint8_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, 
over_n_8_0565, +                                       uint8_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, add_n_8_8, +                                       uint8_t, 1, uint8_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, add_n_8_8888, +                                       uint8_t, 1, uint32_t, 1) + +PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_8888, +                                      uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_0565, +                                      uint32_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_0565_n_0565, +                                      uint16_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, add_8888_n_8888, +                                      uint32_t, 1, uint32_t, 1) + +PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_0565, +                                  uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_8888, +                                  uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_reverse_n_8888, +                                  uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_N_DST (0, in_n_8, +                                  uint8_t, 1) + +PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8_8_8, uint8_t,  1, +                                         uint8_t,  1, uint8_t,  1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8_8888, uint32_t, 1, +                                         uint8_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8888_8888, uint32_t, 1, +                                         uint32_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_0565_8_0565, uint16_t, 1, +                                         uint8_t,  1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_8888, uint32_t, 1, +                                         uint8_t, 1, uint32_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_0565, uint32_t, 1, +                                         uint8_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_0565_8_0565, uint16_t, 1, +                                         uint8_t, 1, uint16_t, 1) +PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8888_8888, uint32_t, 1, +                                         uint32_t, 1, uint32_t, 1) + +PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_8888, OVER, +                                         uint32_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_0565, OVER, +                                         uint32_t, uint16_t) +PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (0565_8888, SRC, +                                         uint16_t, uint32_t) + +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC, +                                          uint32_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_0565, SRC, +                                          uint32_t, uint16_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 0565_8888, SRC, +                                          uint16_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 0565_0565, SRC, +                                          uint16_t, uint16_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, 8888_8888, OVER, +                                          uint32_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, 8888_8888, ADD, +                     
                     uint32_t, uint32_t) + +PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_0565, +                                            OVER, uint32_t, uint16_t) +PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, 0565_8_0565, +                                            OVER, uint16_t, uint16_t) + +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_8888, SRC, +                                             uint32_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_0565, SRC, +                                             uint32_t, uint16_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_x888, SRC, +                                             uint16_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_0565, SRC, +                                             uint16_t, uint16_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, OVER, +                                             uint32_t, uint32_t) +PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, ADD, +                                             uint32_t, uint32_t) + +static pixman_bool_t +mips_dspr2_fill (pixman_implementation_t *imp, +                 uint32_t *               bits, +                 int                      stride, +                 int                      bpp, +                 int                      x, +                 int                      y, +                 int                      width, +                 int                      height, +                 uint32_t                 _xor) +{ +    uint8_t *byte_line; +    uint32_t byte_width; +    switch (bpp) +    { +    case 16: +        stride = stride * (int) sizeof (uint32_t) / 2; +        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); +        byte_width = width * 2; +        stride *= 2; + +        while (height--) +        { +            uint8_t *dst = byte_line; +            byte_line += stride; +            pixman_fill_buff16_mips (dst, byte_width, _xor & 0xffff); +        } +        return TRUE; +    case 32: +        stride = stride * (int) sizeof (uint32_t) / 4; +        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); +        byte_width = width * 4; +        stride *= 4; + +        while (height--) +        { +            uint8_t *dst = byte_line; +            byte_line += stride; +            pixman_fill_buff32_mips (dst, byte_width, _xor); +        } +        return TRUE; +    default: +        return FALSE; +    } +} + +static pixman_bool_t +mips_dspr2_blt (pixman_implementation_t *imp, +                uint32_t *               src_bits, +                uint32_t *               dst_bits, +                int                      src_stride, +                int                      dst_stride, +                int                      src_bpp, +                int                      dst_bpp, +                int                      src_x, +                int                      src_y, +                int                      dest_x, +                int                      dest_y, +                int                      width, +                int                      height) +{ +    if (src_bpp != dst_bpp) +        return FALSE; + +    uint8_t *src_bytes; +    uint8_t *dst_bytes; +    uint32_t byte_width; + +    switch (src_bpp) +    { +    case 16: +        src_stride = src_stride * (int) sizeof (uint32_t) / 2; +        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; +       
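+        /* Strides are passed in uint32_t units; the two statements above
+           re-express them in uint16_t units so the (uint16_t *) pointer
+           arithmetic below lands on the first pixel of the rectangle, and
+           they are scaled to bytes further down before the copy loop. */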
 src_bytes =(uint8_t *)(((uint16_t *)src_bits) +                                          + src_stride * (src_y) + (src_x)); +        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) +                                           + dst_stride * (dest_y) + (dest_x)); +        byte_width = width * 2; +        src_stride *= 2; +        dst_stride *= 2; + +        while (height--) +        { +            uint8_t *src = src_bytes; +            uint8_t *dst = dst_bytes; +            src_bytes += src_stride; +            dst_bytes += dst_stride; +            pixman_mips_fast_memcpy (dst, src, byte_width); +        } +        return TRUE; +    case 32: +        src_stride = src_stride * (int) sizeof (uint32_t) / 4; +        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; +        src_bytes = (uint8_t *)(((uint32_t *)src_bits) +                                           + src_stride * (src_y) + (src_x)); +        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) +                                           + dst_stride * (dest_y) + (dest_x)); +        byte_width = width * 4; +        src_stride *= 4; +        dst_stride *= 4; + +        while (height--) +        { +            uint8_t *src = src_bytes; +            uint8_t *dst = dst_bytes; +            src_bytes += src_stride; +            dst_bytes += dst_stride; +            pixman_mips_fast_memcpy (dst, src, byte_width); +        } +        return TRUE; +    default: +        return FALSE; +    } +} + +static const pixman_fast_path_t mips_dspr2_fast_paths[] = +{ +    PIXMAN_STD_FAST_PATH (SRC, r5g6b5,   null, r5g6b5,   mips_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, b5g6r5,   null, b5g6r5,   mips_composite_src_0565_0565), +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5,   mips_composite_src_8888_0565), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5,   mips_composite_src_8888_0565), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5,   mips_composite_src_8888_0565), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5,   mips_composite_src_8888_0565), +    PIXMAN_STD_FAST_PATH (SRC, r5g6b5,   null, a8r8g8b8, mips_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC, r5g6b5,   null, x8r8g8b8, mips_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC, b5g6r5,   null, a8b8g8r8, mips_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC, b5g6r5,   null, x8b8g8r8, mips_composite_src_0565_8888), +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mips_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mips_composite_src_8888_8888), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (SRC, r8g8b8,   null, r8g8b8,   mips_composite_src_0888_0888), +#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) +    PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, x8r8g8b8, mips_composite_src_0888_8888_rev), +    PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, r5g6b5,   mips_composite_src_0888_0565_rev), +#endif +    PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  
a8r8g8b8, mips_composite_src_pixbuf_8888), +    PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  a8b8g8r8, mips_composite_src_rpixbuf_8888), +    PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8r8g8b8, mips_composite_src_rpixbuf_8888), +    PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8b8g8r8, mips_composite_src_pixbuf_8888), +    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8r8g8b8, mips_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   x8r8g8b8, mips_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8b8g8r8, mips_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   x8b8g8r8, mips_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8,       mips_composite_src_n_8_8), + +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mips_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mips_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mips_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5,   mips_composite_over_n_8888_0565_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5,   mips_composite_over_n_8888_0565_ca), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       mips_composite_over_n_8_8), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8r8g8b8, mips_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8r8g8b8, mips_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8b8g8r8, mips_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8b8g8r8, mips_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   mips_composite_over_n_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   mips_composite_over_n_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     r5g6b5,   mips_composite_over_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     a8r8g8b8, mips_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     x8r8g8b8, mips_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    a8r8g8b8, mips_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, mips_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    r5g6b5,   mips_composite_over_8888_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid,    b5g6r5,   mips_composite_over_8888_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   solid,    r5g6b5,   mips_composite_over_0565_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   solid,    b5g6r5,   mips_composite_over_0565_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       a8r8g8b8, mips_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       x8r8g8b8, mips_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       a8b8g8r8, mips_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       x8b8g8r8, mips_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       r5g6b5,   mips_composite_over_8888_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   mips_composite_over_8888_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   
mips_composite_over_0565_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   mips_composite_over_0565_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, mips_composite_over_8888_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     a8r8g8b8, mips_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, mips_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, mips_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, mips_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   mips_composite_over_8888_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   mips_composite_over_8888_0565), +    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       mips_composite_add_n_8_8), +    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, mips_composite_add_n_8_8888), +    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, mips_composite_add_n_8_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       mips_composite_add_8_8_8), +    PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   mips_composite_add_0565_8_0565), +    PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   mips_composite_add_0565_8_0565), +    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, mips_composite_add_8888_8_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, mips_composite_add_8888_8_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, mips_composite_add_8888_8888_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, mips_composite_add_8888_n_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, mips_composite_add_8888_n_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       mips_composite_add_8_8), +    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, mips_composite_add_8888_8888), +    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, mips_composite_add_8888_8888), +    PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, r5g6b5,   mips_composite_out_reverse_8_0565), +    PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, b5g6r5,   mips_composite_out_reverse_8_0565), +    PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, a8r8g8b8, mips_composite_out_reverse_8_8888), +    PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, a8b8g8r8, mips_composite_out_reverse_8_8888), +    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mips_composite_over_reverse_n_8888), +    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mips_composite_over_reverse_n_8888), +    PIXMAN_STD_FAST_PATH (IN,           solid, null, a8,       mips_composite_in_n_8), + +    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8888), +    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mips_8888_8888), +    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8888), +    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mips_8888_8888), + +    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_0565), +    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_0565), + +    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, mips_0565_8888), +    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8888), +    /* Note: NONE repeat is not supported yet */ +    
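+    /* Because of that, only the COVER and PAD variants of these 0565 -> 8888
+       nearest paths are listed individually, rather than using
+       PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH, which would also register the
+       NONE variant. */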
SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, mips_0565_8888), +    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, mips_0565_8888), +    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, mips_0565_8888), +    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, mips_0565_8888), + +    SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565), +    SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565), + +    SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, mips_0565_8_0565), +    SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, mips_0565_8_0565), + +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mips_8888_8888), + +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, mips_8888_0565), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, mips_8888_0565), + +    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, mips_0565_0565), + +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8888), + +    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, mips_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, mips_8888_8888), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mips_8888_8_8888), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, mips_8888_8_0565), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, mips_8888_8_0565), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8_x888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, mips_0565_8_0565), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8_8888), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, mips_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, mips_8888_8_8888), +    { PIXMAN_OP_NONE }, +}; + +static void +mips_dspr2_combine_over_u (pixman_implementation_t *imp, +                           pixman_op_t              op, +                           uint32_t *               dest, +                           const uint32_t *         src, +                           const uint32_t *         mask, +                           int                      width) +{ +    if (mask) +        pixman_composite_over_8888_8888_8888_asm_mips ( +            dest, (uint32_t *)src, (uint32_t *)mask, width); +    else +        pixman_composite_over_8888_8888_asm_mips ( +		    dest, (uint32_t *)src, width); +} + +pixman_implementation_t * +_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback) +{ +    pixman_implementation_t *imp = +        _pixman_implementation_create (fallback, mips_dspr2_fast_paths); + +    imp->combine_32[PIXMAN_OP_OVER] = mips_dspr2_combine_over_u; + +    imp->blt = mips_dspr2_blt; +    imp->fill = mips_dspr2_fill; + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-mips-dspr2.h b/libs/pixman-0.40.0/pixman/pixman-mips-dspr2.h new file mode 100644 index 0000000..57b3835 --- /dev/null +++ 
b/libs/pixman-0.40.0/pixman/pixman-mips-dspr2.h @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2012 + *      MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + *    contributors may be used to endorse or promote products derived from + *    this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author:  Nemanja Lukic (nemanja.lukic@rt-rk.com) + */ + +#ifndef PIXMAN_MIPS_DSPR2_H +#define PIXMAN_MIPS_DSPR2_H + +#include "pixman-private.h" +#include "pixman-inlines.h" + +#define SKIP_ZERO_SRC  1 +#define SKIP_ZERO_MASK 2 +#define DO_FAST_MEMCPY 3 + +void +pixman_mips_fast_memcpy (void *dst, void *src, uint32_t n_bytes); +void +pixman_fill_buff16_mips (void *dst, uint32_t n_bytes, uint16_t value); +void +pixman_fill_buff32_mips (void *dst, uint32_t n_bytes, uint32_t value); + +/****************************************************************/ + +#define PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST(flags, name,          \ +                                           src_type, src_cnt,    \ +                                           dst_type, dst_cnt)    \ +void                                                             \ +pixman_composite_##name##_asm_mips (dst_type *dst,               \ +                                    src_type *src,               \ +                                    int32_t   w);                \ +                                                                 \ +static void                                                      \ +mips_composite_##name (pixman_implementation_t *imp,             \ +                       pixman_composite_info_t *info)            \ +{                                                                \ +    PIXMAN_COMPOSITE_ARGS (info);                                \ +    dst_type *dst_line, *dst;                                    \ +    src_type *src_line, *src;                                    \ +    int32_t dst_stride, src_stride;                              \ +    int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;   \ +                                                                 \ +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,    \ +                           
src_stride, src_line, src_cnt);       \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \ +                           dst_stride, dst_line, dst_cnt);       \ +                                                                 \ +    while (height--)                                             \ +    {                                                            \ +      dst = dst_line;                                            \ +      dst_line += dst_stride;                                    \ +      src = src_line;                                            \ +      src_line += src_stride;                                    \ +                                                                 \ +      if (flags == DO_FAST_MEMCPY)                               \ +        pixman_mips_fast_memcpy (dst, src, width * bpp);         \ +      else                                                       \ +        pixman_composite_##name##_asm_mips (dst, src, width);    \ +    }                                                            \ +} + +/****************************************************************/ + +#define PIXMAN_MIPS_BIND_FAST_PATH_N_DST(flags, name,            \ +                                         dst_type, dst_cnt)      \ +void                                                             \ +pixman_composite_##name##_asm_mips (dst_type *dst,               \ +                                    uint32_t  src,               \ +                                    int32_t   w);                \ +                                                                 \ +static void                                                      \ +mips_composite_##name (pixman_implementation_t *imp,             \ +                       pixman_composite_info_t *info)            \ +{                                                                \ +    PIXMAN_COMPOSITE_ARGS (info);                                \ +    dst_type  *dst_line, *dst;                                   \ +    int32_t    dst_stride;                                       \ +    uint32_t   src;                                              \ +                                                                 \ +    src = _pixman_image_get_solid (                              \ +    imp, src_image, dest_image->bits.format);                    \ +                                                                 \ +    if ((flags & SKIP_ZERO_SRC) && src == 0)                     \ +        return;                                                  \ +                                                                 \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \ +                           dst_stride, dst_line, dst_cnt);       \ +                                                                 \ +    while (height--)                                             \ +    {                                                            \ +        dst = dst_line;                                          \ +        dst_line += dst_stride;                                  \ +                                                                 \ +        pixman_composite_##name##_asm_mips (dst, src, width);    \ +    }                                                            \ +} + +/*******************************************************************/ + +#define PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST(flags, name,          \ +                                              mask_type, mask_cnt,  \ +                       
                       dst_type, dst_cnt)    \ +void                                                                \ +pixman_composite_##name##_asm_mips (dst_type  *dst,                 \ +                                    uint32_t  src,                  \ +                                    mask_type *mask,                \ +                                    int32_t   w);                   \ +                                                                    \ +static void                                                         \ +mips_composite_##name (pixman_implementation_t *imp,                \ +                       pixman_composite_info_t *info)               \ +{                                                                   \ +    PIXMAN_COMPOSITE_ARGS (info);                                   \ +    dst_type  *dst_line, *dst;                                      \ +    mask_type *mask_line, *mask;                                    \ +    int32_t    dst_stride, mask_stride;                             \ +    uint32_t   src;                                                 \ +                                                                    \ +    src = _pixman_image_get_solid (                                 \ +        imp, src_image, dest_image->bits.format);                   \ +                                                                    \ +    if ((flags & SKIP_ZERO_SRC) && src == 0)                        \ +        return;                                                     \ +                                                                    \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,    \ +                           dst_stride, dst_line, dst_cnt);          \ +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,   \ +                           mask_stride, mask_line, mask_cnt);       \ +                                                                    \ +    while (height--)                                                \ +    {                                                               \ +        dst = dst_line;                                             \ +        dst_line += dst_stride;                                     \ +        mask = mask_line;                                           \ +        mask_line += mask_stride;                                   \ +        pixman_composite_##name##_asm_mips (dst, src, mask, width); \ +    }                                                               \ +} + +/*******************************************************************/ + +#define PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST(flags, name,           \ +                                            src_type, src_cnt,      \ +                                            dst_type, dst_cnt)      \ +void                                                                \ +pixman_composite_##name##_asm_mips (dst_type  *dst,                 \ +                                    src_type  *src,                 \ +                                    uint32_t   mask,                \ +                                    int32_t    w);                  \ +                                                                    \ +static void                                                         \ +mips_composite_##name (pixman_implementation_t *imp,                \ +                       pixman_composite_info_t *info)               \ +{                                                                   \ +    
PIXMAN_COMPOSITE_ARGS (info);                                   \ +    dst_type  *dst_line, *dst;                                      \ +    src_type  *src_line, *src;                                      \ +    int32_t    dst_stride, src_stride;                              \ +    uint32_t   mask;                                                \ +                                                                    \ +    mask = _pixman_image_get_solid (                                \ +        imp, mask_image, dest_image->bits.format);                  \ +                                                                    \ +    if ((flags & SKIP_ZERO_MASK) && mask == 0)                      \ +        return;                                                     \ +                                                                    \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,    \ +                           dst_stride, dst_line, dst_cnt);          \ +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,       \ +                           src_stride, src_line, src_cnt);          \ +                                                                    \ +    while (height--)                                                \ +    {                                                               \ +        dst = dst_line;                                             \ +        dst_line += dst_stride;                                     \ +        src = src_line;                                             \ +        src_line += src_stride;                                     \ +                                                                    \ +        pixman_composite_##name##_asm_mips (dst, src, mask, width); \ +    }                                                               \ +} + +/************************************************************************/ + +#define PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST(name, src_type, src_cnt, \ +                                                mask_type, mask_cnt,     \ +                                                dst_type, dst_cnt)       \ +void                                                                     \ +pixman_composite_##name##_asm_mips (dst_type  *dst,                      \ +                                    src_type  *src,                      \ +                                    mask_type *mask,                     \ +                                    int32_t   w);                        \ +                                                                         \ +static void                                                              \ +mips_composite_##name (pixman_implementation_t *imp,                     \ +                       pixman_composite_info_t *info)                    \ +{                                                                        \ +    PIXMAN_COMPOSITE_ARGS (info);                                        \ +    dst_type  *dst_line, *dst;                                           \ +    src_type  *src_line, *src;                                           \ +    mask_type *mask_line, *mask;                                         \ +    int32_t    dst_stride, src_stride, mask_stride;                      \ +                                                                         \ +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,         \ +                           dst_stride, dst_line, dst_cnt);               \ +    PIXMAN_IMAGE_GET_LINE 
(src_image, src_x, src_y, src_type,            \ +                           src_stride, src_line, src_cnt);               \ +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,        \ +                           mask_stride, mask_line, mask_cnt);            \ +                                                                         \ +    while (height--)                                                     \ +    {                                                                    \ +        dst = dst_line;                                                  \ +        dst_line += dst_stride;                                          \ +        mask = mask_line;                                                \ +        mask_line += mask_stride;                                        \ +        src = src_line;                                                  \ +        src_line += src_stride;                                          \ +        pixman_composite_##name##_asm_mips (dst, src, mask, width);      \ +    }                                                                    \ +} + +/****************************************************************************/ + +#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST(name, op,                    \ +                                                src_type, dst_type)          \ +void                                                                         \ +pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (                    \ +                                                   dst_type *       dst,     \ +                                                   const src_type * src,     \ +                                                   int32_t          w,       \ +                                                   pixman_fixed_t   vx,      \ +                                                   pixman_fixed_t   unit_x); \ +                                                                             \ +static force_inline void                                                     \ +scaled_nearest_scanline_mips_##name##_##op (dst_type *       pd,             \ +                                            const src_type * ps,             \ +                                            int32_t          w,              \ +                                            pixman_fixed_t   vx,             \ +                                            pixman_fixed_t   unit_x,         \ +                                            pixman_fixed_t   max_vx,         \ +                                            pixman_bool_t    zero_src)       \ +{                                                                            \ +    pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (pd, ps, w,      \ +                                                             vx, unit_x);    \ +}                                                                            \ +                                                                             \ +FAST_NEAREST_MAINLOOP (mips_##name##_cover_##op,                             \ +                       scaled_nearest_scanline_mips_##name##_##op,           \ +                       src_type, dst_type, COVER)                            \ +FAST_NEAREST_MAINLOOP (mips_##name##_none_##op,                              \ +                       scaled_nearest_scanline_mips_##name##_##op,           \ +                       src_type, dst_type, NONE)                             \ +FAST_NEAREST_MAINLOOP 
(mips_##name##_pad_##op,                               \ +                       scaled_nearest_scanline_mips_##name##_##op,           \ +                       src_type, dst_type, PAD) + +/* Provide entries for the fast path table */ +#define PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                    \ +    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                            \ +    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                             \ +    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func) + + +/*****************************************************************************/ + +#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST(flags, name, op,           \ +                                                  src_type, dst_type)         \ +void                                                                          \ +pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (                     \ +                                                   dst_type *       dst,      \ +                                                   const src_type * src,      \ +                                                   const uint8_t *  mask,     \ +                                                   int32_t          w,        \ +                                                   pixman_fixed_t   vx,       \ +                                                   pixman_fixed_t   unit_x);  \ +                                                                              \ +static force_inline void                                                      \ +scaled_nearest_scanline_mips_##name##_##op (const uint8_t *  mask,            \ +                                            dst_type *       pd,              \ +                                            const src_type * ps,              \ +                                            int32_t          w,               \ +                                            pixman_fixed_t   vx,              \ +                                            pixman_fixed_t   unit_x,          \ +                                            pixman_fixed_t   max_vx,          \ +                                            pixman_bool_t    zero_src)        \ +{                                                                             \ +    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \ +        return;                                                               \ +    pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (pd, ps,          \ +                                                             mask, w,         \ +                                                             vx, unit_x);     \ +}                                                                             \ +                                                                              \ +FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_cover_##op,                       \ +                              scaled_nearest_scanline_mips_##name##_##op,     \ +                              src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\ +FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_none_##op,                        \ +                              scaled_nearest_scanline_mips_##name##_##op,     \ +                              src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \ +FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_pad_##op,                         \ +                              scaled_nearest_scanline_mips_##name##_##op,     \ +                            
  src_type, uint8_t, dst_type, PAD, TRUE, FALSE) + +/****************************************************************************/ + +#define PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST(flags, name, op,            \ +                                                 src_type, dst_type)         \ +void                                                                         \ +pixman_scaled_bilinear_scanline_##name##_##op##_asm_mips(                    \ +                                             dst_type *       dst,           \ +                                             const src_type * src_top,       \ +                                             const src_type * src_bottom,    \ +                                             int32_t          w,             \ +                                             int              wt,            \ +                                             int              wb,            \ +                                             pixman_fixed_t   vx,            \ +                                             pixman_fixed_t   unit_x);       \ +static force_inline void                                                     \ +scaled_bilinear_scanline_mips_##name##_##op (dst_type *       dst,           \ +                                             const uint32_t * mask,          \ +                                             const src_type * src_top,       \ +                                             const src_type * src_bottom,    \ +                                             int32_t          w,             \ +                                             int              wt,            \ +                                             int              wb,            \ +                                             pixman_fixed_t   vx,            \ +                                             pixman_fixed_t   unit_x,        \ +                                             pixman_fixed_t   max_vx,        \ +                                             pixman_bool_t    zero_src)      \ +{                                                                            \ +    if ((flags & SKIP_ZERO_SRC) && zero_src)                                 \ +        return;                                                              \ +    pixman_scaled_bilinear_scanline_##name##_##op##_asm_mips (dst, src_top,  \ +                                                              src_bottom, w, \ +                                                              wt, wb,        \ +                                                              vx, unit_x);   \ +}                                                                            \ +                                                                             \ +FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_cover_##op,                     \ +                       scaled_bilinear_scanline_mips_##name##_##op,          \ +                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)       \ +FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_none_##op,                      \ +                       scaled_bilinear_scanline_mips_##name##_##op,          \ +                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)        \ +FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_pad_##op,                       \ +                       scaled_bilinear_scanline_mips_##name##_##op,          \ +                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)         \ +FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_normal_##op,    
                \ +                       scaled_bilinear_scanline_mips_##name##_##op,          \ +                       src_type, uint32_t, dst_type, NORMAL,                 \ +                       FLAG_NONE) + +/*****************************************************************************/ + +#define PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, name, op,          \ +                                                src_type, dst_type)           \ +void                                                                          \ +pixman_scaled_bilinear_scanline_##name##_##op##_asm_mips (                    \ +                                             dst_type *       dst,            \ +                                             const uint8_t *  mask,           \ +                                             const src_type * top,            \ +                                             const src_type * bottom,         \ +                                             int              wt,             \ +                                             int              wb,             \ +                                             pixman_fixed_t   x,              \ +                                             pixman_fixed_t   ux,             \ +                                             int              width);         \ +                                                                              \ +static force_inline void                                                      \ +scaled_bilinear_scanline_mips_##name##_##op (dst_type *       dst,            \ +                                             const uint8_t *  mask,           \ +                                             const src_type * src_top,        \ +                                             const src_type * src_bottom,     \ +                                             int32_t          w,              \ +                                             int              wt,             \ +                                             int              wb,             \ +                                             pixman_fixed_t   vx,             \ +                                             pixman_fixed_t   unit_x,         \ +                                             pixman_fixed_t   max_vx,         \ +                                             pixman_bool_t    zero_src)       \ +{                                                                             \ +    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \ +        return;                                                               \ +    pixman_scaled_bilinear_scanline_##name##_##op##_asm_mips (                \ +                      dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \ +}                                                                             \ +                                                                              \ +FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_cover_##op,                      \ +                       scaled_bilinear_scanline_mips_##name##_##op,           \ +                       src_type, uint8_t, dst_type, COVER,                    \ +                       FLAG_HAVE_NON_SOLID_MASK)                              \ +FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_none_##op,                       \ +                       scaled_bilinear_scanline_mips_##name##_##op,           \ +                       src_type, uint8_t, dst_type, NONE,                     \ +                
       FLAG_HAVE_NON_SOLID_MASK)                              \ +FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_pad_##op,                        \ +                       scaled_bilinear_scanline_mips_##name##_##op,           \ +                       src_type, uint8_t, dst_type, PAD,                      \ +                       FLAG_HAVE_NON_SOLID_MASK)                              \ +FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_normal_##op,                     \ +                       scaled_bilinear_scanline_mips_##name##_##op,           \ +                       src_type, uint8_t, dst_type, NORMAL,                   \ +                       FLAG_HAVE_NON_SOLID_MASK) + +#endif //PIXMAN_MIPS_DSPR2_H diff --git a/libs/pixman-0.40.0/pixman/pixman-mips-memcpy-asm.S b/libs/pixman-0.40.0/pixman/pixman-mips-memcpy-asm.S new file mode 100644 index 0000000..9ad6da5 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-mips-memcpy-asm.S @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2012 + *      MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + *    contributors may be used to endorse or promote products derived from + *    this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "pixman-mips-dspr2-asm.h" + +/* + * This routine could be optimized for MIPS64. The current code only + * uses MIPS32 instructions. 
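 *
 * As a rough orientation (this note and the sketch below are an editorial
 * illustration, not part of the pixman file): the routine copies fewer than
 * 8 bytes one byte at a time; when src and dst share the same word offset it
 * copies a short head to reach word alignment, then moves 64-byte chunks
 * (16 lw/sw pairs per pass, with "pref 0" read hints and "pref 30"
 * prepare-for-store hints on the destination), then one optional 32-byte
 * chunk, then single words, then a byte tail; otherwise it takes the
 * $unaligned path, which keeps the same chunking but assembles each word
 * with LWHI/LWLO (lwl/lwr) pairs.  A minimal C sketch of the aligned path,
 * assuming <stdint.h>/<string.h> and an invented name:
 *
 *     void *
 *     pixman_fast_memcpy_ref (void *dst, const void *src, size_t len)
 *     {
 *         uint8_t       *d = dst;
 *         const uint8_t *s = src;
 *
 *         if (len >= 8 && (((uintptr_t) d ^ (uintptr_t) s) & 3) == 0)
 *         {
 *             while ((uintptr_t) d & 3)   // head: align dst (and src) to a word
 *             {
 *                 *d++ = *s++;
 *                 len--;
 *             }
 *             while (len >= 4)            // $loop16w/$chk8w/$wordCopy_loop collapse
 *             {                           // to one word loop here; the asm unrolls
 *                 memcpy (d, s, 4);       // it 16x/8x and adds the pref hints
 *                 d += 4;
 *                 s += 4;
 *                 len -= 4;
 *             }
 *         }
 *         while (len--)                   // byte tail ($last8loop); the real
 *             *d++ = *s++;                // $unaligned path word-chunks instead
 *         return dst;
 *     }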
+ */ + +#ifdef EB +#  define LWHI	lwl		/* high part is left in big-endian */ +#  define SWHI	swl		/* high part is left in big-endian */ +#  define LWLO	lwr		/* low part is right in big-endian */ +#  define SWLO	swr		/* low part is right in big-endian */ +#else +#  define LWHI	lwr		/* high part is right in little-endian */ +#  define SWHI	swr		/* high part is right in little-endian */ +#  define LWLO	lwl		/* low part is left in big-endian */ +#  define SWLO	swl		/* low part is left in big-endian */ +#endif + +LEAF_MIPS32R2(pixman_mips_fast_memcpy) + +	slti	AT, a2, 8 +	bne	AT, zero, $last8 +	move	v0, a0	/* memcpy returns the dst pointer */ + +/* Test if the src and dst are word-aligned, or can be made word-aligned */ +	xor	t8, a1, a0 +	andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */ + +	bne	t8, zero, $unaligned +	negu	a3, a0 + +	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */ +	beq	a3, zero, $chk16w	/* when a3=0 then the dst (a0) is word-aligned */ +	subu	a2, a2, a3	/* now a2 is the remining bytes count */ + +	LWHI	t8, 0(a1) +	addu	a1, a1, a3 +	SWHI	t8, 0(a0) +	addu	a0, a0, a3 + +/* Now the dst/src are mutually word-aligned with word-aligned addresses */ +$chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */ +				/* t8 is the byte count after 64-byte chunks */ + +	beq	a2, t8, $chk8w	/* if a2==t8, no 64-byte chunks */ +				/* There will be at most 1 32-byte chunk after it */ +	subu	a3, a2, t8	/* subtract from a2 the reminder */ +                                /* Here a3 counts bytes in 16w chunks */ +	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */ + +	addu	t0, a0, a2	/* t0 is the "past the end" address */ + +/* + * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past + * the "t0-32" address + * This means: for x=128 the last "safe" a0 address is "t0-160" + * Alternatively, for x=64 the last "safe" a0 address is "t0-96" + * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit + */ +	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */ + +	pref    0, 0(a1)		/* bring the first line of src, addr 0 */ +	pref    0, 32(a1)	/* bring the second line of src, addr 32 */ +	pref    0, 64(a1)	/* bring the third line of src, addr 64 */ +	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */ +/* In case the a0 > t9 don't use "pref 30" at all */ +	sgtu	v1, a0, t9 +	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */ +	nop +/* otherwise, start with using pref30 */ +	pref	30, 64(a0) +$loop16w: +	pref	0, 96(a1) +	lw	t0, 0(a1) +	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */ +	lw	t1, 4(a1) +	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */ +$skip_pref30_96: +	lw	t2, 8(a1) +	lw	t3, 12(a1) +	lw	t4, 16(a1) +	lw	t5, 20(a1) +	lw	t6, 24(a1) +	lw	t7, 28(a1) +        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */ + +	sw	t0, 0(a0) +	sw	t1, 4(a0) +	sw	t2, 8(a0) +	sw	t3, 12(a0) +	sw	t4, 16(a0) +	sw	t5, 20(a0) +	sw	t6, 24(a0) +	sw	t7, 28(a0) + +	lw	t0, 32(a1) +	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */ +	lw	t1, 36(a1) +	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */ +$skip_pref30_128: +	lw	t2, 40(a1) +	lw	t3, 44(a1) +	lw	t4, 48(a1) +	lw	t5, 52(a1) +	lw	t6, 56(a1) +	lw	t7, 60(a1) +        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */ + +	sw	t0, 32(a0) +	sw	t1, 36(a0) +	sw	t2, 40(a0) +	sw	t3, 44(a0) +	sw	t4, 48(a0) +	sw	t5, 52(a0) +	sw	t6, 56(a0) +	sw	t7, 60(a0) + +	addiu	a0, a0, 64	/* adding 64 
to dest */ +	sgtu	v1, a0, t9 +	bne	a0, a3, $loop16w +	addiu	a1, a1, 64	/* adding 64 to src */ +	move	a2, t8 + +/* Here we have src and dest word-aligned but less than 64-bytes to go */ + +$chk8w: +	pref 0, 0x0(a1) +	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */ +				/* the t8 is the reminder count past 32-bytes */ +	beq	a2, t8, $chk1w	/* when a2=t8, no 32-byte chunk */ +	 nop + +	lw	t0, 0(a1) +	lw	t1, 4(a1) +	lw	t2, 8(a1) +	lw	t3, 12(a1) +	lw	t4, 16(a1) +	lw	t5, 20(a1) +	lw	t6, 24(a1) +	lw	t7, 28(a1) +	addiu	a1, a1, 32 + +	sw	t0, 0(a0) +	sw	t1, 4(a0) +	sw	t2, 8(a0) +	sw	t3, 12(a0) +	sw	t4, 16(a0) +	sw	t5, 20(a0) +	sw	t6, 24(a0) +	sw	t7, 28(a0) +	addiu	a0, a0, 32 + +$chk1w: +	andi	a2, t8, 0x3	/* now a2 is the reminder past 1w chunks */ +	beq	a2, t8, $last8 +	subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */ +	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */ + +/* copying in words (4-byte chunks) */ +$wordCopy_loop: +	lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */ +	addiu	a1, a1, 4 +	addiu	a0, a0, 4 +	bne	a0, a3, $wordCopy_loop +	sw	t3, -4(a0) + +/* For the last (<8) bytes */ +$last8: +	blez	a2, leave +	addu	a3, a0, a2	/* a3 is the last dst address */ +$last8loop: +	lb	v1, 0(a1) +	addiu	a1, a1, 1 +	addiu	a0, a0, 1 +	bne	a0, a3, $last8loop +	sb	v1, -1(a0) + +leave:	j	ra +	nop + +/* + * UNALIGNED case + */ + +$unaligned: +	/* got here with a3="negu a0" */ +	andi	a3, a3, 0x3	/* test if the a0 is word aligned */ +	beqz	a3, $ua_chk16w +	subu	a2, a2, a3	/* bytes left after initial a3 bytes */ + +	LWHI	v1, 0(a1) +	LWLO	v1, 3(a1) +	addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */ +	SWHI	v1, 0(a0) +	addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */ + +$ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? 
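   (Illustrative accounting only, identical to the aligned $chk16w above: with
    a2 = 150 bytes left, t8 = 150 & 0x3f = 22, so 150 - 22 = 128 bytes go
    through two 64-byte passes; the remaining 22 bytes then see no 32-byte
    chunk, five single-word copies and a 2-byte tail.)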
*/ +				/* t8 is the byte count after 64-byte chunks */ +	beq	a2, t8, $ua_chk8w	/* if a2==t8, no 64-byte chunks */ +				/* There will be at most 1 32-byte chunk after it */ +	subu	a3, a2, t8	/* subtract from a2 the reminder */ +                                /* Here a3 counts bytes in 16w chunks */ +	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */ + +	addu	t0, a0, a2	/* t0 is the "past the end" address */ + +	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */ + +	pref    0, 0(a1)		/* bring the first line of src, addr 0 */ +	pref    0, 32(a1)	/* bring the second line of src, addr 32 */ +	pref    0, 64(a1)	/* bring the third line of src, addr 64 */ +	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */ +/* In case the a0 > t9 don't use "pref 30" at all */ +	sgtu	v1, a0, t9 +	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */ +	nop +/* otherwise,  start with using pref30 */ +	pref	30, 64(a0) +$ua_loop16w: +	pref	0, 96(a1) +	LWHI	t0, 0(a1) +	LWLO	t0, 3(a1) +	LWHI	t1, 4(a1) +	bgtz	v1, $ua_skip_pref30_96 +	LWLO	t1, 7(a1) +	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */ +$ua_skip_pref30_96: +	LWHI	t2, 8(a1) +	LWLO	t2, 11(a1) +	LWHI	t3, 12(a1) +	LWLO	t3, 15(a1) +	LWHI	t4, 16(a1) +	LWLO	t4, 19(a1) +	LWHI	t5, 20(a1) +	LWLO	t5, 23(a1) +	LWHI	t6, 24(a1) +	LWLO	t6, 27(a1) +	LWHI	t7, 28(a1) +	LWLO	t7, 31(a1) +        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */ + +	sw	t0, 0(a0) +	sw	t1, 4(a0) +	sw	t2, 8(a0) +	sw	t3, 12(a0) +	sw	t4, 16(a0) +	sw	t5, 20(a0) +	sw	t6, 24(a0) +	sw	t7, 28(a0) + +	LWHI	t0, 32(a1) +	LWLO	t0, 35(a1) +	LWHI	t1, 36(a1) +	bgtz	v1, $ua_skip_pref30_128 +	LWLO	t1, 39(a1) +	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */ +$ua_skip_pref30_128: +	LWHI	t2, 40(a1) +	LWLO	t2, 43(a1) +	LWHI	t3, 44(a1) +	LWLO	t3, 47(a1) +	LWHI	t4, 48(a1) +	LWLO	t4, 51(a1) +	LWHI	t5, 52(a1) +	LWLO	t5, 55(a1) +	LWHI	t6, 56(a1) +	LWLO	t6, 59(a1) +	LWHI	t7, 60(a1) +	LWLO	t7, 63(a1) +        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */ + +	sw	t0, 32(a0) +	sw	t1, 36(a0) +	sw	t2, 40(a0) +	sw	t3, 44(a0) +	sw	t4, 48(a0) +	sw	t5, 52(a0) +	sw	t6, 56(a0) +	sw	t7, 60(a0) + +	addiu	a0, a0, 64	/* adding 64 to dest */ +	sgtu	v1, a0, t9 +	bne	a0, a3, $ua_loop16w +	addiu	a1, a1, 64	/* adding 64 to src */ +	move	a2, t8 + +/* Here we have src and dest word-aligned but less than 64-bytes to go */ + +$ua_chk8w: +	pref 0, 0x0(a1) +	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? 
*/ +				/* the t8 is the reminder count */ +	beq	a2, t8, $ua_chk1w	/* when a2=t8, no 32-byte chunk */ + +	LWHI	t0, 0(a1) +	LWLO	t0, 3(a1) +	LWHI	t1, 4(a1) +	LWLO	t1, 7(a1) +	LWHI	t2, 8(a1) +	LWLO	t2, 11(a1) +	LWHI	t3, 12(a1) +	LWLO	t3, 15(a1) +	LWHI	t4, 16(a1) +	LWLO	t4, 19(a1) +	LWHI	t5, 20(a1) +	LWLO	t5, 23(a1) +	LWHI	t6, 24(a1) +	LWLO	t6, 27(a1) +	LWHI	t7, 28(a1) +	LWLO	t7, 31(a1) +	addiu	a1, a1, 32 + +	sw	t0, 0(a0) +	sw	t1, 4(a0) +	sw	t2, 8(a0) +	sw	t3, 12(a0) +	sw	t4, 16(a0) +	sw	t5, 20(a0) +	sw	t6, 24(a0) +	sw	t7, 28(a0) +	addiu	a0, a0, 32 + +$ua_chk1w: +	andi	a2, t8, 0x3	/* now a2 is the reminder past 1w chunks */ +	beq	a2, t8, $ua_smallCopy +	subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */ +	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */ + +/* copying in words (4-byte chunks) */ +$ua_wordCopy_loop: +	LWHI	v1, 0(a1) +	LWLO	v1, 3(a1) +	addiu	a1, a1, 4 +	addiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */ +	bne	a0, a3, $ua_wordCopy_loop +	sw	v1, -4(a0) + +/* Now less than 4 bytes (value in a2) left to copy */ +$ua_smallCopy: +	beqz	a2, leave +	addu	a3, a0, a2	/* a3 is the last dst address */ +$ua_smallCopy_loop: +	lb	v1, 0(a1) +	addiu	a1, a1, 1 +	addiu	a0, a0, 1 +	bne	a0, a3, $ua_smallCopy_loop +	sb	v1, -1(a0) + +	j	ra +	nop + +END(pixman_mips_fast_memcpy) diff --git a/libs/pixman-0.40.0/pixman/pixman-mips.c b/libs/pixman-0.40.0/pixman/pixman-mips.c new file mode 100644 index 0000000..3048813 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-mips.c @@ -0,0 +1,94 @@ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  SuSE makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "pixman-private.h" + +#if defined(USE_MIPS_DSPR2) || defined(USE_LOONGSON_MMI) + +#include <string.h> +#include <stdlib.h> + +static pixman_bool_t +have_feature (const char *search_string) +{ +#if defined (__linux__) /* linux ELF */ +    /* Simple detection of MIPS features at runtime for Linux. +     * It is based on /proc/cpuinfo, which reveals hardware configuration +     * to user-space applications.  According to MIPS (early 2010), no similar +     * facility is universally available on the MIPS architectures, so it's up +     * to individual OSes to provide such. 
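     *
     * For illustration only (the exact wording of /proc/cpuinfo varies by
     * kernel and SoC, so treat this as an assumed example): a DSPr2-capable
     * 74K core typically reports a line such as
     *
     *     cpu model            : MIPS 74Kc V4.12  FPU V0.0
     *
     * and have_feature ("MIPS 74K") returns TRUE as soon as strstr() finds
     * that substring on any line.  Independently of this probe, the callers
     * below honour the PIXMAN_DISABLE environment variable through
     * _pixman_disabled(), so e.g. PIXMAN_DISABLE="mips-dspr2 loongson-mmi"
     * keeps both fast paths off even on matching hardware.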
+     */ +    const char *file_name = "/proc/cpuinfo"; +    char cpuinfo_line[256]; +    FILE *f = NULL; + +    if ((f = fopen (file_name, "r")) == NULL) +        return FALSE; + +    while (fgets (cpuinfo_line, sizeof (cpuinfo_line), f) != NULL) +    { +        if (strstr (cpuinfo_line, search_string) != NULL) +        { +            fclose (f); +            return TRUE; +        } +    } + +    fclose (f); +#endif + +    /* Did not find string in the proc file, or not Linux ELF. */ +    return FALSE; +} + +#endif + +pixman_implementation_t * +_pixman_mips_get_implementations (pixman_implementation_t *imp) +{ +#ifdef USE_LOONGSON_MMI +    /* I really don't know if some Loongson CPUs don't have MMI. */ +    if (!_pixman_disabled ("loongson-mmi") && have_feature ("Loongson")) +	imp = _pixman_implementation_create_mmx (imp); +#endif + +#ifdef USE_MIPS_DSPR2 +    if (!_pixman_disabled ("mips-dspr2")) +    { +	int already_compiling_everything_for_dspr2 = 0; +#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) +	already_compiling_everything_for_dspr2 = 1; +#endif +	if (already_compiling_everything_for_dspr2 || +	    /* Only currently available MIPS core that supports DSPr2 is 74K. */ +	    have_feature ("MIPS 74K")) +	{ +	    imp = _pixman_implementation_create_mips_dspr2 (imp); +	} +    } +#endif + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-mmx.c b/libs/pixman-0.40.0/pixman/pixman-mmx.c new file mode 100644 index 0000000..d7cf265 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-mmx.c @@ -0,0 +1,4153 @@ +/* + * Copyright © 2004, 2005 Red Hat, Inc. + * Copyright © 2004 Nicholas Miell + * Copyright © 2005 Trolltech AS + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  Red Hat makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + * Author:  Søren Sandmann (sandmann@redhat.com) + * Minor Improvements: Nicholas Miell (nmiell@gmail.com) + * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) + * + * Based on work by Owen Taylor + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI + +#ifdef USE_LOONGSON_MMI +#include <loongson-mmintrin.h> +#else +#include <mmintrin.h> +#endif +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-inlines.h" + +#ifdef VERBOSE +#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__) +#else +#define CHECKPOINT() +#endif + +#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8 +/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_empty (void) +{ + +} +#endif + +#ifdef USE_X86_MMX +# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64)) +#  include <xmmintrin.h> +# else +/* We have to compile with -msse to use xmmintrin.h, but that causes SSE + * instructions to be generated that we don't want. Just duplicate the + * functions we want to use.  */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_pi8 (__m64 __A) +{ +    int ret; + +    asm ("pmovmskb %1, %0\n\t" +	: "=r" (ret) +	: "y" (__A) +    ); + +    return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pu16 (__m64 __A, __m64 __B) +{ +    asm ("pmulhuw %1, %0\n\t" +	: "+y" (__A) +	: "y" (__B) +    ); +    return __A; +} + +# define _mm_shuffle_pi16(A, N)						\ +    ({									\ +	__m64 ret;							\ +									\ +	asm ("pshufw %2, %1, %0\n\t"					\ +	     : "=y" (ret)						\ +	     : "y" (A), "K" ((const int8_t)N)				\ +	);								\ +									\ +	ret;								\ +    }) +# endif +#endif + +#ifndef _MSC_VER +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) +#endif + +/* Notes about writing mmx code + * + * give memory operands as the second operand. If you give it as the + * first, gcc will first load it into a register, then use that + * register + * + *   ie. use + * + *         _mm_mullo_pi16 (x, mmx_constant); + * + *   not + * + *         _mm_mullo_pi16 (mmx_constant, x); + * + * Also try to minimize dependencies. i.e. when you need a value, try + * to calculate it from a value that was calculated as early as + * possible. + */ + +/* --------------- MMX primitives ------------------------------------- */ + +/* If __m64 is defined as a struct or union, then define M64_MEMBER to be + * the name of the member used to access the data. + * If __m64 requires using mm_cvt* intrinsics functions to convert between + * uint64_t and __m64 values, then define USE_CVT_INTRINSICS. + * If __m64 and uint64_t values can just be cast to each other directly, + * then define USE_M64_CASTS. + * If __m64 is a double datatype, then define USE_M64_DOUBLE. 
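 *
 * (These cases are consumed just below: the MC() macro and the
 * to_m64()/to_uint64() helpers select the matching conversion, e.g. a plain
 * cast under USE_M64_CASTS, _mm_cvtsi64_m64()/_mm_cvtm64_si64() under
 * USE_CVT_INTRINSICS, or an assignment to the named struct field when
 * M64_MEMBER is defined.)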
+ */ +#ifdef _MSC_VER +# define M64_MEMBER m64_u64 +#elif defined(__ICC) +# define USE_CVT_INTRINSICS +#elif defined(USE_LOONGSON_MMI) +# define USE_M64_DOUBLE +#elif defined(__GNUC__) +# define USE_M64_CASTS +#elif defined(__SUNPRO_C) +# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__) +/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__) + * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__ + * is defined.   If it is used, then the mm_cvt* intrinsics must be used. + */ +#  define USE_CVT_INTRINSICS +# else +/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is + * disabled, __m64 is defined as a struct containing "unsigned long long l_". + */ +#  define M64_MEMBER l_ +# endif +#endif + +#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE) +typedef uint64_t mmxdatafield; +#else +typedef __m64 mmxdatafield; +#endif + +typedef struct +{ +    mmxdatafield mmx_4x00ff; +    mmxdatafield mmx_4x0080; +    mmxdatafield mmx_565_rgb; +    mmxdatafield mmx_565_unpack_multiplier; +    mmxdatafield mmx_565_pack_multiplier; +    mmxdatafield mmx_565_r; +    mmxdatafield mmx_565_g; +    mmxdatafield mmx_565_b; +    mmxdatafield mmx_packed_565_rb; +    mmxdatafield mmx_packed_565_g; +    mmxdatafield mmx_expand_565_g; +    mmxdatafield mmx_expand_565_b; +    mmxdatafield mmx_expand_565_r; +#ifndef USE_LOONGSON_MMI +    mmxdatafield mmx_mask_0; +    mmxdatafield mmx_mask_1; +    mmxdatafield mmx_mask_2; +    mmxdatafield mmx_mask_3; +#endif +    mmxdatafield mmx_full_alpha; +    mmxdatafield mmx_4x0101; +    mmxdatafield mmx_ff000000; +} mmx_data_t; + +#if defined(_MSC_VER) +# define MMXDATA_INIT(field, val) { val ## UI64 } +#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */ +# define MMXDATA_INIT(field, val) field =   { val ## ULL } +#else                           /* mmxdatafield is an integral type */ +# define MMXDATA_INIT(field, val) field =   val ## ULL +#endif + +static const mmx_data_t c = +{ +    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff), +    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080), +    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f), +    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840), +    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004), +    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000), +    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000), +    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8), +    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8), +    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00), +    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0), +    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f), +    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800), +#ifndef USE_LOONGSON_MMI +    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000), +    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff), +    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff), +    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff), +#endif +    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000), +    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101), +    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000), +}; + +#ifdef USE_CVT_INTRINSICS +#    define 
MC(x) to_m64 (c.mmx_ ## x) +#elif defined(USE_M64_CASTS) +#    define MC(x) ((__m64)c.mmx_ ## x) +#elif defined(USE_M64_DOUBLE) +#    define MC(x) (*(__m64 *)&c.mmx_ ## x) +#else +#    define MC(x) c.mmx_ ## x +#endif + +static force_inline __m64 +to_m64 (uint64_t x) +{ +#ifdef USE_CVT_INTRINSICS +    return _mm_cvtsi64_m64 (x); +#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */ +    __m64 res; + +    res.M64_MEMBER = x; +    return res; +#elif defined USE_M64_DOUBLE +    return *(__m64 *)&x; +#else /* USE_M64_CASTS */ +    return (__m64)x; +#endif +} + +static force_inline uint64_t +to_uint64 (__m64 x) +{ +#ifdef USE_CVT_INTRINSICS +    return _mm_cvtm64_si64 (x); +#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */ +    uint64_t res = x.M64_MEMBER; +    return res; +#elif defined USE_M64_DOUBLE +    return *(uint64_t *)&x; +#else /* USE_M64_CASTS */ +    return (uint64_t)x; +#endif +} + +static force_inline __m64 +shift (__m64 v, +       int   s) +{ +    if (s > 0) +	return _mm_slli_si64 (v, s); +    else if (s < 0) +	return _mm_srli_si64 (v, -s); +    else +	return v; +} + +static force_inline __m64 +negate (__m64 mask) +{ +    return _mm_xor_si64 (mask, MC (4x00ff)); +} + +/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1 + * and maps its result to the same range. + * + * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner: + * Notation, Notation, Notation", the first of which is + * + *   prod(a, b) = (a * b + 128) / 255. + * + * By approximating the division by 255 as 257/65536 it can be replaced by a + * multiply and a right shift. This is the implementation that we use in + * pix_multiply(), but we _mm_mulhi_pu16() by 257 (part of SSE1 or Extended + * 3DNow!, and unavailable at the time of the book's publication) to perform + * the multiplication and right shift in a single operation. + * + *   prod(a, b) = ((a * b + 128) * 257) >> 16. + * + * A third way (how pix_multiply() was implemented prior to 14208344) exists + * also that performs the multiplication by 257 with adds and shifts. + * + * Where temp = a * b + 128 + * + *   prod(a, b) = (temp + (temp >> 8)) >> 8. 
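 *
 * A worked instance (editorial illustration): for a = 0xc8 (200) and
 * b = 0x64 (100), temp = 200 * 100 + 128 = 20128, and the exact form
 * 20128 / 255 as well as both rewritings, (20128 * 257) >> 16 and
 * (20128 + (20128 >> 8)) >> 8, all evaluate to 78 (0x4e).  pix_multiply()
 * below computes the 257-multiply form on four 16-bit lanes at once:
 * _mm_mullo_pi16 by b, _mm_adds_pu16 of 0x0080, then _mm_mulhi_pu16 by
 * 0x0101 (= 257).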
+ */ +static force_inline __m64 +pix_multiply (__m64 a, __m64 b) +{ +    __m64 res; + +    res = _mm_mullo_pi16 (a, b); +    res = _mm_adds_pu16 (res, MC (4x0080)); +    res = _mm_mulhi_pu16 (res, MC (4x0101)); + +    return res; +} + +static force_inline __m64 +pix_add (__m64 a, __m64 b) +{ +    return _mm_adds_pu8 (a, b); +} + +static force_inline __m64 +expand_alpha (__m64 pixel) +{ +    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3)); +} + +static force_inline __m64 +expand_alpha_rev (__m64 pixel) +{ +    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline __m64 +invert_colors (__m64 pixel) +{ +    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2)); +} + +static force_inline __m64 +over (__m64 src, +      __m64 srca, +      __m64 dest) +{ +    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca))); +} + +static force_inline __m64 +over_rev_non_pre (__m64 src, __m64 dest) +{ +    __m64 srca = expand_alpha (src); +    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha)); + +    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest); +} + +static force_inline __m64 +in (__m64 src, __m64 mask) +{ +    return pix_multiply (src, mask); +} + +#ifndef _MSC_VER +static force_inline __m64 +in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest) +{ +    return over (in (src, mask), pix_multiply (srca, mask), dest); +} + +#else + +#define in_over(src, srca, mask, dest)					\ +    over (in (src, mask), pix_multiply (srca, mask), dest) + +#endif + +/* Elemental unaligned loads */ + +static force_inline __m64 ldq_u(__m64 *p) +{ +#ifdef USE_X86_MMX +    /* x86's alignment restrictions are very relaxed, but that's no excuse */ +    __m64 r; +    memcpy(&r, p, sizeof(__m64)); +    return r; +#elif defined USE_ARM_IWMMXT +    int align = (uintptr_t)p & 7; +    __m64 *aligned_p; +    if (align == 0) +	return *p; +    aligned_p = (__m64 *)((uintptr_t)p & ~7); +    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align); +#else +    struct __una_u64 { __m64 x __attribute__((packed)); }; +    const struct __una_u64 *ptr = (const struct __una_u64 *) p; +    return (__m64) ptr->x; +#endif +} + +static force_inline uint32_t ldl_u(const uint32_t *p) +{ +#ifdef USE_X86_MMX +    /* x86's alignment restrictions are very relaxed. 
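       Even so, dereferencing a misaligned uint32_t pointer is undefined
       behaviour in C, so the load is written as a 4-byte memcpy(), which
       current GCC and Clang compile down to a single load; the non-x86
       fallback below gets the same effect from a packed struct, and ldq_u()
       above does the equivalent for 8-byte __m64 loads.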
*/ +    uint32_t r; +    memcpy(&r, p, sizeof(uint32_t)); +    return r; +#else +    struct __una_u32 { uint32_t x __attribute__((packed)); }; +    const struct __una_u32 *ptr = (const struct __una_u32 *) p; +    return ptr->x; +#endif +} + +static force_inline __m64 +load (const uint32_t *v) +{ +#ifdef USE_LOONGSON_MMI +    __m64 ret; +    asm ("lwc1 %0, %1\n\t" +	: "=f" (ret) +	: "m" (*v) +    ); +    return ret; +#else +    return _mm_cvtsi32_si64 (*v); +#endif +} + +static force_inline __m64 +load8888 (const uint32_t *v) +{ +#ifdef USE_LOONGSON_MMI +    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ()); +#else +    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ()); +#endif +} + +static force_inline __m64 +load8888u (const uint32_t *v) +{ +    uint32_t l = ldl_u (v); +    return load8888 (&l); +} + +static force_inline __m64 +pack8888 (__m64 lo, __m64 hi) +{ +    return _mm_packs_pu16 (lo, hi); +} + +static force_inline void +store (uint32_t *dest, __m64 v) +{ +#ifdef USE_LOONGSON_MMI +    asm ("swc1 %1, %0\n\t" +	: "=m" (*dest) +	: "f" (v) +	: "memory" +    ); +#else +    *dest = _mm_cvtsi64_si32 (v); +#endif +} + +static force_inline void +store8888 (uint32_t *dest, __m64 v) +{ +    v = pack8888 (v, _mm_setzero_si64 ()); +    store (dest, v); +} + +static force_inline pixman_bool_t +is_equal (__m64 a, __m64 b) +{ +#ifdef USE_LOONGSON_MMI +    /* __m64 is double, we can compare directly. */ +    return a == b; +#else +    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff; +#endif +} + +static force_inline pixman_bool_t +is_opaque (__m64 v) +{ +#ifdef USE_LOONGSON_MMI +    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha)); +#else +    __m64 ffs = _mm_cmpeq_pi8 (v, v); +    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40); +#endif +} + +static force_inline pixman_bool_t +is_zero (__m64 v) +{ +    return is_equal (v, _mm_setzero_si64 ()); +} + +/* Expand 16 bits positioned at @pos (0-3) of a mmx register into + * + *    00RR00GG00BB + * + * --- Expanding 565 in the low word --- + * + * m = (m << (32 - 3)) | (m << (16 - 5)) | m; + * m = m & (01f0003f001f); + * m = m * (008404100840); + * m = m >> 8; + * + * Note the trick here - the top word is shifted by another nibble to + * avoid it bumping into the middle word + */ +static force_inline __m64 +expand565 (__m64 pixel, int pos) +{ +    __m64 p = pixel; +    __m64 t1, t2; + +    /* move pixel to low 16 bit and zero the rest */ +#ifdef USE_LOONGSON_MMI +    p = loongson_extract_pi16 (p, pos); +#else +    p = shift (shift (p, (3 - pos) * 16), -48); +#endif + +    t1 = shift (p, 36 - 11); +    t2 = shift (p, 16 - 5); + +    p = _mm_or_si64 (t1, p); +    p = _mm_or_si64 (t2, p); +    p = _mm_and_si64 (p, MC (565_rgb)); + +    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier)); +    return _mm_srli_pi16 (pixel, 8); +} + +/* Expand 4 16 bit pixels in an mmx register into two mmx registers of + * + *    AARRGGBBRRGGBB + */ +static force_inline void +expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha) +{ +    __m64 t0, t1, alpha = _mm_setzero_si64 (); +    __m64 r = _mm_and_si64 (vin, MC (expand_565_r)); +    __m64 g = _mm_and_si64 (vin, MC (expand_565_g)); +    __m64 b = _mm_and_si64 (vin, MC (expand_565_b)); +    if (full_alpha) +	alpha = _mm_cmpeq_pi32 (alpha, alpha); + +    /* Replicate high bits into empty low bits. 
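       For example, per lane (editorial illustration): a 5-bit red value
       r = 0b10110 (22) expands to (r << 3) | (r >> 2) = 0b10110101 (181), so
       0x00 stays 0x00 and 0x1f becomes 0xff; a 6-bit green value likewise
       becomes (g << 2) | (g >> 4).  The shifts below do exactly this on the
       packed fields: red sits at bit 11, hence (r >> 8) | (r >> 13); green at
       bit 5, hence (g >> 3) | (g >> 9); blue at bit 0, hence
       (b << 3) | (b >> 2).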
*/ +    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13)); +    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9)); +    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2)); + +    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */ +    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */ +    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */ + +    t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */ +    t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */ + +    *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */ +    *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */ +} + +static force_inline __m64 +expand8888 (__m64 in, int pos) +{ +    if (pos == 0) +	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ()); +    else +	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ()); +} + +static force_inline __m64 +expandx888 (__m64 in, int pos) +{ +    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha)); +} + +static force_inline void +expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha) +{ +    __m64 v0, v1; +    expand_4xpacked565 (vin, &v0, &v1, full_alpha); +    *vout0 = expand8888 (v0, 0); +    *vout1 = expand8888 (v0, 1); +    *vout2 = expand8888 (v1, 0); +    *vout3 = expand8888 (v1, 1); +} + +static force_inline __m64 +pack_565 (__m64 pixel, __m64 target, int pos) +{ +    __m64 p = pixel; +    __m64 t = target; +    __m64 r, g, b; + +    r = _mm_and_si64 (p, MC (565_r)); +    g = _mm_and_si64 (p, MC (565_g)); +    b = _mm_and_si64 (p, MC (565_b)); + +#ifdef USE_LOONGSON_MMI +    r = shift (r, -(32 - 8)); +    g = shift (g, -(16 - 3)); +    b = shift (b, -(0  + 3)); + +    p = _mm_or_si64 (r, g); +    p = _mm_or_si64 (p, b); +    return loongson_insert_pi16 (t, p, pos); +#else +    r = shift (r, -(32 - 8) + pos * 16); +    g = shift (g, -(16 - 3) + pos * 16); +    b = shift (b, -(0  + 3) + pos * 16); + +    if (pos == 0) +	t = _mm_and_si64 (t, MC (mask_0)); +    else if (pos == 1) +	t = _mm_and_si64 (t, MC (mask_1)); +    else if (pos == 2) +	t = _mm_and_si64 (t, MC (mask_2)); +    else if (pos == 3) +	t = _mm_and_si64 (t, MC (mask_3)); + +    p = _mm_or_si64 (r, t); +    p = _mm_or_si64 (g, p); + +    return _mm_or_si64 (b, p); +#endif +} + +static force_inline __m64 +pack_4xpacked565 (__m64 a, __m64 b) +{ +    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb)); +    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb)); + +    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier)); +    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier)); + +    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g)); +    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g)); + +    t0 = _mm_or_si64 (t0, g0); +    t1 = _mm_or_si64 (t1, g1); + +    t0 = shift(t0, -5); +#ifdef USE_ARM_IWMMXT +    t1 = shift(t1, -5); +    return _mm_packs_pu32 (t0, t1); +#else +    t1 = shift(t1, -5 + 16); +    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0)); +#endif +} + +#ifndef _MSC_VER + +static force_inline __m64 +pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3) +{ +    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)); +} + +static force_inline __m64 +pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) +{ +    x = pix_multiply (x, a); +    y = pix_multiply (y, b); + +    return pix_add (x, y); +} + +#else + +/* MSVC only handles a "pass by register" of up to three SSE intrinsics */ + +#define 
pack_4x565(v0, v1, v2, v3) \ +    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)) + +#define pix_add_mul(x, a, y, b)	 \ +    ( x = pix_multiply (x, a),	 \ +      y = pix_multiply (y, b),	 \ +      pix_add (x, y) ) + +#endif + +/* --------------- MMX code patch for fbcompose.c --------------------- */ + +static force_inline __m64 +combine (const uint32_t *src, const uint32_t *mask) +{ +    __m64 vsrc = load8888 (src); + +    if (mask) +    { +	__m64 m = load8888 (mask); + +	m = expand_alpha (m); +	vsrc = pix_multiply (vsrc, m); +    } + +    return vsrc; +} + +static force_inline __m64 +core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst) +{ +    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ()); + +    if (is_opaque (vsrc)) +    { +	return vsrc; +    } +    else if (!is_zero (vsrc)) +    { +	return over (vsrc, expand_alpha (vsrc), +		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ())); +    } + +    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()); +} + +static void +mmx_combine_over_u (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    const uint32_t *end = dest + width; + +    while (dest < end) +    { +	__m64 vsrc = combine (src, mask); + +	if (is_opaque (vsrc)) +	{ +	    store8888 (dest, vsrc); +	} +	else if (!is_zero (vsrc)) +	{ +	    __m64 sa = expand_alpha (vsrc); +	    store8888 (dest, over (vsrc, sa, load8888 (dest))); +	} + +	++dest; +	++src; +	if (mask) +	    ++mask; +    } +    _mm_empty (); +} + +static void +mmx_combine_over_reverse_u (pixman_implementation_t *imp, +                            pixman_op_t              op, +                            uint32_t *               dest, +                            const uint32_t *         src, +                            const uint32_t *         mask, +                            int                      width) +{ +    const uint32_t *end = dest + width; + +    while (dest < end) +    { +	__m64 d, da; +	__m64 s = combine (src, mask); + +	d = load8888 (dest); +	da = expand_alpha (d); +	store8888 (dest, over (d, da, s)); + +	++dest; +	++src; +	if (mask) +	    mask++; +    } +    _mm_empty (); +} + +static void +mmx_combine_in_u (pixman_implementation_t *imp, +                  pixman_op_t              op, +                  uint32_t *               dest, +                  const uint32_t *         src, +                  const uint32_t *         mask, +                  int                      width) +{ +    const uint32_t *end = dest + width; + +    while (dest < end) +    { +	__m64 a; +	__m64 x = combine (src, mask); + +	a = load8888 (dest); +	a = expand_alpha (a); +	x = pix_multiply (x, a); + +	store8888 (dest, x); + +	++dest; +	++src; +	if (mask) +	    mask++; +    } +    _mm_empty (); +} + +static void +mmx_combine_in_reverse_u (pixman_implementation_t *imp, +                          pixman_op_t              op, +                          uint32_t *               dest, +                          const uint32_t *         src, +                          const uint32_t *         mask, +                          int                      width) +{ +    const uint32_t *end = dest + width; + +    while (dest < end) +    { +	__m64 a = combine (src, mask); +	__m64 x; + +	x = load8888 (dest); +	a = expand_alpha (a); +	x = pix_multiply (x, a); +	store8888 (dest, x); + +	++dest; +	
++src; +	if (mask) +	    mask++; +    } +    _mm_empty (); +} + +static void +mmx_combine_out_u (pixman_implementation_t *imp, +                   pixman_op_t              op, +                   uint32_t *               dest, +                   const uint32_t *         src, +                   const uint32_t *         mask, +                   int                      width) +{ +    const uint32_t *end = dest + width; + +    while (dest < end) +    { +	__m64 a; +	__m64 x = combine (src, mask); + +	a = load8888 (dest); +	a = expand_alpha (a); +	a = negate (a); +	x = pix_multiply (x, a); +	store8888 (dest, x); + +	++dest; +	++src; +	if (mask) +	    mask++; +    } +    _mm_empty (); +} + +static void +mmx_combine_out_reverse_u (pixman_implementation_t *imp, +                           pixman_op_t              op, +                           uint32_t *               dest, +                           const uint32_t *         src, +                           const uint32_t *         mask, +                           int                      width) +{ +    const uint32_t *end = dest + width; + +    while (dest < end) +    { +	__m64 a = combine (src, mask); +	__m64 x; + +	x = load8888 (dest); +	a = expand_alpha (a); +	a = negate (a); +	x = pix_multiply (x, a); + +	store8888 (dest, x); + +	++dest; +	++src; +	if (mask) +	    mask++; +    } +    _mm_empty (); +} + +static void +mmx_combine_atop_u (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    const uint32_t *end = dest + width; + +    while (dest < end) +    { +	__m64 da, d, sia; +	__m64 s = combine (src, mask); + +	d = load8888 (dest); +	sia = expand_alpha (s); +	sia = negate (sia); +	da = expand_alpha (d); +	s = pix_add_mul (s, da, d, sia); +	store8888 (dest, s); + +	++dest; +	++src; +	if (mask) +	    mask++; +    } +    _mm_empty (); +} + +static void +mmx_combine_atop_reverse_u (pixman_implementation_t *imp, +                            pixman_op_t              op, +                            uint32_t *               dest, +                            const uint32_t *         src, +                            const uint32_t *         mask, +                            int                      width) +{ +    const uint32_t *end; + +    end = dest + width; + +    while (dest < end) +    { +	__m64 dia, d, sa; +	__m64 s = combine (src, mask); + +	d = load8888 (dest); +	sa = expand_alpha (s); +	dia = expand_alpha (d); +	dia = negate (dia); +	s = pix_add_mul (s, dia, d, sa); +	store8888 (dest, s); + +	++dest; +	++src; +	if (mask) +	    mask++; +    } +    _mm_empty (); +} + +static void +mmx_combine_xor_u (pixman_implementation_t *imp, +                   pixman_op_t              op, +                   uint32_t *               dest, +                   const uint32_t *         src, +                   const uint32_t *         mask, +                   int                      width) +{ +    const uint32_t *end = dest + width; + +    while (dest < end) +    { +	__m64 dia, d, sia; +	__m64 s = combine (src, mask); + +	d = load8888 (dest); +	sia = expand_alpha (s); +	dia = expand_alpha (d); +	sia = negate (sia); +	dia = negate (dia); +	s = pix_add_mul (s, dia, d, sia); +	store8888 (dest, s); + +	++dest; +	++src; +	if (mask) +	    mask++; +    } +    _mm_empty (); +} + +static void +mmx_combine_add_u 
(pixman_implementation_t *imp, +                   pixman_op_t              op, +                   uint32_t *               dest, +                   const uint32_t *         src, +                   const uint32_t *         mask, +                   int                      width) +{ +    const uint32_t *end = dest + width; + +    while (dest < end) +    { +	__m64 d; +	__m64 s = combine (src, mask); + +	d = load8888 (dest); +	s = pix_add (s, d); +	store8888 (dest, s); + +	++dest; +	++src; +	if (mask) +	    mask++; +    } +    _mm_empty (); +} + +static void +mmx_combine_saturate_u (pixman_implementation_t *imp, +                        pixman_op_t              op, +                        uint32_t *               dest, +                        const uint32_t *         src, +                        const uint32_t *         mask, +                        int                      width) +{ +    const uint32_t *end = dest + width; + +    while (dest < end) +    { +	uint32_t s, sa, da; +	uint32_t d = *dest; +	__m64 ms = combine (src, mask); +	__m64 md = load8888 (dest); + +	store8888(&s, ms); +	da = ~d >> 24; +	sa = s >> 24; + +	if (sa > da) +	{ +	    uint32_t quot = DIV_UN8 (da, sa) << 24; +	    __m64 msa = load8888 ("); +	    msa = expand_alpha (msa); +	    ms = pix_multiply (ms, msa); +	} + +	md = pix_add (md, ms); +	store8888 (dest, md); + +	++src; +	++dest; +	if (mask) +	    mask++; +    } +    _mm_empty (); +} + +static void +mmx_combine_src_ca (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); + +	s = pix_multiply (s, a); +	store8888 (dest, s); + +	++src; +	++mask; +	++dest; +    } +    _mm_empty (); +} + +static void +mmx_combine_over_ca (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               dest, +                     const uint32_t *         src, +                     const uint32_t *         mask, +                     int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); +	__m64 d = load8888 (dest); +	__m64 sa = expand_alpha (s); + +	store8888 (dest, in_over (s, sa, a, d)); + +	++src; +	++dest; +	++mask; +    } +    _mm_empty (); +} + +static void +mmx_combine_over_reverse_ca (pixman_implementation_t *imp, +                             pixman_op_t              op, +                             uint32_t *               dest, +                             const uint32_t *         src, +                             const uint32_t *         mask, +                             int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); +	__m64 d = load8888 (dest); +	__m64 da = expand_alpha (d); + +	store8888 (dest, over (d, da, in (s, a))); + +	++src; +	++dest; +	++mask; +    } +    _mm_empty (); +} + +static void +mmx_combine_in_ca (pixman_implementation_t *imp, +                   pixman_op_t              op, +                   uint32_t *               dest, +                   const uint32_t *         src, +                   const 
uint32_t *         mask, +                   int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); +	__m64 d = load8888 (dest); +	__m64 da = expand_alpha (d); + +	s = pix_multiply (s, a); +	s = pix_multiply (s, da); +	store8888 (dest, s); + +	++src; +	++dest; +	++mask; +    } +    _mm_empty (); +} + +static void +mmx_combine_in_reverse_ca (pixman_implementation_t *imp, +                           pixman_op_t              op, +                           uint32_t *               dest, +                           const uint32_t *         src, +                           const uint32_t *         mask, +                           int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); +	__m64 d = load8888 (dest); +	__m64 sa = expand_alpha (s); + +	a = pix_multiply (a, sa); +	d = pix_multiply (d, a); +	store8888 (dest, d); + +	++src; +	++dest; +	++mask; +    } +    _mm_empty (); +} + +static void +mmx_combine_out_ca (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); +	__m64 d = load8888 (dest); +	__m64 da = expand_alpha (d); + +	da = negate (da); +	s = pix_multiply (s, a); +	s = pix_multiply (s, da); +	store8888 (dest, s); + +	++src; +	++dest; +	++mask; +    } +    _mm_empty (); +} + +static void +mmx_combine_out_reverse_ca (pixman_implementation_t *imp, +                            pixman_op_t              op, +                            uint32_t *               dest, +                            const uint32_t *         src, +                            const uint32_t *         mask, +                            int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); +	__m64 d = load8888 (dest); +	__m64 sa = expand_alpha (s); + +	a = pix_multiply (a, sa); +	a = negate (a); +	d = pix_multiply (d, a); +	store8888 (dest, d); + +	++src; +	++dest; +	++mask; +    } +    _mm_empty (); +} + +static void +mmx_combine_atop_ca (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               dest, +                     const uint32_t *         src, +                     const uint32_t *         mask, +                     int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); +	__m64 d = load8888 (dest); +	__m64 da = expand_alpha (d); +	__m64 sa = expand_alpha (s); + +	s = pix_multiply (s, a); +	a = pix_multiply (a, sa); +	a = negate (a); +	d = pix_add_mul (d, a, s, da); +	store8888 (dest, d); + +	++src; +	++dest; +	++mask; +    } +    _mm_empty (); +} + +static void +mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, +                             pixman_op_t              op, +                             uint32_t *               dest, +                             const uint32_t *         src, +                             const uint32_t *     
    mask, +                             int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); +	__m64 d = load8888 (dest); +	__m64 da = expand_alpha (d); +	__m64 sa = expand_alpha (s); + +	s = pix_multiply (s, a); +	a = pix_multiply (a, sa); +	da = negate (da); +	d = pix_add_mul (d, a, s, da); +	store8888 (dest, d); + +	++src; +	++dest; +	++mask; +    } +    _mm_empty (); +} + +static void +mmx_combine_xor_ca (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); +	__m64 d = load8888 (dest); +	__m64 da = expand_alpha (d); +	__m64 sa = expand_alpha (s); + +	s = pix_multiply (s, a); +	a = pix_multiply (a, sa); +	da = negate (da); +	a = negate (a); +	d = pix_add_mul (d, a, s, da); +	store8888 (dest, d); + +	++src; +	++dest; +	++mask; +    } +    _mm_empty (); +} + +static void +mmx_combine_add_ca (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    const uint32_t *end = src + width; + +    while (src < end) +    { +	__m64 a = load8888 (mask); +	__m64 s = load8888 (src); +	__m64 d = load8888 (dest); + +	s = pix_multiply (s, a); +	d = pix_add (s, d); +	store8888 (dest, d); + +	++src; +	++dest; +	++mask; +    } +    _mm_empty (); +} + +/* ------------- MMX code paths called from fbpict.c -------------------- */ + +static void +mmx_composite_over_n_8888 (pixman_implementation_t *imp, +                           pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint32_t    *dst_line, *dst; +    int32_t w; +    int dst_stride; +    __m64 vsrc, vsrca; + +    CHECKPOINT (); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + +    vsrc = load8888 (&src); +    vsrca = expand_alpha (vsrc); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	w = width; + +	CHECKPOINT (); + +	while (w && (uintptr_t)dst & 7) +	{ +	    store8888 (dst, over (vsrc, vsrca, load8888 (dst))); + +	    w--; +	    dst++; +	} + +	while (w >= 2) +	{ +	    __m64 vdest; +	    __m64 dest0, dest1; + +	    vdest = *(__m64 *)dst; + +	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0)); +	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1)); + +	    *(__m64 *)dst = pack8888 (dest0, dest1); + +	    dst += 2; +	    w -= 2; +	} + +	CHECKPOINT (); + +	if (w) +	{ +	    store8888 (dst, over (vsrc, vsrca, load8888 (dst))); +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_n_0565 (pixman_implementation_t *imp, +                           pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint16_t    *dst_line, *dst; +    int32_t w; +    int dst_stride; +    __m64 vsrc, vsrca; + +    CHECKPOINT (); + +    src = _pixman_image_get_solid (imp, src_image, 
dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + +    vsrc = load8888 (&src); +    vsrca = expand_alpha (vsrc); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	w = width; + +	CHECKPOINT (); + +	while (w && (uintptr_t)dst & 7) +	{ +	    uint64_t d = *dst; +	    __m64 vdest = expand565 (to_m64 (d), 0); + +	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); +	    *dst = to_uint64 (vdest); + +	    w--; +	    dst++; +	} + +	while (w >= 4) +	{ +	    __m64 vdest = *(__m64 *)dst; +	    __m64 v0, v1, v2, v3; + +	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); + +	    v0 = over (vsrc, vsrca, v0); +	    v1 = over (vsrc, vsrca, v1); +	    v2 = over (vsrc, vsrca, v2); +	    v3 = over (vsrc, vsrca, v3); + +	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); + +	    dst += 4; +	    w -= 4; +	} + +	CHECKPOINT (); + +	while (w) +	{ +	    uint64_t d = *dst; +	    __m64 vdest = expand565 (to_m64 (d), 0); + +	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); +	    *dst = to_uint64 (vdest); + +	    w--; +	    dst++; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, +                                   pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint32_t    *dst_line; +    uint32_t    *mask_line; +    int dst_stride, mask_stride; +    __m64 vsrc, vsrca; + +    CHECKPOINT (); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + +    vsrc = load8888 (&src); +    vsrca = expand_alpha (vsrc); + +    while (height--) +    { +	int twidth = width; +	uint32_t *p = (uint32_t *)mask_line; +	uint32_t *q = (uint32_t *)dst_line; + +	while (twidth && (uintptr_t)q & 7) +	{ +	    uint32_t m = *(uint32_t *)p; + +	    if (m) +	    { +		__m64 vdest = load8888 (q); +		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); +		store8888 (q, vdest); +	    } + +	    twidth--; +	    p++; +	    q++; +	} + +	while (twidth >= 2) +	{ +	    uint32_t m0, m1; +	    m0 = *p; +	    m1 = *(p + 1); + +	    if (m0 | m1) +	    { +		__m64 dest0, dest1; +		__m64 vdest = *(__m64 *)q; + +		dest0 = in_over (vsrc, vsrca, load8888 (&m0), +		                 expand8888 (vdest, 0)); +		dest1 = in_over (vsrc, vsrca, load8888 (&m1), +		                 expand8888 (vdest, 1)); + +		*(__m64 *)q = pack8888 (dest0, dest1); +	    } + +	    p += 2; +	    q += 2; +	    twidth -= 2; +	} + +	if (twidth) +	{ +	    uint32_t m = *(uint32_t *)p; + +	    if (m) +	    { +		__m64 vdest = load8888 (q); +		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); +		store8888 (q, vdest); +	    } + +	    twidth--; +	    p++; +	    q++; +	} + +	dst_line += dst_stride; +	mask_line += mask_stride; +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, +                                pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    uint32_t mask; +    __m64 vmask; +    int dst_stride, src_stride; +    int32_t w; + +    CHECKPOINT (); + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    
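+    /* Note: the solid mask is expanded to its alpha component once (vmask below)
+     * and reused across the whole image; each pixel then goes through
+     * in_over (s, expand_alpha (s), vmask, d), i.e. the source is scaled by the
+     * constant mask alpha before the OVER step. */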
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); +    vmask = expand_alpha (load8888 (&mask)); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 7) +	{ +	    __m64 s = load8888 (src); +	    __m64 d = load8888 (dst); + +	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); + +	    w--; +	    dst++; +	    src++; +	} + +	while (w >= 2) +	{ +	    __m64 vs = ldq_u ((__m64 *)src); +	    __m64 vd = *(__m64 *)dst; +	    __m64 vsrc0 = expand8888 (vs, 0); +	    __m64 vsrc1 = expand8888 (vs, 1); + +	    *(__m64 *)dst = pack8888 ( +	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)), +	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1))); + +	    w -= 2; +	    dst += 2; +	    src += 2; +	} + +	if (w) +	{ +	    __m64 s = load8888 (src); +	    __m64 d = load8888 (dst); + +	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, +                                pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t *dst_line, *dst; +    uint32_t *src_line, *src; +    uint32_t mask; +    __m64 vmask; +    int dst_stride, src_stride; +    int32_t w; +    __m64 srca; + +    CHECKPOINT (); + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); +    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); + +    vmask = expand_alpha (load8888 (&mask)); +    srca = MC (4x00ff); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 7) +	{ +	    uint32_t ssrc = *src | 0xff000000; +	    __m64 s = load8888 (&ssrc); +	    __m64 d = load8888 (dst); + +	    store8888 (dst, in_over (s, srca, vmask, d)); + +	    w--; +	    dst++; +	    src++; +	} + +	while (w >= 16) +	{ +	    __m64 vd0 = *(__m64 *)(dst + 0); +	    __m64 vd1 = *(__m64 *)(dst + 2); +	    __m64 vd2 = *(__m64 *)(dst + 4); +	    __m64 vd3 = *(__m64 *)(dst + 6); +	    __m64 vd4 = *(__m64 *)(dst + 8); +	    __m64 vd5 = *(__m64 *)(dst + 10); +	    __m64 vd6 = *(__m64 *)(dst + 12); +	    __m64 vd7 = *(__m64 *)(dst + 14); + +	    __m64 vs0 = ldq_u ((__m64 *)(src + 0)); +	    __m64 vs1 = ldq_u ((__m64 *)(src + 2)); +	    __m64 vs2 = ldq_u ((__m64 *)(src + 4)); +	    __m64 vs3 = ldq_u ((__m64 *)(src + 6)); +	    __m64 vs4 = ldq_u ((__m64 *)(src + 8)); +	    __m64 vs5 = ldq_u ((__m64 *)(src + 10)); +	    __m64 vs6 = ldq_u ((__m64 *)(src + 12)); +	    __m64 vs7 = ldq_u ((__m64 *)(src + 14)); + +	    vd0 = pack8888 ( +	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), +	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); + +	    vd1 = pack8888 ( +	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), +	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); + +	    vd2 = pack8888 ( +	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), +	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); + +	    vd3 = pack8888 ( +	        in_over (expandx888 (vs3, 0), srca, vmask, 
expand8888 (vd3, 0)), +	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); + +	    vd4 = pack8888 ( +	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), +	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); + +	    vd5 = pack8888 ( +	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), +	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); + +	    vd6 = pack8888 ( +	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), +	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); + +	    vd7 = pack8888 ( +	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), +	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); + +	    *(__m64 *)(dst + 0) = vd0; +	    *(__m64 *)(dst + 2) = vd1; +	    *(__m64 *)(dst + 4) = vd2; +	    *(__m64 *)(dst + 6) = vd3; +	    *(__m64 *)(dst + 8) = vd4; +	    *(__m64 *)(dst + 10) = vd5; +	    *(__m64 *)(dst + 12) = vd6; +	    *(__m64 *)(dst + 14) = vd7; + +	    w -= 16; +	    dst += 16; +	    src += 16; +	} + +	while (w) +	{ +	    uint32_t ssrc = *src | 0xff000000; +	    __m64 s = load8888 (&ssrc); +	    __m64 d = load8888 (dst); + +	    store8888 (dst, in_over (s, srca, vmask, d)); + +	    w--; +	    dst++; +	    src++; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_8888_8888 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t *dst_line, *dst; +    uint32_t *src_line, *src; +    uint32_t s; +    int dst_stride, src_stride; +    uint8_t a; +    int32_t w; + +    CHECKPOINT (); + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w--) +	{ +	    s = *src++; +	    a = s >> 24; + +	    if (a == 0xff) +	    { +		*dst = s; +	    } +	    else if (s) +	    { +		__m64 ms, sa; +		ms = load8888 (&s); +		sa = expand_alpha (ms); +		store8888 (dst, over (ms, sa, load8888 (dst))); +	    } + +	    dst++; +	} +    } +    _mm_empty (); +} + +static void +mmx_composite_over_8888_0565 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint16_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; + +    CHECKPOINT (); + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +#if 0 +    /* FIXME */ +    assert (src_image->drawable == mask_image->drawable); +#endif + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	CHECKPOINT (); + +	while (w && (uintptr_t)dst & 7) +	{ +	    __m64 vsrc = load8888 (src); +	    uint64_t d = *dst; +	    __m64 vdest = expand565 (to_m64 (d), 0); + +	    vdest = pack_565 ( +		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); + +	    *dst = to_uint64 (vdest); + +	    w--; +	    dst++; +	    src++; +	} + +	CHECKPOINT (); + +	while (w >= 4) +	{ +	    __m64 vdest = *(__m64 *)dst; +	    __m64 v0, v1, v2, v3; +	    __m64 vsrc0, vsrc1, vsrc2, vsrc3; + +	    
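+	    /* Unpack four r5g6b5 destination pixels into 8888 form, apply OVER
+	     * with the per-pixel source alpha, then repack with pack_4x565. */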
expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); + +	    vsrc0 = load8888 ((src + 0)); +	    vsrc1 = load8888 ((src + 1)); +	    vsrc2 = load8888 ((src + 2)); +	    vsrc3 = load8888 ((src + 3)); + +	    v0 = over (vsrc0, expand_alpha (vsrc0), v0); +	    v1 = over (vsrc1, expand_alpha (vsrc1), v1); +	    v2 = over (vsrc2, expand_alpha (vsrc2), v2); +	    v3 = over (vsrc3, expand_alpha (vsrc3), v3); + +	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); + +	    w -= 4; +	    dst += 4; +	    src += 4; +	} + +	CHECKPOINT (); + +	while (w) +	{ +	    __m64 vsrc = load8888 (src); +	    uint64_t d = *dst; +	    __m64 vdest = expand565 (to_m64 (d), 0); + +	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); + +	    *dst = to_uint64 (vdest); + +	    w--; +	    dst++; +	    src++; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, +                             pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca; +    uint32_t *dst_line, *dst; +    uint8_t *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    __m64 vsrc, vsrca; +    uint64_t srcsrc; + +    CHECKPOINT (); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; +    if (src == 0) +	return; + +    srcsrc = (uint64_t)src << 32 | src; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    vsrc = load8888 (&src); +    vsrca = expand_alpha (vsrc); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	CHECKPOINT (); + +	while (w && (uintptr_t)dst & 7) +	{ +	    uint64_t m = *mask; + +	    if (m) +	    { +		__m64 vdest = in_over (vsrc, vsrca, +				       expand_alpha_rev (to_m64 (m)), +				       load8888 (dst)); + +		store8888 (dst, vdest); +	    } + +	    w--; +	    mask++; +	    dst++; +	} + +	CHECKPOINT (); + +	while (w >= 2) +	{ +	    uint64_t m0, m1; + +	    m0 = *mask; +	    m1 = *(mask + 1); + +	    if (srca == 0xff && (m0 & m1) == 0xff) +	    { +		*(uint64_t *)dst = srcsrc; +	    } +	    else if (m0 | m1) +	    { +		__m64 vdest; +		__m64 dest0, dest1; + +		vdest = *(__m64 *)dst; + +		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)), +				 expand8888 (vdest, 0)); +		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)), +				 expand8888 (vdest, 1)); + +		*(__m64 *)dst = pack8888 (dest0, dest1); +	    } + +	    mask += 2; +	    dst += 2; +	    w -= 2; +	} + +	CHECKPOINT (); + +	if (w) +	{ +	    uint64_t m = *mask; + +	    if (m) +	    { +		__m64 vdest = load8888 (dst); + +		vdest = in_over ( +		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest); +		store8888 (dst, vdest); +	    } +	} +    } + +    _mm_empty (); +} + +static pixman_bool_t +mmx_fill (pixman_implementation_t *imp, +          uint32_t *               bits, +          int                      stride, +          int                      bpp, +          int                      x, +          int                      y, +          int                      width, +          int                      height, +          uint32_t		   filler) +{ +    uint64_t fill; +    __m64 vfill; +    uint32_t byte_width; +    uint8_t     *byte_line; + +#if defined __GNUC__ && defined USE_X86_MMX +    __m64 v1, v2, v3, v4, v5, v6, v7; +#endif + +    if (bpp != 16 && bpp != 
32 && bpp != 8) +	return FALSE; + +    if (bpp == 8) +    { +	stride = stride * (int) sizeof (uint32_t) / 1; +	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); +	byte_width = width; +	stride *= 1; +        filler = (filler & 0xff) * 0x01010101; +    } +    else if (bpp == 16) +    { +	stride = stride * (int) sizeof (uint32_t) / 2; +	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); +	byte_width = 2 * width; +	stride *= 2; +        filler = (filler & 0xffff) * 0x00010001; +    } +    else +    { +	stride = stride * (int) sizeof (uint32_t) / 4; +	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); +	byte_width = 4 * width; +	stride *= 4; +    } + +    fill = ((uint64_t)filler << 32) | filler; +    vfill = to_m64 (fill); + +#if defined __GNUC__ && defined USE_X86_MMX +    __asm__ ( +        "movq		%7,	%0\n" +        "movq		%7,	%1\n" +        "movq		%7,	%2\n" +        "movq		%7,	%3\n" +        "movq		%7,	%4\n" +        "movq		%7,	%5\n" +        "movq		%7,	%6\n" +	: "=&y" (v1), "=&y" (v2), "=&y" (v3), +	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7) +	: "y" (vfill)); +#endif + +    while (height--) +    { +	int w; +	uint8_t *d = byte_line; + +	byte_line += stride; +	w = byte_width; + +	if (w >= 1 && ((uintptr_t)d & 1)) +	{ +	    *(uint8_t *)d = (filler & 0xff); +	    w--; +	    d++; +	} + +	if (w >= 2 && ((uintptr_t)d & 3)) +	{ +	    *(uint16_t *)d = filler; +	    w -= 2; +	    d += 2; +	} + +	while (w >= 4 && ((uintptr_t)d & 7)) +	{ +	    *(uint32_t *)d = filler; + +	    w -= 4; +	    d += 4; +	} + +	while (w >= 64) +	{ +#if defined __GNUC__ && defined USE_X86_MMX +	    __asm__ ( +	        "movq	%1,	  (%0)\n" +	        "movq	%2,	 8(%0)\n" +	        "movq	%3,	16(%0)\n" +	        "movq	%4,	24(%0)\n" +	        "movq	%5,	32(%0)\n" +	        "movq	%6,	40(%0)\n" +	        "movq	%7,	48(%0)\n" +	        "movq	%8,	56(%0)\n" +		: +		: "r" (d), +		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3), +		  "y" (v4), "y" (v5), "y" (v6), "y" (v7) +		: "memory"); +#else +	    *(__m64*) (d +  0) = vfill; +	    *(__m64*) (d +  8) = vfill; +	    *(__m64*) (d + 16) = vfill; +	    *(__m64*) (d + 24) = vfill; +	    *(__m64*) (d + 32) = vfill; +	    *(__m64*) (d + 40) = vfill; +	    *(__m64*) (d + 48) = vfill; +	    *(__m64*) (d + 56) = vfill; +#endif +	    w -= 64; +	    d += 64; +	} + +	while (w >= 4) +	{ +	    *(uint32_t *)d = filler; + +	    w -= 4; +	    d += 4; +	} +	if (w >= 2) +	{ +	    *(uint16_t *)d = filler; +	    w -= 2; +	    d += 2; +	} +	if (w >= 1) +	{ +	    *(uint8_t *)d = (filler & 0xff); +	    w--; +	    d++; +	} + +    } + +    _mm_empty (); +    return TRUE; +} + +static void +mmx_composite_src_x888_0565 (pixman_implementation_t *imp, +                             pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint16_t    *dst_line, *dst; +    uint32_t    *src_line, *src, s; +    int dst_stride, src_stride; +    int32_t w; + +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 7) +	{ +	    s = *src++; +	    *dst = convert_8888_to_0565 (s); +	    dst++; +	    w--; +	} + +	while (w >= 4) +	{ +	    __m64 vdest; +	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0)); +	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2)); + +	    vdest = pack_4xpacked565 (vsrc0, vsrc1); + +	 
   *(__m64 *)dst = vdest; + +	    w -= 4; +	    src += 4; +	    dst += 4; +	} + +	while (w) +	{ +	    s = *src++; +	    *dst = convert_8888_to_0565 (s); +	    dst++; +	    w--; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, +                            pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca; +    uint32_t    *dst_line, *dst; +    uint8_t     *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    __m64 vsrc; +    uint64_t srcsrc; + +    CHECKPOINT (); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; +    if (src == 0) +    { +	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, +		  PIXMAN_FORMAT_BPP (dest_image->bits.format), +		  dest_x, dest_y, width, height, 0); +	return; +    } + +    srcsrc = (uint64_t)src << 32 | src; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    vsrc = load8888 (&src); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	CHECKPOINT (); + +	while (w && (uintptr_t)dst & 7) +	{ +	    uint64_t m = *mask; + +	    if (m) +	    { +		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); + +		store8888 (dst, vdest); +	    } +	    else +	    { +		*dst = 0; +	    } + +	    w--; +	    mask++; +	    dst++; +	} + +	CHECKPOINT (); + +	while (w >= 2) +	{ +	    uint64_t m0, m1; +	    m0 = *mask; +	    m1 = *(mask + 1); + +	    if (srca == 0xff && (m0 & m1) == 0xff) +	    { +		*(uint64_t *)dst = srcsrc; +	    } +	    else if (m0 | m1) +	    { +		__m64 dest0, dest1; + +		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0))); +		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1))); + +		*(__m64 *)dst = pack8888 (dest0, dest1); +	    } +	    else +	    { +		*(uint64_t *)dst = 0; +	    } + +	    mask += 2; +	    dst += 2; +	    w -= 2; +	} + +	CHECKPOINT (); + +	if (w) +	{ +	    uint64_t m = *mask; + +	    if (m) +	    { +		__m64 vdest = load8888 (dst); + +		vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); +		store8888 (dst, vdest); +	    } +	    else +	    { +		*dst = 0; +	    } +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, +                             pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca; +    uint16_t *dst_line, *dst; +    uint8_t *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    __m64 vsrc, vsrca, tmp; +    __m64 srcsrcsrcsrc; + +    CHECKPOINT (); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    vsrc = load8888 (&src); +    vsrca = expand_alpha (vsrc); + +    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); +    srcsrcsrcsrc = expand_alpha_rev (tmp); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	CHECKPOINT (); + +	while (w && (uintptr_t)dst & 7) +	{ +	    uint64_t m = *mask; + +	    if (m) +	    { +		uint64_t d = *dst; +		__m64 vd = to_m64 (d); +		
__m64 vdest = in_over ( +		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0)); + +		vd = pack_565 (vdest, _mm_setzero_si64 (), 0); +		*dst = to_uint64 (vd); +	    } + +	    w--; +	    mask++; +	    dst++; +	} + +	CHECKPOINT (); + +	while (w >= 4) +	{ +	    uint64_t m0, m1, m2, m3; +	    m0 = *mask; +	    m1 = *(mask + 1); +	    m2 = *(mask + 2); +	    m3 = *(mask + 3); + +	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) +	    { +		*(__m64 *)dst = srcsrcsrcsrc; +	    } +	    else if (m0 | m1 | m2 | m3) +	    { +		__m64 vdest = *(__m64 *)dst; +		__m64 v0, v1, v2, v3; +		__m64 vm0, vm1, vm2, vm3; + +		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); + +		vm0 = to_m64 (m0); +		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0); + +		vm1 = to_m64 (m1); +		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1); + +		vm2 = to_m64 (m2); +		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2); + +		vm3 = to_m64 (m3); +		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3); + +		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);; +	    } + +	    w -= 4; +	    mask += 4; +	    dst += 4; +	} + +	CHECKPOINT (); + +	while (w) +	{ +	    uint64_t m = *mask; + +	    if (m) +	    { +		uint64_t d = *dst; +		__m64 vd = to_m64 (d); +		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), +				       expand565 (vd, 0)); +		vd = pack_565 (vdest, _mm_setzero_si64 (), 0); +		*dst = to_uint64 (vd); +	    } + +	    w--; +	    mask++; +	    dst++; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, +                                pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint16_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; + +    CHECKPOINT (); + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +#if 0 +    /* FIXME */ +    assert (src_image->drawable == mask_image->drawable); +#endif + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	CHECKPOINT (); + +	while (w && (uintptr_t)dst & 7) +	{ +	    __m64 vsrc = load8888 (src); +	    uint64_t d = *dst; +	    __m64 vdest = expand565 (to_m64 (d), 0); + +	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); + +	    *dst = to_uint64 (vdest); + +	    w--; +	    dst++; +	    src++; +	} + +	CHECKPOINT (); + +	while (w >= 4) +	{ +	    uint32_t s0, s1, s2, s3; +	    unsigned char a0, a1, a2, a3; + +	    s0 = *src; +	    s1 = *(src + 1); +	    s2 = *(src + 2); +	    s3 = *(src + 3); + +	    a0 = (s0 >> 24); +	    a1 = (s1 >> 24); +	    a2 = (s2 >> 24); +	    a3 = (s3 >> 24); + +	    if ((a0 & a1 & a2 & a3) == 0xFF) +	    { +		__m64 v0 = invert_colors (load8888 (&s0)); +		__m64 v1 = invert_colors (load8888 (&s1)); +		__m64 v2 = invert_colors (load8888 (&s2)); +		__m64 v3 = invert_colors (load8888 (&s3)); + +		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); +	    } +	    else if (s0 | s1 | s2 | s3) +	    { +		__m64 vdest = *(__m64 *)dst; +		__m64 v0, v1, v2, v3; + +		__m64 vsrc0 = load8888 (&s0); +		__m64 vsrc1 = load8888 (&s1); +		__m64 vsrc2 = load8888 (&s2); +		__m64 vsrc3 = load8888 (&s3); + +		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); + +		v0 = over_rev_non_pre (vsrc0, v0); +		v1 = over_rev_non_pre (vsrc1, v1); +		v2 = over_rev_non_pre (vsrc2, v2); +		v3 = 
over_rev_non_pre (vsrc3, v3); + +		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); +	    } + +	    w -= 4; +	    dst += 4; +	    src += 4; +	} + +	CHECKPOINT (); + +	while (w) +	{ +	    __m64 vsrc = load8888 (src); +	    uint64_t d = *dst; +	    __m64 vdest = expand565 (to_m64 (d), 0); + +	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); + +	    *dst = to_uint64 (vdest); + +	    w--; +	    dst++; +	    src++; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, +                                pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; + +    CHECKPOINT (); + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +#if 0 +    /* FIXME */ +    assert (src_image->drawable == mask_image->drawable); +#endif + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 7) +	{ +	    __m64 s = load8888 (src); +	    __m64 d = load8888 (dst); + +	    store8888 (dst, over_rev_non_pre (s, d)); + +	    w--; +	    dst++; +	    src++; +	} + +	while (w >= 2) +	{ +	    uint32_t s0, s1; +	    unsigned char a0, a1; +	    __m64 d0, d1; + +	    s0 = *src; +	    s1 = *(src + 1); + +	    a0 = (s0 >> 24); +	    a1 = (s1 >> 24); + +	    if ((a0 & a1) == 0xFF) +	    { +		d0 = invert_colors (load8888 (&s0)); +		d1 = invert_colors (load8888 (&s1)); + +		*(__m64 *)dst = pack8888 (d0, d1); +	    } +	    else if (s0 | s1) +	    { +		__m64 vdest = *(__m64 *)dst; + +		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); +		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); + +		*(__m64 *)dst = pack8888 (d0, d1); +	    } + +	    w -= 2; +	    dst += 2; +	    src += 2; +	} + +	if (w) +	{ +	    __m64 s = load8888 (src); +	    __m64 d = load8888 (dst); + +	    store8888 (dst, over_rev_non_pre (s, d)); +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, +                                   pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint16_t    *dst_line; +    uint32_t    *mask_line; +    int dst_stride, mask_stride; +    __m64 vsrc, vsrca; + +    CHECKPOINT (); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + +    vsrc = load8888 (&src); +    vsrca = expand_alpha (vsrc); + +    while (height--) +    { +	int twidth = width; +	uint32_t *p = (uint32_t *)mask_line; +	uint16_t *q = (uint16_t *)dst_line; + +	while (twidth && ((uintptr_t)q & 7)) +	{ +	    uint32_t m = *(uint32_t *)p; + +	    if (m) +	    { +		uint64_t d = *q; +		__m64 vdest = expand565 (to_m64 (d), 0); +		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); +		*q = to_uint64 (vdest); +	    } + +	    twidth--; +	    p++; +	    q++; +	} + +	while (twidth >= 4) +	{ +	    uint32_t m0, m1, m2, m3; + +	    m0 = *p; +	    m1 = *(p + 1); +	    m2 = *(p + 2); +	    m3 = *(p + 3); + +	    if ((m0 | m1 | m2 | m3)) +	    { +		__m64 vdest = 
*(__m64 *)q; +		__m64 v0, v1, v2, v3; + +		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); + +		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0); +		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1); +		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2); +		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3); + +		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3); +	    } +	    twidth -= 4; +	    p += 4; +	    q += 4; +	} + +	while (twidth) +	{ +	    uint32_t m; + +	    m = *(uint32_t *)p; +	    if (m) +	    { +		uint64_t d = *q; +		__m64 vdest = expand565 (to_m64 (d), 0); +		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); +		*q = to_uint64 (vdest); +	    } + +	    twidth--; +	    p++; +	    q++; +	} + +	mask_line += mask_stride; +	dst_line += dst_stride; +    } + +    _mm_empty (); +} + +static void +mmx_composite_in_n_8_8 (pixman_implementation_t *imp, +                        pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t *dst_line, *dst; +    uint8_t *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    uint32_t src; +    uint8_t sa; +    __m64 vsrc, vsrca; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    sa = src >> 24; + +    vsrc = load8888 (&src); +    vsrca = expand_alpha (vsrc); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w && (uintptr_t)dst & 7) +	{ +	    uint16_t tmp; +	    uint8_t a; +	    uint32_t m, d; + +	    a = *mask++; +	    d = *dst; + +	    m = MUL_UN8 (sa, a, tmp); +	    d = MUL_UN8 (m, d, tmp); + +	    *dst++ = d; +	    w--; +	} + +	while (w >= 4) +	{ +	    __m64 vmask; +	    __m64 vdest; + +	    vmask = load8888u ((uint32_t *)mask); +	    vdest = load8888 ((uint32_t *)dst); + +	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest)); + +	    dst += 4; +	    mask += 4; +	    w -= 4; +	} + +	while (w--) +	{ +	    uint16_t tmp; +	    uint8_t a; +	    uint32_t m, d; + +	    a = *mask++; +	    d = *dst; + +	    m = MUL_UN8 (sa, a, tmp); +	    d = MUL_UN8 (m, d, tmp); + +	    *dst++ = d; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_in_8_8 (pixman_implementation_t *imp, +                      pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint8_t     *src_line, *src; +    int src_stride, dst_stride; +    int32_t w; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 3) +	{ +	    uint8_t s, d; +	    uint16_t tmp; + +	    s = *src; +	    d = *dst; + +	    *dst = MUL_UN8 (s, d, tmp); + +	    src++; +	    dst++; +	    w--; +	} + +	while (w >= 4) +	{ +	    uint32_t *s = (uint32_t *)src; +	    uint32_t *d = (uint32_t *)dst; + +	    store8888 (d, in (load8888u (s), load8888 (d))); + +	    w -= 4; +	    dst += 4; +	    src += 4; +	} + +	while (w--) +	{ +	    uint8_t s, d; +	    uint16_t tmp; + +	    s = *src; +	    d = *dst; + +	    *dst = MUL_UN8 (s, d, tmp); + +	    src++; +	    dst++; +	} +    } + +    _mm_empty (); +} + 
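+/* Editorial note, not part of the upstream pixman sources: the ADD fast paths
+ * below all reduce to a per-channel saturating add.  The wide loops use
+ * _mm_adds_pu8, while the scalar head/tail code in mmx_composite_add_8_8 uses
+ * the branch-free form "t | (0 - (t >> 8))".  A hypothetical reference helper,
+ * shown only to illustrate that trick, could look like:
+ *
+ *     static inline uint8_t add_saturate_un8 (uint8_t s, uint8_t d)
+ *     {
+ *         uint16_t t = (uint16_t)s + d;   // at most 0x1fe, so 9 bits
+ *         return t | (0 - (t >> 8));      // mask is all ones only when t > 0xff
+ *     }
+ *
+ * When the sum overflows a byte, (t >> 8) is 1, so the OR mask is all ones and
+ * the result clamps to 0xff; otherwise the mask is 0 and t is stored as-is. */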
+static void +mmx_composite_add_n_8_8 (pixman_implementation_t *imp, +			 pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint8_t     *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    uint32_t src; +    uint8_t sa; +    __m64 vsrc, vsrca; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    sa = src >> 24; + +    if (src == 0) +	return; + +    vsrc = load8888 (&src); +    vsrca = expand_alpha (vsrc); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w && (uintptr_t)dst & 3) +	{ +	    uint16_t tmp; +	    uint16_t a; +	    uint32_t m, d; +	    uint32_t r; + +	    a = *mask++; +	    d = *dst; + +	    m = MUL_UN8 (sa, a, tmp); +	    r = ADD_UN8 (m, d, tmp); + +	    *dst++ = r; +	    w--; +	} + +	while (w >= 4) +	{ +	    __m64 vmask; +	    __m64 vdest; + +	    vmask = load8888u ((uint32_t *)mask); +	    vdest = load8888 ((uint32_t *)dst); + +	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest)); + +	    dst += 4; +	    mask += 4; +	    w -= 4; +	} + +	while (w--) +	{ +	    uint16_t tmp; +	    uint16_t a; +	    uint32_t m, d; +	    uint32_t r; + +	    a = *mask++; +	    d = *dst; + +	    m = MUL_UN8 (sa, a, tmp); +	    r = ADD_UN8 (m, d, tmp); + +	    *dst++ = r; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_add_8_8 (pixman_implementation_t *imp, +		       pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t *dst_line, *dst; +    uint8_t *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; +    uint8_t s, d; +    uint16_t t; + +    CHECKPOINT (); + +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 7) +	{ +	    s = *src; +	    d = *dst; +	    t = d + s; +	    s = t | (0 - (t >> 8)); +	    *dst = s; + +	    dst++; +	    src++; +	    w--; +	} + +	while (w >= 8) +	{ +	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); +	    dst += 8; +	    src += 8; +	    w -= 8; +	} + +	while (w) +	{ +	    s = *src; +	    d = *dst; +	    t = d + s; +	    s = t | (0 - (t >> 8)); +	    *dst = s; + +	    dst++; +	    src++; +	    w--; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_add_0565_0565 (pixman_implementation_t *imp, +                             pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint16_t    *dst_line, *dst; +    uint32_t	d; +    uint16_t    *src_line, *src; +    uint32_t	s; +    int dst_stride, src_stride; +    int32_t w; + +    CHECKPOINT (); + +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 7) +	{ +	    s = *src++; +	    if (s) +	    { +		d = *dst; +		s = convert_0565_to_8888 
(s); +		if (d) +		{ +		    d = convert_0565_to_8888 (d); +		    UN8x4_ADD_UN8x4 (s, d); +		} +		*dst = convert_8888_to_0565 (s); +	    } +	    dst++; +	    w--; +	} + +	while (w >= 4) +	{ +	    __m64 vdest = *(__m64 *)dst; +	    __m64 vsrc = ldq_u ((__m64 *)src); +	    __m64 vd0, vd1; +	    __m64 vs0, vs1; + +	    expand_4xpacked565 (vdest, &vd0, &vd1, 0); +	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0); + +	    vd0 = _mm_adds_pu8 (vd0, vs0); +	    vd1 = _mm_adds_pu8 (vd1, vs1); + +	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1); + +	    dst += 4; +	    src += 4; +	    w -= 4; +	} + +	while (w--) +	{ +	    s = *src++; +	    if (s) +	    { +		d = *dst; +		s = convert_0565_to_8888 (s); +		if (d) +		{ +		    d = convert_0565_to_8888 (d); +		    UN8x4_ADD_UN8x4 (s, d); +		} +		*dst = convert_8888_to_0565 (s); +	    } +	    dst++; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_add_8888_8888 (pixman_implementation_t *imp, +                             pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; + +    CHECKPOINT (); + +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 7) +	{ +	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), +	                              load ((const uint32_t *)dst))); +	    dst++; +	    src++; +	    w--; +	} + +	while (w >= 2) +	{ +	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); +	    dst += 2; +	    src += 2; +	    w -= 2; +	} + +	if (w) +	{ +	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), +	                              load ((const uint32_t *)dst))); + +	} +    } + +    _mm_empty (); +} + +static pixman_bool_t +mmx_blt (pixman_implementation_t *imp, +         uint32_t *               src_bits, +         uint32_t *               dst_bits, +         int                      src_stride, +         int                      dst_stride, +         int                      src_bpp, +         int                      dst_bpp, +         int                      src_x, +         int                      src_y, +         int                      dest_x, +         int                      dest_y, +         int                      width, +         int                      height) +{ +    uint8_t *   src_bytes; +    uint8_t *   dst_bytes; +    int byte_width; + +    if (src_bpp != dst_bpp) +	return FALSE; + +    if (src_bpp == 16) +    { +	src_stride = src_stride * (int) sizeof (uint32_t) / 2; +	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; +	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); +	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); +	byte_width = 2 * width; +	src_stride *= 2; +	dst_stride *= 2; +    } +    else if (src_bpp == 32) +    { +	src_stride = src_stride * (int) sizeof (uint32_t) / 4; +	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; +	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); +	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); +	byte_width = 4 * width; +	src_stride *= 4; +	dst_stride *= 4; +    } +    else +    { +	return 
FALSE; +    } + +    while (height--) +    { +	int w; +	uint8_t *s = src_bytes; +	uint8_t *d = dst_bytes; +	src_bytes += src_stride; +	dst_bytes += dst_stride; +	w = byte_width; + +	if (w >= 1 && ((uintptr_t)d & 1)) +	{ +	    *(uint8_t *)d = *(uint8_t *)s; +	    w -= 1; +	    s += 1; +	    d += 1; +	} + +	if (w >= 2 && ((uintptr_t)d & 3)) +	{ +	    *(uint16_t *)d = *(uint16_t *)s; +	    w -= 2; +	    s += 2; +	    d += 2; +	} + +	while (w >= 4 && ((uintptr_t)d & 7)) +	{ +	    *(uint32_t *)d = ldl_u ((uint32_t *)s); + +	    w -= 4; +	    s += 4; +	    d += 4; +	} + +	while (w >= 64) +	{ +#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX +	    __asm__ ( +	        "movq	  (%1),	  %%mm0\n" +	        "movq	 8(%1),	  %%mm1\n" +	        "movq	16(%1),	  %%mm2\n" +	        "movq	24(%1),	  %%mm3\n" +	        "movq	32(%1),	  %%mm4\n" +	        "movq	40(%1),	  %%mm5\n" +	        "movq	48(%1),	  %%mm6\n" +	        "movq	56(%1),	  %%mm7\n" + +	        "movq	%%mm0,	  (%0)\n" +	        "movq	%%mm1,	 8(%0)\n" +	        "movq	%%mm2,	16(%0)\n" +	        "movq	%%mm3,	24(%0)\n" +	        "movq	%%mm4,	32(%0)\n" +	        "movq	%%mm5,	40(%0)\n" +	        "movq	%%mm6,	48(%0)\n" +	        "movq	%%mm7,	56(%0)\n" +		: +		: "r" (d), "r" (s) +		: "memory", +		  "%mm0", "%mm1", "%mm2", "%mm3", +		  "%mm4", "%mm5", "%mm6", "%mm7"); +#else +	    __m64 v0 = ldq_u ((__m64 *)(s + 0)); +	    __m64 v1 = ldq_u ((__m64 *)(s + 8)); +	    __m64 v2 = ldq_u ((__m64 *)(s + 16)); +	    __m64 v3 = ldq_u ((__m64 *)(s + 24)); +	    __m64 v4 = ldq_u ((__m64 *)(s + 32)); +	    __m64 v5 = ldq_u ((__m64 *)(s + 40)); +	    __m64 v6 = ldq_u ((__m64 *)(s + 48)); +	    __m64 v7 = ldq_u ((__m64 *)(s + 56)); +	    *(__m64 *)(d + 0)  = v0; +	    *(__m64 *)(d + 8)  = v1; +	    *(__m64 *)(d + 16) = v2; +	    *(__m64 *)(d + 24) = v3; +	    *(__m64 *)(d + 32) = v4; +	    *(__m64 *)(d + 40) = v5; +	    *(__m64 *)(d + 48) = v6; +	    *(__m64 *)(d + 56) = v7; +#endif + +	    w -= 64; +	    s += 64; +	    d += 64; +	} +	while (w >= 4) +	{ +	    *(uint32_t *)d = ldl_u ((uint32_t *)s); + +	    w -= 4; +	    s += 4; +	    d += 4; +	} +	if (w >= 2) +	{ +	    *(uint16_t *)d = *(uint16_t *)s; +	    w -= 2; +	    s += 2; +	    d += 2; +	} +    } + +    _mm_empty (); + +    return TRUE; +} + +static void +mmx_composite_copy_area (pixman_implementation_t *imp, +                         pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); + +    mmx_blt (imp, src_image->bits.bits, +	     dest_image->bits.bits, +	     src_image->bits.rowstride, +	     dest_image->bits.rowstride, +	     PIXMAN_FORMAT_BPP (src_image->bits.format), +	     PIXMAN_FORMAT_BPP (dest_image->bits.format), +	     src_x, src_y, dest_x, dest_y, width, height); +} + +static void +mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, +                                pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t  *src, *src_line; +    uint32_t  *dst, *dst_line; +    uint8_t  *mask, *mask_line; +    int src_stride, mask_stride, dst_stride; +    int32_t w; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	src = src_line; +	src_line += src_stride; +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; + 
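+	/* Per-pixel loop: each x888 source pixel is forced opaque by ORing in
+	 * 0xff000000; a mask byte of 0xff stores that pixel directly, any other
+	 * non-zero mask value goes through in_over with the expanded mask. */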
+	w = width; + +	while (w--) +	{ +	    uint64_t m = *mask; + +	    if (m) +	    { +		uint32_t ssrc = *src | 0xff000000; +		__m64 s = load8888 (&ssrc); + +		if (m == 0xff) +		{ +		    store8888 (dst, s); +		} +		else +		{ +		    __m64 sa = expand_alpha (s); +		    __m64 vm = expand_alpha_rev (to_m64 (m)); +		    __m64 vdest = in_over (s, sa, vm, load8888 (dst)); + +		    store8888 (dst, vdest); +		} +	    } + +	    mask++; +	    dst++; +	    src++; +	} +    } + +    _mm_empty (); +} + +static void +mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, +                                   pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint32_t    *dst_line, *dst; +    int32_t w; +    int dst_stride; +    __m64 vsrc; + +    CHECKPOINT (); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + +    vsrc = load8888 (&src); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	w = width; + +	CHECKPOINT (); + +	while (w && (uintptr_t)dst & 7) +	{ +	    __m64 vdest = load8888 (dst); + +	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); + +	    w--; +	    dst++; +	} + +	while (w >= 2) +	{ +	    __m64 vdest = *(__m64 *)dst; +	    __m64 dest0 = expand8888 (vdest, 0); +	    __m64 dest1 = expand8888 (vdest, 1); + + +	    dest0 = over (dest0, expand_alpha (dest0), vsrc); +	    dest1 = over (dest1, expand_alpha (dest1), vsrc); + +	    *(__m64 *)dst = pack8888 (dest0, dest1); + +	    dst += 2; +	    w -= 2; +	} + +	CHECKPOINT (); + +	if (w) +	{ +	    __m64 vdest = load8888 (dst); + +	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); +	} +    } + +    _mm_empty (); +} + +static force_inline void +scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t*       pd, +                                            const uint32_t* ps, +                                            int32_t         w, +                                            pixman_fixed_t  vx, +                                            pixman_fixed_t  unit_x, +                                            pixman_fixed_t  src_width_fixed, +                                            pixman_bool_t   fully_transparent_src) +{ +    if (fully_transparent_src) +	return; + +    while (w) +    { +	__m64 d = load (pd); +	__m64 s = load (ps + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	store8888 (pd, core_combine_over_u_pixel_mmx (s, d)); +	pd++; + +	w--; +    } + +    _mm_empty (); +} + +FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER, +		       scaled_nearest_scanline_mmx_8888_8888_OVER, +		       uint32_t, uint32_t, COVER) +FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER, +		       scaled_nearest_scanline_mmx_8888_8888_OVER, +		       uint32_t, uint32_t, NONE) +FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER, +		       scaled_nearest_scanline_mmx_8888_8888_OVER, +		       uint32_t, uint32_t, PAD) +FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER, +		       scaled_nearest_scanline_mmx_8888_8888_OVER, +		       uint32_t, uint32_t, NORMAL) + +static force_inline void +scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask, +					      uint32_t *       dst, +					      const uint32_t * src, +					      int32_t          w, +					      pixman_fixed_t   vx, +					      pixman_fixed_t   unit_x, +					      pixman_fixed_t   src_width_fixed, +					      
pixman_bool_t    zero_src) +{ +    __m64 mm_mask; + +    if (zero_src || (*mask >> 24) == 0) +    { +	/* A workaround for https://gcc.gnu.org/PR47759 */ +	_mm_empty (); +	return; +    } + +    mm_mask = expand_alpha (load8888 (mask)); + +    while (w) +    { +	uint32_t s = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	if (s) +	{ +	    __m64 ms = load8888 (&s); +	    __m64 alpha = expand_alpha (ms); +	    __m64 dest  = load8888 (dst); + +	    store8888 (dst, (in_over (ms, alpha, mm_mask, dest))); +	} + +	dst++; +	w--; +    } + +    _mm_empty (); +} + +FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER, +			      scaled_nearest_scanline_mmx_8888_n_8888_OVER, +			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER, +			      scaled_nearest_scanline_mmx_8888_n_8888_OVER, +			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER, +			      scaled_nearest_scanline_mmx_8888_n_8888_OVER, +			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER, +			      scaled_nearest_scanline_mmx_8888_n_8888_OVER, +			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) + +#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) +#define BMSK (BSHIFT - 1) + +#define BILINEAR_DECLARE_VARIABLES						\ +    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\ +    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\ +    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\ +    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\ +    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\ +    const __m64 mm_zero = _mm_setzero_si64 ();					\ +    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) + +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\ +do {										\ +    /* fetch 2x2 pixel block into 2 mmx registers */				\ +    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\ +    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\ +    /* vertical interpolation */						\ +    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\ +    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\ +    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\ +    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\ +    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\ +    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\ +    /* calculate horizontal weights */						\ +    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\ +			  _mm_srli_pi16 (mm_x,					\ +					 16 - BILINEAR_INTERPOLATION_BITS)));	\ +    /* horizontal interpolation */						\ +    __m64 p = _mm_unpacklo_pi16 (lo, hi);					\ +    __m64 q = _mm_unpackhi_pi16 (lo, hi);					\ +    vx += unit_x;								\ +    lo = _mm_madd_pi16 (p, mm_wh);						\ +    hi = _mm_madd_pi16 (q, mm_wh);						\ +    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\ +    /* shift and pack the result */						\ +    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\ +    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\ +    lo = _mm_packs_pi32 (lo, hi);						\ +    lo = _mm_packs_pu16 (lo, lo);						\ +    pix = lo;									\ +} while (0) + +#define BILINEAR_SKIP_ONE_PIXEL()						\ +do {										\ +    vx += unit_x;								\ +    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\ +} while(0) + +static force_inline void 
+scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst, +					    const uint32_t * mask, +					    const uint32_t * src_top, +					    const uint32_t * src_bottom, +					    int32_t          w, +					    int              wt, +					    int              wb, +					    pixman_fixed_t   vx, +					    pixman_fixed_t   unit_x, +					    pixman_fixed_t   max_vx, +					    pixman_bool_t    zero_src) +{ +    BILINEAR_DECLARE_VARIABLES; +    __m64 pix; + +    while (w--) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix); +	store (dst, pix); +	dst++; +    } + +    _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, +			       scaled_bilinear_scanline_mmx_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC, +			       scaled_bilinear_scanline_mmx_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC, +			       scaled_bilinear_scanline_mmx_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC, +			       scaled_bilinear_scanline_mmx_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst, +					     const uint32_t * mask, +					     const uint32_t * src_top, +					     const uint32_t * src_bottom, +					     int32_t          w, +					     int              wt, +					     int              wb, +					     pixman_fixed_t   vx, +					     pixman_fixed_t   unit_x, +					     pixman_fixed_t   max_vx, +					     pixman_bool_t    zero_src) +{ +    BILINEAR_DECLARE_VARIABLES; +    __m64 pix1, pix2; + +    while (w) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + +	if (!is_zero (pix1)) +	{ +	    pix2 = load (dst); +	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2)); +	} + +	w--; +	dst++; +    } + +    _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER, +			       scaled_bilinear_scanline_mmx_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER, +			       scaled_bilinear_scanline_mmx_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER, +			       scaled_bilinear_scanline_mmx_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER, +			       scaled_bilinear_scanline_mmx_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst, +					       const uint8_t  * mask, +					       const uint32_t * src_top, +					       const uint32_t * src_bottom, +					       int32_t          w, +					       int              wt, +					       int              wb, +					       pixman_fixed_t   vx, +					       pixman_fixed_t   unit_x, +					       pixman_fixed_t   max_vx, +					       pixman_bool_t    zero_src) +{ +    BILINEAR_DECLARE_VARIABLES; +    __m64 pix1, pix2; +    uint32_t m; + +    while (w) +    { +	m = (uint32_t) *mask++; + +	if (m) +	{ +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + +	    if (m == 0xff && is_opaque (pix1)) +	    { +		store (dst, pix1); +	    } +	    else +	    { +		__m64 ms, md, 
ma, msa; + +		pix2 = load (dst); +		ma = expand_alpha_rev (to_m64 (m)); +		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ()); +		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ()); + +		msa = expand_alpha (ms); + +		store8888 (dst, (in_over (ms, msa, ma, md))); +	    } +	} +	else +	{ +	    BILINEAR_SKIP_ONE_PIXEL (); +	} + +	w--; +	dst++; +    } + +    _mm_empty (); +} + +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER, +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       COVER, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER, +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       PAD, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER, +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       NONE, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER, +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       NORMAL, FLAG_HAVE_NON_SOLID_MASK) + +static uint32_t * +mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) +{ +    int w = iter->width; +    uint32_t *dst = iter->buffer; +    uint32_t *src = (uint32_t *)iter->bits; + +    iter->bits += iter->stride; + +    while (w && ((uintptr_t)dst) & 7) +    { +	*dst++ = (*src++) | 0xff000000; +	w--; +    } + +    while (w >= 8) +    { +	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0)); +	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2)); +	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4)); +	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6)); + +	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000)); +	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000)); +	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000)); +	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000)); + +	dst += 8; +	src += 8; +	w -= 8; +    } + +    while (w) +    { +	*dst++ = (*src++) | 0xff000000; +	w--; +    } + +    _mm_empty (); +    return iter->buffer; +} + +static uint32_t * +mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) +{ +    int w = iter->width; +    uint32_t *dst = iter->buffer; +    uint16_t *src = (uint16_t *)iter->bits; + +    iter->bits += iter->stride; + +    while (w && ((uintptr_t)dst) & 0x0f) +    { +	uint16_t s = *src++; + +	*dst++ = convert_0565_to_8888 (s); +	w--; +    } + +    while (w >= 4) +    { +	__m64 vsrc = ldq_u ((__m64 *)src); +	__m64 mm0, mm1; + +	expand_4xpacked565 (vsrc, &mm0, &mm1, 1); + +	*(__m64 *)(dst + 0) = mm0; +	*(__m64 *)(dst + 2) = mm1; + +	dst += 4; +	src += 4; +	w -= 4; +    } + +    while (w) +    { +	uint16_t s = *src++; + +	*dst++ = convert_0565_to_8888 (s); +	w--; +    } + +    _mm_empty (); +    return iter->buffer; +} + +static uint32_t * +mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) +{ +    int w = iter->width; +    uint32_t *dst = iter->buffer; +    uint8_t *src = iter->bits; + +    iter->bits += iter->stride; + +    while (w && (((uintptr_t)dst) & 15)) +    { +        *dst++ = (uint32_t)*(src++) << 24; +        w--; +    } + +    while (w >= 8) +    { +	__m64 mm0 = ldq_u ((__m64 *)src); + +	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0); +	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0); +	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1); +	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1); +	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2); +	__m64 mm6 = 
_mm_unpackhi_pi16 (_mm_setzero_si64(), mm2); + +	*(__m64 *)(dst + 0) = mm3; +	*(__m64 *)(dst + 2) = mm4; +	*(__m64 *)(dst + 4) = mm5; +	*(__m64 *)(dst + 6) = mm6; + +	dst += 8; +	src += 8; +	w -= 8; +    } + +    while (w) +    { +	*dst++ = (uint32_t)*(src++) << 24; +	w--; +    } + +    _mm_empty (); +    return iter->buffer; +} + +#define IMAGE_FLAGS							\ +    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\ +     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) + +static const pixman_iter_info_t mmx_iters[] =  +{ +    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, +      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL +    }, +    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW, +      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL +    }, +    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW, +      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL +    }, +    { PIXMAN_null }, +}; + +static const pixman_fast_path_t mmx_fast_paths[] = +{ +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ), +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ), +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ), +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ), +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ), +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ), +    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ), +    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ), +    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ), +    
PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ), +    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ), +    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ), +    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ), +    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ), +    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ), +    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ), + +    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ), +    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ), +    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ), +    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ), +    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ), +    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ), + +    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888), +    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888), + +    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ), +    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ), +    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ), +    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ), +    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ), +    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ), + +    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ), +    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ), +    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ), +    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ), +    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ), +    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ), +    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ), +    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ), +    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ), +    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     
a8b8g8r8, mmx_composite_copy_area           ), +    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ), +    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ), +    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ), +    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ), +    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ), +    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ), + +    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ), +    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ), + +    SIMPLE_NEAREST_FAST_PATH (OVER,   a8r8g8b8, x8r8g8b8, mmx_8888_8888                            ), +    SIMPLE_NEAREST_FAST_PATH (OVER,   a8b8g8r8, x8b8g8r8, mmx_8888_8888                            ), +    SIMPLE_NEAREST_FAST_PATH (OVER,   a8r8g8b8, a8r8g8b8, mmx_8888_8888                            ), +    SIMPLE_NEAREST_FAST_PATH (OVER,   a8b8g8r8, a8b8g8r8, mmx_8888_8888                            ), + +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888                 ), +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888                 ), +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888                 ), +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888                 ), + +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ), + +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ), + +    { PIXMAN_OP_NONE }, +}; + +pixman_implementation_t * +_pixman_implementation_create_mmx (pixman_implementation_t *fallback) +{ +    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths); + +    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; +    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; +    imp->combine_32[PIXMAN_OP_IN] = 
mmx_combine_in_u; +    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; +    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; +    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u; +    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u; +    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u; +    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u; +    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u; +    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u; + +    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; +    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca; +    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca; +    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca; +    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca; +    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; +    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; + +    imp->blt = mmx_blt; +    imp->fill = mmx_fill; + +    imp->iter_info = mmx_iters; + +    return imp; +} + +#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */ diff --git a/libs/pixman-0.40.0/pixman/pixman-noop.c b/libs/pixman-0.40.0/pixman/pixman-noop.c new file mode 100644 index 0000000..e598904 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-noop.c @@ -0,0 +1,161 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2011 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <string.h> +#include <stdlib.h> +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-inlines.h" + +static void +noop_composite (pixman_implementation_t *imp, +		pixman_composite_info_t *info) +{ +    return; +} + +static uint32_t * +noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask) +{ +    uint32_t *result = iter->buffer; + +    iter->buffer += iter->image->bits.rowstride; + +    return result; +} + +static void +noop_init_solid_narrow (pixman_iter_t *iter, +			const pixman_iter_info_t *info) +{  +    pixman_image_t *image = iter->image; +    uint32_t *buffer = iter->buffer; +    uint32_t *end = buffer + iter->width; +    uint32_t color; + +    if (iter->image->type == SOLID) +	color = image->solid.color_32; +    else +	color = image->bits.fetch_pixel_32 (&image->bits, 0, 0); + +    while (buffer < end) +	*(buffer++) = color; +} + +static void +noop_init_solid_wide (pixman_iter_t *iter, +		      const pixman_iter_info_t *info) +{ +    pixman_image_t *image = iter->image; +    argb_t *buffer = (argb_t *)iter->buffer; +    argb_t *end = buffer + iter->width; +    argb_t color; + +    if (iter->image->type == SOLID) +	color = image->solid.color_float; +    else +	color = image->bits.fetch_pixel_float (&image->bits, 0, 0); + +    while (buffer < end) +	*(buffer++) = color; +} + +static void +noop_init_direct_buffer (pixman_iter_t *iter, const pixman_iter_info_t *info) +{ +    pixman_image_t *image = iter->image; + +    iter->buffer = +	image->bits.bits + iter->y * image->bits.rowstride + iter->x; +} + +static void +dest_write_back_direct (pixman_iter_t *iter) +{ +    iter->buffer += iter->image->bits.rowstride; +} + +static const pixman_iter_info_t noop_iters[] = +{ +    /* Source iters */ +    { PIXMAN_any, +      0, ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_SRC, +      NULL, +      _pixman_iter_get_scanline_noop, +      NULL +    }, +    { PIXMAN_solid, +      FAST_PATH_NO_ALPHA_MAP, ITER_NARROW | ITER_SRC, +      noop_init_solid_narrow, +      _pixman_iter_get_scanline_noop, +      NULL, +    }, +    { PIXMAN_solid, +      FAST_PATH_NO_ALPHA_MAP, ITER_WIDE | ITER_SRC, +      noop_init_solid_wide, +      _pixman_iter_get_scanline_noop, +      NULL +    }, +    { PIXMAN_a8r8g8b8, +      FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | +          FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST, +      ITER_NARROW | ITER_SRC, +      noop_init_direct_buffer, +      noop_get_scanline, +      NULL +    }, +    /* Dest iters */ +    { PIXMAN_a8r8g8b8, +      FAST_PATH_STD_DEST_FLAGS, ITER_NARROW | ITER_DEST, +      noop_init_direct_buffer, +      _pixman_iter_get_scanline_noop, +      dest_write_back_direct +    }, +    { PIXMAN_x8r8g8b8, +      FAST_PATH_STD_DEST_FLAGS, ITER_NARROW | ITER_DEST | ITER_LOCALIZED_ALPHA, +      noop_init_direct_buffer, +      _pixman_iter_get_scanline_noop, +      dest_write_back_direct +    }, +    { PIXMAN_null }, +}; + +static const pixman_fast_path_t noop_fast_paths[] = +{ +    { PIXMAN_OP_DST, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, noop_composite }, +    { PIXMAN_OP_NONE }, +}; + +pixman_implementation_t * +_pixman_implementation_create_noop (pixman_implementation_t *fallback) +{ +    pixman_implementation_t *imp = +	_pixman_implementation_create (fallback, noop_fast_paths); +  +    imp->iter_info = noop_iters; + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-ppc.c b/libs/pixman-0.40.0/pixman/pixman-ppc.c new file mode 
100644 index 0000000..a6e7bb0 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-ppc.c @@ -0,0 +1,155 @@ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  SuSE makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "pixman-private.h" + +#ifdef USE_VMX + +/* The CPU detection code needs to be in a file not compiled with + * "-maltivec -mabi=altivec", as gcc would try to save vector register + * across function calls causing SIGILL on cpus without Altivec/vmx. + */ +#ifdef __APPLE__ +#include <sys/sysctl.h> + +static pixman_bool_t +pixman_have_vmx (void) +{ +    int error, have_vmx; +    size_t length = sizeof(have_vmx); + +    error = sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0); + +    if (error) +	return FALSE; + +    return have_vmx; +} + +#elif defined (__OpenBSD__) +#include <sys/param.h> +#include <sys/sysctl.h> +#include <machine/cpu.h> + +static pixman_bool_t +pixman_have_vmx (void) +{ +    int error, have_vmx; +    int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC }; +    size_t length = sizeof(have_vmx); + +    error = sysctl (mib, 2, &have_vmx, &length, NULL, 0); + +    if (error != 0) +	return FALSE; + +    return have_vmx; +} + +#elif defined (__linux__) + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdio.h> +#include <linux/auxvec.h> +#include <asm/cputable.h> + +static pixman_bool_t +pixman_have_vmx (void) +{ +    int have_vmx = FALSE; +    int fd; +    struct +    { +	unsigned long type; +	unsigned long value; +    } aux; + +    fd = open ("/proc/self/auxv", O_RDONLY); +    if (fd >= 0) +    { +	while (read (fd, &aux, sizeof (aux)) == sizeof (aux)) +	{ +	    if (aux.type == AT_HWCAP && (aux.value & PPC_FEATURE_HAS_ALTIVEC)) +	    { +		have_vmx = TRUE; +		break; +	    } +	} + +	close (fd); +    } + +    return have_vmx; +} + +#else /* !__APPLE__ && !__OpenBSD__ && !__linux__ */ +#include <signal.h> +#include <setjmp.h> + +static jmp_buf jump_env; + +static void +vmx_test (int        sig, +	  siginfo_t *si, +	  void *     unused) +{ +    longjmp (jump_env, 1); +} + +static pixman_bool_t +pixman_have_vmx (void) +{ +    struct sigaction sa, osa; +    int jmp_result; + +    sa.sa_flags = SA_SIGINFO; +    sigemptyset (&sa.sa_mask); +    sa.sa_sigaction = vmx_test; +    sigaction (SIGILL, &sa, &osa); +    jmp_result = setjmp (jump_env); +    if (jmp_result == 0) +    { +	asm volatile ( "vor 0, 0, 0" ); +    } +  
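+    /* If the AltiVec instruction above executed without raising SIGILL, the
+     * handler never longjmp'd back and jmp_result is still 0; restore the
+     * previous SIGILL disposition before reporting whether VMX is usable. */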
  sigaction (SIGILL, &osa, NULL); +    return (jmp_result == 0); +} + +#endif /* __APPLE__ */ +#endif /* USE_VMX */ + +pixman_implementation_t * +_pixman_ppc_get_implementations (pixman_implementation_t *imp) +{ +#ifdef USE_VMX +    if (!_pixman_disabled ("vmx") && pixman_have_vmx ()) +	imp = _pixman_implementation_create_vmx (imp); +#endif + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-private.h b/libs/pixman-0.40.0/pixman/pixman-private.h new file mode 100644 index 0000000..d836cc5 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-private.h @@ -0,0 +1,1188 @@ +#ifndef PIXMAN_PRIVATE_H +#define PIXMAN_PRIVATE_H + +/* + * The defines which are shared between C and assembly code + */ + +/* bilinear interpolation precision (must be < 8) */ +#define BILINEAR_INTERPOLATION_BITS 7 +#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS) + +/* + * C specific part + */ + +#ifndef __ASSEMBLER__ + +#ifndef PACKAGE +#  error config.h must be included before pixman-private.h +#endif + +#define PIXMAN_DISABLE_DEPRECATED +#define PIXMAN_USE_INTERNAL_API + +#include "pixman.h" +#include <time.h> +#include <assert.h> +#include <stdio.h> +#include <string.h> +#include <stddef.h> +#include <float.h> + +#include "pixman-compiler.h" + +/* + * Images + */ +typedef struct image_common image_common_t; +typedef struct solid_fill solid_fill_t; +typedef struct gradient gradient_t; +typedef struct linear_gradient linear_gradient_t; +typedef struct horizontal_gradient horizontal_gradient_t; +typedef struct vertical_gradient vertical_gradient_t; +typedef struct conical_gradient conical_gradient_t; +typedef struct radial_gradient radial_gradient_t; +typedef struct bits_image bits_image_t; +typedef struct circle circle_t; + +typedef struct argb_t argb_t; + +struct argb_t +{ +    float a; +    float r; +    float g; +    float b; +}; + +typedef void (*fetch_scanline_t) (bits_image_t   *image, +				  int             x, +				  int             y, +				  int             width, +				  uint32_t       *buffer, +				  const uint32_t *mask); + +typedef uint32_t (*fetch_pixel_32_t) (bits_image_t *image, +				      int           x, +				      int           y); + +typedef argb_t (*fetch_pixel_float_t) (bits_image_t *image, +				       int           x, +				       int           y); + +typedef void (*store_scanline_t) (bits_image_t *  image, +				  int             x, +				  int             y, +				  int             width, +				  const uint32_t *values); + +typedef enum +{ +    BITS, +    LINEAR, +    CONICAL, +    RADIAL, +    SOLID +} image_type_t; + +typedef void (*property_changed_func_t) (pixman_image_t *image); + +struct image_common +{ +    image_type_t                type; +    int32_t                     ref_count; +    pixman_region32_t           clip_region; +    int32_t			alpha_count;	    /* How many times this image is being used as an alpha map */ +    pixman_bool_t               have_clip_region;   /* FALSE if there is no clip */ +    pixman_bool_t               client_clip;        /* Whether the source clip was +						       set by a client */ +    pixman_bool_t               clip_sources;       /* Whether the clip applies when +						     * the image is used as a source +						     */ +    pixman_bool_t		dirty; +    pixman_transform_t *        transform; +    pixman_repeat_t             repeat; +    pixman_filter_t             filter; +    pixman_fixed_t *            filter_params; +    int                         n_filter_params; +    bits_image_t *              alpha_map; +    
int                         alpha_origin_x; +    int                         alpha_origin_y; +    pixman_bool_t               component_alpha; +    property_changed_func_t     property_changed; + +    pixman_image_destroy_func_t destroy_func; +    void *                      destroy_data; + +    uint32_t			flags; +    pixman_format_code_t	extended_format_code; +}; + +struct solid_fill +{ +    image_common_t common; +    pixman_color_t color; + +    uint32_t	   color_32; +    argb_t	   color_float; +}; + +struct gradient +{ +    image_common_t	    common; +    int                     n_stops; +    pixman_gradient_stop_t *stops; +}; + +struct linear_gradient +{ +    gradient_t           common; +    pixman_point_fixed_t p1; +    pixman_point_fixed_t p2; +}; + +struct circle +{ +    pixman_fixed_t x; +    pixman_fixed_t y; +    pixman_fixed_t radius; +}; + +struct radial_gradient +{ +    gradient_t common; + +    circle_t   c1; +    circle_t   c2; + +    circle_t   delta; +    double     a; +    double     inva; +    double     mindr; +}; + +struct conical_gradient +{ +    gradient_t           common; +    pixman_point_fixed_t center; +    double		 angle; +}; + +struct bits_image +{ +    image_common_t             common; +    pixman_format_code_t       format; +    const pixman_indexed_t *   indexed; +    int                        width; +    int                        height; +    uint32_t *                 bits; +    uint32_t *                 free_me; +    int                        rowstride;  /* in number of uint32_t's */ + +    pixman_dither_t            dither; +    uint32_t                   dither_offset_y; +    uint32_t                   dither_offset_x; + +    fetch_scanline_t           fetch_scanline_32; +    fetch_pixel_32_t	       fetch_pixel_32; +    store_scanline_t           store_scanline_32; + +    fetch_scanline_t	       fetch_scanline_float; +    fetch_pixel_float_t	       fetch_pixel_float; +    store_scanline_t           store_scanline_float; + +    /* Used for indirect access to the bits */ +    pixman_read_memory_func_t  read_func; +    pixman_write_memory_func_t write_func; +}; + +union pixman_image +{ +    image_type_t       type; +    image_common_t     common; +    bits_image_t       bits; +    gradient_t         gradient; +    linear_gradient_t  linear; +    conical_gradient_t conical; +    radial_gradient_t  radial; +    solid_fill_t       solid; +}; + +typedef struct pixman_iter_t pixman_iter_t; +typedef uint32_t *(* pixman_iter_get_scanline_t) (pixman_iter_t *iter, const uint32_t *mask); +typedef void      (* pixman_iter_write_back_t)   (pixman_iter_t *iter); +typedef void	  (* pixman_iter_fini_t)	 (pixman_iter_t *iter); + +typedef enum +{ +    ITER_NARROW =               (1 << 0), +    ITER_WIDE =                 (1 << 1), + +    /* "Localized alpha" is when the alpha channel is used only to compute +     * the alpha value of the destination. This means that the computation +     * of the RGB values of the result is independent of the alpha value. +     * +     * For example, the OVER operator has localized alpha for the +     * destination, because the RGB values of the result can be computed +     * without knowing the destination alpha. Similarly, ADD has localized +     * alpha for both source and destination because the RGB values of the +     * result can be computed without knowing the alpha value of source or +     * destination. 
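+     *
+     * Concretely, for premultiplied OVER the per-channel result is
+     *
+     *     dest.rgb = src.rgb + (1 - src.a) * dest.rgb
+     *     dest.a   = src.a   + (1 - src.a) * dest.a
+     *
+     * so the RGB channels of the result never read dest.a; that is what
+     * makes the destination alpha "localized" for this operator.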
+     * +     * When he destination is xRGB, this is useful knowledge, because then +     * we can treat it as if it were ARGB, which means in some cases we can +     * avoid copying it to a temporary buffer. +     */ +    ITER_LOCALIZED_ALPHA =	(1 << 2), +    ITER_IGNORE_ALPHA =		(1 << 3), +    ITER_IGNORE_RGB =		(1 << 4), + +    /* These indicate whether the iterator is for a source +     * or a destination image +     */ +    ITER_SRC =			(1 << 5), +    ITER_DEST =			(1 << 6) +} iter_flags_t; + +struct pixman_iter_t +{ +    /* These are initialized by _pixman_implementation_{src,dest}_init */ +    pixman_image_t *		image; +    uint32_t *			buffer; +    int				x, y; +    int				width; +    int				height; +    iter_flags_t		iter_flags; +    uint32_t			image_flags; + +    /* These function pointers are initialized by the implementation */ +    pixman_iter_get_scanline_t	get_scanline; +    pixman_iter_write_back_t	write_back; +    pixman_iter_fini_t          fini; + +    /* These fields are scratch data that implementations can use */ +    void *			data; +    uint8_t *			bits; +    int				stride; +}; + +typedef struct pixman_iter_info_t pixman_iter_info_t; +typedef void (* pixman_iter_initializer_t) (pixman_iter_t *iter, +                                            const pixman_iter_info_t *info); +struct pixman_iter_info_t +{ +    pixman_format_code_t	format; +    uint32_t			image_flags; +    iter_flags_t		iter_flags; +    pixman_iter_initializer_t	initializer; +    pixman_iter_get_scanline_t	get_scanline; +    pixman_iter_write_back_t	write_back; +}; + +void +_pixman_bits_image_setup_accessors (bits_image_t *image); + +void +_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter); + +void +_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter); + +void +_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t  *iter); + +void +_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); + +void +_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter); + +void +_pixman_image_init (pixman_image_t *image); + +pixman_bool_t +_pixman_bits_image_init (pixman_image_t *     image, +                         pixman_format_code_t format, +                         int                  width, +                         int                  height, +                         uint32_t *           bits, +                         int                  rowstride, +			 pixman_bool_t	      clear); +pixman_bool_t +_pixman_image_fini (pixman_image_t *image); + +pixman_image_t * +_pixman_image_allocate (void); + +pixman_bool_t +_pixman_init_gradient (gradient_t *                  gradient, +                       const pixman_gradient_stop_t *stops, +                       int                           n_stops); +void +_pixman_image_reset_clip_region (pixman_image_t *image); + +void +_pixman_image_validate (pixman_image_t *image); + +#define PIXMAN_IMAGE_GET_LINE(image, x, y, type, out_stride, line, mul)	\ +    do									\ +    {									\ +	uint32_t *__bits__;						\ +	int       __stride__;						\ +        								\ +	__bits__ = image->bits.bits;					\ +	__stride__ = image->bits.rowstride;				\ +	(out_stride) =							\ +	    __stride__ * (int) sizeof (uint32_t) / (int) sizeof (type);	\ +	(line) =							\ +	    ((type *) __bits__) + (out_stride) * (y) + (mul) * (x);	\ +    } while (0) + +/* + * Gradient walker + */ +typedef struct +{ +    float		    a_s, a_b; +    float		    r_s, r_b; +    float		    g_s, g_b; 
+    float		    b_s, b_b; +    pixman_fixed_48_16_t    left_x; +    pixman_fixed_48_16_t    right_x; + +    pixman_gradient_stop_t *stops; +    int                     num_stops; +    pixman_repeat_t	    repeat; + +    pixman_bool_t           need_reset; +} pixman_gradient_walker_t; + +void +_pixman_gradient_walker_init (pixman_gradient_walker_t *walker, +                              gradient_t *              gradient, +			      pixman_repeat_t           repeat); + +void +_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker, +                               pixman_fixed_48_16_t      pos); + +typedef void (*pixman_gradient_walker_write_t) ( +    pixman_gradient_walker_t *walker, +    pixman_fixed_48_16_t      x, +    uint32_t                 *buffer); + +void +_pixman_gradient_walker_write_narrow(pixman_gradient_walker_t *walker, +				     pixman_fixed_48_16_t      x, +				     uint32_t                 *buffer); + +void +_pixman_gradient_walker_write_wide(pixman_gradient_walker_t *walker, +				   pixman_fixed_48_16_t      x, +				   uint32_t                 *buffer); + +typedef void (*pixman_gradient_walker_fill_t) ( +    pixman_gradient_walker_t *walker, +    pixman_fixed_48_16_t      x, +    uint32_t                 *buffer, +    uint32_t                 *end); + +void +_pixman_gradient_walker_fill_narrow(pixman_gradient_walker_t *walker, +				    pixman_fixed_48_16_t      x, +				    uint32_t                 *buffer, +				    uint32_t                 *end); + +void +_pixman_gradient_walker_fill_wide(pixman_gradient_walker_t *walker, +				  pixman_fixed_48_16_t      x, +				  uint32_t                 *buffer, +				  uint32_t                 *end); + +/* + * Edges + */ + +#define MAX_ALPHA(n)    ((1 << (n)) - 1) +#define N_Y_FRAC(n)     ((n) == 1 ? 1 : (1 << ((n) / 2)) - 1) +#define N_X_FRAC(n)     ((n) == 1 ? 1 : (1 << ((n) / 2)) + 1) + +#define STEP_Y_SMALL(n) (pixman_fixed_1 / N_Y_FRAC (n)) +#define STEP_Y_BIG(n)   (pixman_fixed_1 - (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n)) + +#define Y_FRAC_FIRST(n) (STEP_Y_BIG (n) / 2) +#define Y_FRAC_LAST(n)  (Y_FRAC_FIRST (n) + (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n)) + +#define STEP_X_SMALL(n) (pixman_fixed_1 / N_X_FRAC (n)) +#define STEP_X_BIG(n)   (pixman_fixed_1 - (N_X_FRAC (n) - 1) * STEP_X_SMALL (n)) + +#define X_FRAC_FIRST(n) (STEP_X_BIG (n) / 2) +#define X_FRAC_LAST(n)  (X_FRAC_FIRST (n) + (N_X_FRAC (n) - 1) * STEP_X_SMALL (n)) + +#define RENDER_SAMPLES_X(x, n)						\ +    ((n) == 1? 
0 : (pixman_fixed_frac (x) +				\ +		    X_FRAC_FIRST (n)) / STEP_X_SMALL (n)) + +void +pixman_rasterize_edges_accessors (pixman_image_t *image, +                                  pixman_edge_t * l, +                                  pixman_edge_t * r, +                                  pixman_fixed_t  t, +                                  pixman_fixed_t  b); + +/* + * Implementations + */ +typedef struct pixman_implementation_t pixman_implementation_t; + +typedef struct +{ +    pixman_op_t              op; +    pixman_image_t *         src_image; +    pixman_image_t *         mask_image; +    pixman_image_t *         dest_image; +    int32_t                  src_x; +    int32_t                  src_y; +    int32_t                  mask_x; +    int32_t                  mask_y; +    int32_t                  dest_x; +    int32_t                  dest_y; +    int32_t                  width; +    int32_t                  height; + +    uint32_t                 src_flags; +    uint32_t                 mask_flags; +    uint32_t                 dest_flags; +} pixman_composite_info_t; + +#define PIXMAN_COMPOSITE_ARGS(info)					\ +    MAYBE_UNUSED pixman_op_t        op = info->op;			\ +    MAYBE_UNUSED pixman_image_t *   src_image = info->src_image;	\ +    MAYBE_UNUSED pixman_image_t *   mask_image = info->mask_image;	\ +    MAYBE_UNUSED pixman_image_t *   dest_image = info->dest_image;	\ +    MAYBE_UNUSED int32_t            src_x = info->src_x;		\ +    MAYBE_UNUSED int32_t            src_y = info->src_y;		\ +    MAYBE_UNUSED int32_t            mask_x = info->mask_x;		\ +    MAYBE_UNUSED int32_t            mask_y = info->mask_y;		\ +    MAYBE_UNUSED int32_t            dest_x = info->dest_x;		\ +    MAYBE_UNUSED int32_t            dest_y = info->dest_y;		\ +    MAYBE_UNUSED int32_t            width = info->width;		\ +    MAYBE_UNUSED int32_t            height = info->height + +typedef void (*pixman_combine_32_func_t) (pixman_implementation_t *imp, +					  pixman_op_t              op, +					  uint32_t *               dest, +					  const uint32_t *         src, +					  const uint32_t *         mask, +					  int                      width); + +typedef void (*pixman_combine_float_func_t) (pixman_implementation_t *imp, +					     pixman_op_t	      op, +					     float *		      dest, +					     const float *	      src, +					     const float *	      mask, +					     int		      n_pixels); + +typedef void (*pixman_composite_func_t) (pixman_implementation_t *imp, +					 pixman_composite_info_t *info); +typedef pixman_bool_t (*pixman_blt_func_t) (pixman_implementation_t *imp, +					    uint32_t *               src_bits, +					    uint32_t *               dst_bits, +					    int                      src_stride, +					    int                      dst_stride, +					    int                      src_bpp, +					    int                      dst_bpp, +					    int                      src_x, +					    int                      src_y, +					    int                      dest_x, +					    int                      dest_y, +					    int                      width, +					    int                      height); +typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp, +					     uint32_t *               bits, +					     int                      stride, +					     int                      bpp, +					     int                      x, +					     int                      y, +					     int                      width, +					     int                      height, +					     uint32_t                 filler); + +void 
_pixman_setup_combiner_functions_32 (pixman_implementation_t *imp); +void _pixman_setup_combiner_functions_float (pixman_implementation_t *imp); + +typedef struct +{ +    pixman_op_t             op; +    pixman_format_code_t    src_format; +    uint32_t		    src_flags; +    pixman_format_code_t    mask_format; +    uint32_t		    mask_flags; +    pixman_format_code_t    dest_format; +    uint32_t		    dest_flags; +    pixman_composite_func_t func; +} pixman_fast_path_t; + +struct pixman_implementation_t +{ +    pixman_implementation_t *	toplevel; +    pixman_implementation_t *	fallback; +    const pixman_fast_path_t *	fast_paths; +    const pixman_iter_info_t *  iter_info; + +    pixman_blt_func_t		blt; +    pixman_fill_func_t		fill; + +    pixman_combine_32_func_t	combine_32[PIXMAN_N_OPERATORS]; +    pixman_combine_32_func_t	combine_32_ca[PIXMAN_N_OPERATORS]; +    pixman_combine_float_func_t	combine_float[PIXMAN_N_OPERATORS]; +    pixman_combine_float_func_t	combine_float_ca[PIXMAN_N_OPERATORS]; +}; + +uint32_t +_pixman_image_get_solid (pixman_implementation_t *imp, +			 pixman_image_t *         image, +                         pixman_format_code_t     format); + +pixman_implementation_t * +_pixman_implementation_create (pixman_implementation_t *fallback, +			       const pixman_fast_path_t *fast_paths); + +void +_pixman_implementation_lookup_composite (pixman_implementation_t  *toplevel, +					 pixman_op_t               op, +					 pixman_format_code_t      src_format, +					 uint32_t                  src_flags, +					 pixman_format_code_t      mask_format, +					 uint32_t                  mask_flags, +					 pixman_format_code_t      dest_format, +					 uint32_t                  dest_flags, +					 pixman_implementation_t **out_imp, +					 pixman_composite_func_t  *out_func); + +pixman_combine_32_func_t +_pixman_implementation_lookup_combiner (pixman_implementation_t *imp, +					pixman_op_t		 op, +					pixman_bool_t		 component_alpha, +					pixman_bool_t		 wide); + +pixman_bool_t +_pixman_implementation_blt (pixman_implementation_t *imp, +                            uint32_t *               src_bits, +                            uint32_t *               dst_bits, +                            int                      src_stride, +                            int                      dst_stride, +                            int                      src_bpp, +                            int                      dst_bpp, +                            int                      src_x, +                            int                      src_y, +                            int                      dest_x, +                            int                      dest_y, +                            int                      width, +                            int                      height); + +pixman_bool_t +_pixman_implementation_fill (pixman_implementation_t *imp, +                             uint32_t *               bits, +                             int                      stride, +                             int                      bpp, +                             int                      x, +                             int                      y, +                             int                      width, +                             int                      height, +                             uint32_t                 filler); + +void +_pixman_implementation_iter_init (pixman_implementation_t       *imp, +                                  pixman_iter_t                 *iter, +        
                          pixman_image_t                *image, +                                  int                            x, +                                  int                            y, +                                  int                            width, +                                  int                            height, +                                  uint8_t                       *buffer, +                                  iter_flags_t                   flags, +                                  uint32_t                       image_flags); + +/* Specific implementations */ +pixman_implementation_t * +_pixman_implementation_create_general (void); + +pixman_implementation_t * +_pixman_implementation_create_fast_path (pixman_implementation_t *fallback); + +pixman_implementation_t * +_pixman_implementation_create_noop (pixman_implementation_t *fallback); + +#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI +pixman_implementation_t * +_pixman_implementation_create_mmx (pixman_implementation_t *fallback); +#endif + +#ifdef USE_SSE2 +pixman_implementation_t * +_pixman_implementation_create_sse2 (pixman_implementation_t *fallback); +#endif + +#ifdef USE_SSSE3 +pixman_implementation_t * +_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback); +#endif + +#ifdef USE_ARM_SIMD +pixman_implementation_t * +_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback); +#endif + +#ifdef USE_ARM_NEON +pixman_implementation_t * +_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback); +#endif + +#ifdef USE_MIPS_DSPR2 +pixman_implementation_t * +_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback); +#endif + +#ifdef USE_VMX +pixman_implementation_t * +_pixman_implementation_create_vmx (pixman_implementation_t *fallback); +#endif + +pixman_bool_t +_pixman_implementation_disabled (const char *name); + +pixman_implementation_t * +_pixman_x86_get_implementations (pixman_implementation_t *imp); + +pixman_implementation_t * +_pixman_arm_get_implementations (pixman_implementation_t *imp); + +pixman_implementation_t * +_pixman_ppc_get_implementations (pixman_implementation_t *imp); + +pixman_implementation_t * +_pixman_mips_get_implementations (pixman_implementation_t *imp); + +pixman_implementation_t * +_pixman_choose_implementation (void); + +pixman_bool_t +_pixman_disabled (const char *name); + + +/* + * Utilities + */ +pixman_bool_t +_pixman_compute_composite_region32 (pixman_region32_t * region, +				    pixman_image_t *    src_image, +				    pixman_image_t *    mask_image, +				    pixman_image_t *    dest_image, +				    int32_t             src_x, +				    int32_t             src_y, +				    int32_t             mask_x, +				    int32_t             mask_y, +				    int32_t             dest_x, +				    int32_t             dest_y, +				    int32_t             width, +				    int32_t             height); +uint32_t * +_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask); + +void +_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info); + +/* These "formats" all have depth 0, so they + * will never clash with any real ones + */ +#define PIXMAN_null             PIXMAN_FORMAT (0, 0, 0, 0, 0, 0) +#define PIXMAN_solid            PIXMAN_FORMAT (0, 1, 0, 0, 0, 0) +#define PIXMAN_pixbuf		PIXMAN_FORMAT (0, 2, 0, 0, 0, 0) +#define PIXMAN_rpixbuf		PIXMAN_FORMAT (0, 3, 0, 0, 0, 0) +#define PIXMAN_unknown		PIXMAN_FORMAT (0, 4, 0, 0, 0, 0) +#define 
PIXMAN_any		PIXMAN_FORMAT (0, 5, 0, 0, 0, 0) + +#define PIXMAN_OP_any		(PIXMAN_N_OPERATORS + 1) + +#define FAST_PATH_ID_TRANSFORM			(1 <<  0) +#define FAST_PATH_NO_ALPHA_MAP			(1 <<  1) +#define FAST_PATH_NO_CONVOLUTION_FILTER		(1 <<  2) +#define FAST_PATH_NO_PAD_REPEAT			(1 <<  3) +#define FAST_PATH_NO_REFLECT_REPEAT		(1 <<  4) +#define FAST_PATH_NO_ACCESSORS			(1 <<  5) +#define FAST_PATH_NARROW_FORMAT			(1 <<  6) +#define FAST_PATH_COMPONENT_ALPHA		(1 <<  8) +#define FAST_PATH_SAMPLES_OPAQUE		(1 <<  7) +#define FAST_PATH_UNIFIED_ALPHA			(1 <<  9) +#define FAST_PATH_SCALE_TRANSFORM		(1 << 10) +#define FAST_PATH_NEAREST_FILTER		(1 << 11) +#define FAST_PATH_HAS_TRANSFORM			(1 << 12) +#define FAST_PATH_IS_OPAQUE			(1 << 13) +#define FAST_PATH_NO_NORMAL_REPEAT		(1 << 14) +#define FAST_PATH_NO_NONE_REPEAT		(1 << 15) +#define FAST_PATH_X_UNIT_POSITIVE		(1 << 16) +#define FAST_PATH_AFFINE_TRANSFORM		(1 << 17) +#define FAST_PATH_Y_UNIT_ZERO			(1 << 18) +#define FAST_PATH_BILINEAR_FILTER		(1 << 19) +#define FAST_PATH_ROTATE_90_TRANSFORM		(1 << 20) +#define FAST_PATH_ROTATE_180_TRANSFORM		(1 << 21) +#define FAST_PATH_ROTATE_270_TRANSFORM		(1 << 22) +#define FAST_PATH_SAMPLES_COVER_CLIP_NEAREST	(1 << 23) +#define FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR	(1 << 24) +#define FAST_PATH_BITS_IMAGE			(1 << 25) +#define FAST_PATH_SEPARABLE_CONVOLUTION_FILTER  (1 << 26) + +#define FAST_PATH_PAD_REPEAT						\ +    (FAST_PATH_NO_NONE_REPEAT		|				\ +     FAST_PATH_NO_NORMAL_REPEAT		|				\ +     FAST_PATH_NO_REFLECT_REPEAT) + +#define FAST_PATH_NORMAL_REPEAT						\ +    (FAST_PATH_NO_NONE_REPEAT		|				\ +     FAST_PATH_NO_PAD_REPEAT		|				\ +     FAST_PATH_NO_REFLECT_REPEAT) + +#define FAST_PATH_NONE_REPEAT						\ +    (FAST_PATH_NO_NORMAL_REPEAT		|				\ +     FAST_PATH_NO_PAD_REPEAT		|				\ +     FAST_PATH_NO_REFLECT_REPEAT) + +#define FAST_PATH_REFLECT_REPEAT					\ +    (FAST_PATH_NO_NONE_REPEAT		|				\ +     FAST_PATH_NO_NORMAL_REPEAT		|				\ +     FAST_PATH_NO_PAD_REPEAT) + +#define FAST_PATH_STANDARD_FLAGS					\ +    (FAST_PATH_NO_CONVOLUTION_FILTER	|				\ +     FAST_PATH_NO_ACCESSORS		|				\ +     FAST_PATH_NO_ALPHA_MAP		|				\ +     FAST_PATH_NARROW_FORMAT) + +#define FAST_PATH_STD_DEST_FLAGS					\ +    (FAST_PATH_NO_ACCESSORS		|				\ +     FAST_PATH_NO_ALPHA_MAP		|				\ +     FAST_PATH_NARROW_FORMAT) + +#define SOURCE_FLAGS(format)						\ +    (FAST_PATH_STANDARD_FLAGS |						\ +     ((PIXMAN_ ## format == PIXMAN_solid) ?				\ +      0 : (FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | FAST_PATH_NEAREST_FILTER | FAST_PATH_ID_TRANSFORM))) + +#define MASK_FLAGS(format, extra)					\ +    ((PIXMAN_ ## format == PIXMAN_null) ? 
0 : (SOURCE_FLAGS (format) | extra)) + +#define FAST_PATH(op, src, src_flags, mask, mask_flags, dest, dest_flags, func) \ +    PIXMAN_OP_ ## op,							\ +    PIXMAN_ ## src,							\ +    src_flags,							        \ +    PIXMAN_ ## mask,						        \ +    mask_flags,							        \ +    PIXMAN_ ## dest,	                                                \ +    dest_flags,							        \ +    func + +#define PIXMAN_STD_FAST_PATH(op, src, mask, dest, func)			\ +    { FAST_PATH (							\ +	    op,								\ +	    src,  SOURCE_FLAGS (src),					\ +	    mask, MASK_FLAGS (mask, FAST_PATH_UNIFIED_ALPHA),		\ +	    dest, FAST_PATH_STD_DEST_FLAGS,				\ +	    func) } + +#define PIXMAN_STD_FAST_PATH_CA(op, src, mask, dest, func)		\ +    { FAST_PATH (							\ +	    op,								\ +	    src,  SOURCE_FLAGS (src),					\ +	    mask, MASK_FLAGS (mask, FAST_PATH_COMPONENT_ALPHA),		\ +	    dest, FAST_PATH_STD_DEST_FLAGS,				\ +	    func) } + +extern pixman_implementation_t *global_implementation; + +static force_inline pixman_implementation_t * +get_implementation (void) +{ +#ifndef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR +    if (!global_implementation) +	global_implementation = _pixman_choose_implementation (); +#endif +    return global_implementation; +} + +/* This function is exported for the sake of the test suite and not part + * of the ABI. + */ +PIXMAN_EXPORT pixman_implementation_t * +_pixman_internal_only_get_implementation (void); + +/* Memory allocation helpers */ +void * +pixman_malloc_ab (unsigned int n, unsigned int b); + +void * +pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c); + +void * +pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c); + +pixman_bool_t +_pixman_multiply_overflows_size (size_t a, size_t b); + +pixman_bool_t +_pixman_multiply_overflows_int (unsigned int a, unsigned int b); + +pixman_bool_t +_pixman_addition_overflows_int (unsigned int a, unsigned int b); + +/* Compositing utilities */ +void +pixman_expand_to_float (argb_t               *dst, +			const uint32_t       *src, +			pixman_format_code_t  format, +			int                   width); + +void +pixman_contract_from_float (uint32_t     *dst, +			    const argb_t *src, +			    int           width); + +/* Region Helpers */ +pixman_bool_t +pixman_region32_copy_from_region16 (pixman_region32_t *dst, +                                    pixman_region16_t *src); + +pixman_bool_t +pixman_region16_copy_from_region32 (pixman_region16_t *dst, +                                    pixman_region32_t *src); + +/* Doubly linked lists */ +typedef struct pixman_link_t pixman_link_t; +struct pixman_link_t +{ +    pixman_link_t *next; +    pixman_link_t *prev; +}; + +typedef struct pixman_list_t pixman_list_t; +struct pixman_list_t +{ +    pixman_link_t *head; +    pixman_link_t *tail; +}; + +static force_inline void +pixman_list_init (pixman_list_t *list) +{ +    list->head = (pixman_link_t *)list; +    list->tail = (pixman_link_t *)list; +} + +static force_inline void +pixman_list_prepend (pixman_list_t *list, pixman_link_t *link) +{ +    link->next = list->head; +    link->prev = (pixman_link_t *)list; +    list->head->prev = link; +    list->head = link; +} + +static force_inline void +pixman_list_unlink (pixman_link_t *link) +{ +    link->prev->next = link->next; +    link->next->prev = link->prev; +} + +static force_inline void +pixman_list_move_to_front (pixman_list_t *list, pixman_link_t *link) +{ +    pixman_list_unlink (link); +    pixman_list_prepend (list, link); +} + +/* Misc macros */ + 
+#ifndef FALSE +#   define FALSE 0 +#endif + +#ifndef TRUE +#   define TRUE 1 +#endif + +#ifndef MIN +#  define MIN(a, b) ((a < b) ? a : b) +#endif + +#ifndef MAX +#  define MAX(a, b) ((a > b) ? a : b) +#endif + +/* Integer division that rounds towards -infinity */ +#define DIV(a, b)					   \ +    ((((a) < 0) == ((b) < 0)) ? (a) / (b) :                \ +     ((a) - (b) + 1 - (((b) < 0) << 1)) / (b)) + +/* Modulus that produces the remainder wrt. DIV */ +#define MOD(a, b) ((a) < 0 ? ((b) - ((-(a) - 1) % (b))) - 1 : (a) % (b)) + +#define CLIP(v, low, high) ((v) < (low) ? (low) : ((v) > (high) ? (high) : (v))) + +#define FLOAT_IS_ZERO(f)     (-FLT_MIN < (f) && (f) < FLT_MIN) + +/* Conversion between 8888 and 0565 */ + +static force_inline uint16_t +convert_8888_to_0565 (uint32_t s) +{ +    /* The following code can be compiled into just 4 instructions on ARM */ +    uint32_t a, b; +    a = (s >> 3) & 0x1F001F; +    b = s & 0xFC00; +    a |= a >> 5; +    a |= b >> 5; +    return (uint16_t)a; +} + +static force_inline uint32_t +convert_0565_to_0888 (uint16_t s) +{ +    return (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) | +            ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) | +            ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000))); +} + +static force_inline uint32_t +convert_0565_to_8888 (uint16_t s) +{ +    return convert_0565_to_0888 (s) | 0xff000000; +} + +/* Trivial versions that are useful in macros */ + +static force_inline uint32_t +convert_8888_to_8888 (uint32_t s) +{ +    return s; +} + +static force_inline uint32_t +convert_x888_to_8888 (uint32_t s) +{ +    return s | 0xff000000; +} + +static force_inline uint16_t +convert_0565_to_0565 (uint16_t s) +{ +    return s; +} + +#define PIXMAN_FORMAT_IS_WIDE(f)					\ +    (PIXMAN_FORMAT_A (f) > 8 ||						\ +     PIXMAN_FORMAT_R (f) > 8 ||						\ +     PIXMAN_FORMAT_G (f) > 8 ||						\ +     PIXMAN_FORMAT_B (f) > 8 ||						\ +     PIXMAN_FORMAT_TYPE (f) == PIXMAN_TYPE_ARGB_SRGB) + +#ifdef WORDS_BIGENDIAN +#   define SCREEN_SHIFT_LEFT(x,n)	((x) << (n)) +#   define SCREEN_SHIFT_RIGHT(x,n)	((x) >> (n)) +#else +#   define SCREEN_SHIFT_LEFT(x,n)	((x) >> (n)) +#   define SCREEN_SHIFT_RIGHT(x,n)	((x) << (n)) +#endif + +static force_inline uint32_t +unorm_to_unorm (uint32_t val, int from_bits, int to_bits) +{ +    uint32_t result; + +    if (from_bits == 0) +	return 0; + +    /* Delete any extra bits */ +    val &= ((1 << from_bits) - 1); + +    if (from_bits >= to_bits) +	return val >> (from_bits - to_bits); + +    /* Start out with the high bit of val in the high bit of result. */ +    result = val << (to_bits - from_bits); + +    /* Copy the bits in result, doubling the number of bits each time, until +     * we fill all to_bits. Unrolled manually because from_bits and to_bits +     * are usually known statically, so the compiler can turn all of this +     * into a few shifts. 
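+     *
+     * For example, expanding a 5-bit value to 8 bits: val = 0x16 (10110 in
+     * binary) first becomes result = 0x16 << 3 = 0xb0 (10110000); a single
+     * REPLICATE step then ORs in result >> 5, giving 0xb5 (10110101), so
+     * the top bits are repeated into the low bits and 0x1f maps to 0xff
+     * while 0x00 maps to 0x00.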
+     */ +#define REPLICATE()							\ +    do									\ +    {									\ +	if (from_bits < to_bits)					\ +	{								\ +	    result |= result >> from_bits;				\ +									\ +	    from_bits *= 2;						\ +	}								\ +    }									\ +    while (0) + +    REPLICATE(); +    REPLICATE(); +    REPLICATE(); +    REPLICATE(); +    REPLICATE(); + +    return result; +} + +uint16_t pixman_float_to_unorm (float f, int n_bits); +float pixman_unorm_to_float (uint16_t u, int n_bits); + +/* + * Various debugging code + */ + +#undef DEBUG + +#define COMPILE_TIME_ASSERT(x)						\ +    do { typedef int compile_time_assertion [(x)?1:-1]; } while (0) + +/* Turn on debugging depending on what type of release this is + */ +#if (((PIXMAN_VERSION_MICRO % 2) == 0) && ((PIXMAN_VERSION_MINOR % 2) == 1)) + +/* Debugging gets turned on for development releases because these + * are the things that end up in bleeding edge distributions such + * as Rawhide etc. + * + * For performance reasons we don't turn it on for stable releases or + * random git checkouts. (Random git checkouts are often used for + * performance work). + */ + +#    define DEBUG + +#endif + +void +_pixman_log_error (const char *function, const char *message); + +#define return_if_fail(expr)                                            \ +    do                                                                  \ +    {                                                                   \ +	if (unlikely (!(expr)))                                         \ +	{								\ +	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \ +	    return;							\ +	}								\ +    }                                                                   \ +    while (0) + +#define return_val_if_fail(expr, retval)                                \ +    do                                                                  \ +    {                                                                   \ +	if (unlikely (!(expr)))                                         \ +	{								\ +	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \ +	    return (retval);						\ +	}								\ +    }                                                                   \ +    while (0) + +#define critical_if_fail(expr)						\ +    do									\ +    {									\ +	if (unlikely (!(expr)))                                         \ +	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \ +    }									\ +    while (0) + +/* + * Matrix + */ + +typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t; + +PIXMAN_EXPORT +pixman_bool_t +pixman_transform_point_31_16 (const pixman_transform_t    *t, +                              const pixman_vector_48_16_t *v, +                              pixman_vector_48_16_t       *result); + +PIXMAN_EXPORT +void +pixman_transform_point_31_16_3d (const pixman_transform_t    *t, +                                 const pixman_vector_48_16_t *v, +                                 pixman_vector_48_16_t       *result); + +PIXMAN_EXPORT +void +pixman_transform_point_31_16_affine (const pixman_transform_t    *t, +                                     const pixman_vector_48_16_t *v, +                                     pixman_vector_48_16_t       *result); + +/* + * Timers + */ + +#ifdef PIXMAN_TIMERS + +static inline uint64_t +oil_profile_stamp_rdtsc (void) +{ +    uint32_t hi, lo; + +    __asm__ __volatile__ ("rdtsc\n" : "=a" (lo), "=d" (hi)); + +    return lo | (((uint64_t)hi) << 32); +} + +#define OIL_STAMP 
oil_profile_stamp_rdtsc + +typedef struct pixman_timer_t pixman_timer_t; + +struct pixman_timer_t +{ +    int             initialized; +    const char *    name; +    uint64_t        n_times; +    uint64_t        total; +    pixman_timer_t *next; +}; + +extern int timer_defined; + +void pixman_timer_register (pixman_timer_t *timer); + +#define TIMER_BEGIN(tname)                                              \ +    {                                                                   \ +	static pixman_timer_t timer ## tname;                           \ +	uint64_t              begin ## tname;                           \ +        								\ +	if (!timer ## tname.initialized)				\ +	{                                                               \ +	    timer ## tname.initialized = 1;				\ +	    timer ## tname.name = # tname;				\ +	    pixman_timer_register (&timer ## tname);			\ +	}                                                               \ +									\ +	timer ## tname.n_times++;					\ +	begin ## tname = OIL_STAMP (); + +#define TIMER_END(tname)                                                \ +    timer ## tname.total += OIL_STAMP () - begin ## tname;		\ +    } + +#else + +#define TIMER_BEGIN(tname) +#define TIMER_END(tname) + +#endif /* PIXMAN_TIMERS */ + +#endif /* __ASSEMBLER__ */ + +#endif /* PIXMAN_PRIVATE_H */ diff --git a/libs/pixman-0.40.0/pixman/pixman-radial-gradient.c b/libs/pixman-0.40.0/pixman/pixman-radial-gradient.c new file mode 100644 index 0000000..e8e99c9 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-radial-gradient.c @@ -0,0 +1,509 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + * Copyright © 2000 SuSE, Inc. + *             2005 Lars Knoll & Zack Rusin, Trolltech + * Copyright © 2007 Red Hat, Inc. + * + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <stdlib.h> +#include <math.h> +#include "pixman-private.h" + +static inline pixman_fixed_32_32_t +dot (pixman_fixed_48_16_t x1, +     pixman_fixed_48_16_t y1, +     pixman_fixed_48_16_t z1, +     pixman_fixed_48_16_t x2, +     pixman_fixed_48_16_t y2, +     pixman_fixed_48_16_t z2) +{ +    /* +     * Exact computation, assuming that the input values can +     * be represented as pixman_fixed_16_16_t +     */ +    return x1 * x2 + y1 * y2 + z1 * z2; +} + +static inline double +fdot (double x1, +      double y1, +      double z1, +      double x2, +      double y2, +      double z2) +{ +    /* +     * Error can be unbound in some special cases. +     * Using clever dot product algorithms (for example compensated +     * dot product) would improve this but make the code much less +     * obvious +     */ +    return x1 * x2 + y1 * y2 + z1 * z2; +} + +static void +radial_write_color (double                         a, +		    double                         b, +		    double                         c, +		    double                         inva, +		    double                         dr, +		    double                         mindr, +		    pixman_gradient_walker_t      *walker, +		    pixman_repeat_t                repeat, +		    int                            Bpp, +		    pixman_gradient_walker_write_t write_pixel, +		    uint32_t                      *buffer) +{ +    /* +     * In this function error propagation can lead to bad results: +     *  - discr can have an unbound error (if b*b-a*c is very small), +     *    potentially making it the opposite sign of what it should have been +     *    (thus clearing a pixel that would have been colored or vice-versa) +     *    or propagating the error to sqrtdiscr; +     *    if discr has the wrong sign or b is very small, this can lead to bad +     *    results +     * +     *  - the algorithm used to compute the solutions of the quadratic +     *    equation is not numerically stable (but saves one division compared +     *    to the numerically stable one); +     *    this can be a problem if a*c is much smaller than b*b +     * +     *  - the above problems are worse if a is small (as inva becomes bigger) +     */ +    double discr; + +    if (a == 0) +    { +	double t; + +	if (b == 0) +	{ +	    memset (buffer, 0, Bpp); +	    return; +	} + +	t = pixman_fixed_1 / 2 * c / b; +	if (repeat == PIXMAN_REPEAT_NONE) +	{ +	    if (0 <= t && t <= pixman_fixed_1) +	    { +		write_pixel (walker, t, buffer); +		return; +	    } +	} +	else +	{ +	    if (t * dr >= mindr) +	    { +		write_pixel (walker, t, buffer); +		return; +	    } +	} + +	memset (buffer, 0, Bpp); +	return; +    } + +    discr = fdot (b, a, 0, b, -c, 0); +    if (discr >= 0) +    { +	double sqrtdiscr, t0, t1; + +	sqrtdiscr = sqrt (discr); +	t0 = (b + sqrtdiscr) * inva; +	t1 = (b - sqrtdiscr) * inva; + +	/* +	 * The root that must be used is the biggest one that belongs +	 * to the valid range ([0,1] for PIXMAN_REPEAT_NONE, any +	 * solution that results in a positive radius otherwise). +	 * +	 * If a > 0, t0 is the biggest solution, so if it is valid, it +	 * is the correct result. +	 * +	 * If a < 0, only one of the solutions can be valid, so the +	 * order in which they are tested is not important. 
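+	 *
+	 * In terms of the quadratic A·t² - 2·B·t + C = 0 being solved here,
+	 * the two candidates are t = (B ± ⎷(B² - A·C)) / A; t0 and t1 are
+	 * exactly that, computed from sqrtdiscr = ⎷discr and inva = 1/A.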
+	 */ +	if (repeat == PIXMAN_REPEAT_NONE) +	{ +	    if (0 <= t0 && t0 <= pixman_fixed_1) +	    { +		write_pixel (walker, t0, buffer); +		return; +	    } +	    else if (0 <= t1 && t1 <= pixman_fixed_1) +	    { +		write_pixel (walker, t1, buffer); +		return; +           } +	} +	else +	{ +	    if (t0 * dr >= mindr) +	    { +		write_pixel (walker, t0, buffer); +		return; +	    } +	    else if (t1 * dr >= mindr) +	    { +		write_pixel (walker, t1, buffer); +		return; +	    } +	} +    } + +    memset (buffer, 0, Bpp); +    return; +} + +static uint32_t * +radial_get_scanline (pixman_iter_t                 *iter, +		     const uint32_t                *mask, +		     int                            Bpp, +		     pixman_gradient_walker_write_t write_pixel) +{ +    /* +     * Implementation of radial gradients following the PDF specification. +     * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference +     * Manual (PDF 32000-1:2008 at the time of this writing). +     * +     * In the radial gradient problem we are given two circles (c₁,r₁) and +     * (c₂,r₂) that define the gradient itself. +     * +     * Mathematically the gradient can be defined as the family of circles +     * +     *     ((1-t)·c₁ + t·(c₂), (1-t)·r₁ + t·r₂) +     * +     * excluding those circles whose radius would be < 0. When a point +     * belongs to more than one circle, the one with a bigger t is the only +     * one that contributes to its color. When a point does not belong +     * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0). +     * Further limitations on the range of values for t are imposed when +     * the gradient is not repeated, namely t must belong to [0,1]. +     * +     * The graphical result is the same as drawing the valid (radius > 0) +     * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient +     * is not repeated) using SOURCE operator composition. +     * +     * It looks like a cone pointing towards the viewer if the ending circle +     * is smaller than the starting one, a cone pointing inside the page if +     * the starting circle is the smaller one and like a cylinder if they +     * have the same radius. +     * +     * What we actually do is, given the point whose color we are interested +     * in, compute the t values for that point, solving for t in: +     * +     *     length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂ +     * +     * Let's rewrite it in a simpler way, by defining some auxiliary +     * variables: +     * +     *     cd = c₂ - c₁ +     *     pd = p - c₁ +     *     dr = r₂ - r₁ +     *     length(t·cd - pd) = r₁ + t·dr +     * +     * which actually means +     * +     *     hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr +     * +     * or +     * +     *     ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr. 
+     * +     * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes: +     * +     *     (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)² +     * +     * where we can actually expand the squares and solve for t: +     * +     *     t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² = +     *       = r₁² + 2·r₁·t·dr + t²·dr² +     * +     *     (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t + +     *         (pdx² + pdy² - r₁²) = 0 +     * +     *     A = cdx² + cdy² - dr² +     *     B = pdx·cdx + pdy·cdy + r₁·dr +     *     C = pdx² + pdy² - r₁² +     *     At² - 2Bt + C = 0 +     * +     * The solutions (unless the equation degenerates because of A = 0) are: +     * +     *     t = (B ± ⎷(B² - A·C)) / A +     * +     * The solution we are going to prefer is the bigger one, unless the +     * radius associated to it is negative (or it falls outside the valid t +     * range). +     * +     * Additional observations (useful for optimizations): +     * A does not depend on p +     * +     * A < 0 <=> one of the two circles completely contains the other one +     *   <=> for every p, the radiuses associated with the two t solutions +     *       have opposite sign +     */ +    pixman_image_t *image = iter->image; +    int x = iter->x; +    int y = iter->y; +    int width = iter->width; +    uint32_t *buffer = iter->buffer; + +    gradient_t *gradient = (gradient_t *)image; +    radial_gradient_t *radial = (radial_gradient_t *)image; +    uint32_t *end = buffer + width * (Bpp / 4); +    pixman_gradient_walker_t walker; +    pixman_vector_t v, unit; + +    /* reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat); + +    if (image->common.transform) +    { +	if (!pixman_transform_point_3d (image->common.transform, &v)) +	    return iter->buffer; + +	unit.vector[0] = image->common.transform->matrix[0][0]; +	unit.vector[1] = image->common.transform->matrix[1][0]; +	unit.vector[2] = image->common.transform->matrix[2][0]; +    } +    else +    { +	unit.vector[0] = pixman_fixed_1; +	unit.vector[1] = 0; +	unit.vector[2] = 0; +    } + +    if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1) +    { +	/* +	 * Given: +	 * +	 * t = (B ± ⎷(B² - A·C)) / A +	 * +	 * where +	 * +	 * A = cdx² + cdy² - dr² +	 * B = pdx·cdx + pdy·cdy + r₁·dr +	 * C = pdx² + pdy² - r₁² +	 * det = B² - A·C +	 * +	 * Since we have an affine transformation, we know that (pdx, pdy) +	 * increase linearly with each pixel, +	 * +	 * pdx = pdx₀ + n·ux, +	 * pdy = pdy₀ + n·uy, +	 * +	 * we can then express B, C and det through multiple differentiation. +	 */ +	pixman_fixed_32_32_t b, db, c, dc, ddc; + +	/* warning: this computation may overflow */ +	v.vector[0] -= radial->c1.x; +	v.vector[1] -= radial->c1.y; + +	/* +	 * B and C are computed and updated exactly. +	 * If fdot was used instead of dot, in the worst case it would +	 * lose 11 bits of precision in each of the multiplication and +	 * summing up would zero out all the bit that were preserved, +	 * thus making the result 0 instead of the correct one. 
+	 * This would mean a worst case of unbound relative error or +	 * about 2^10 absolute error +	 */ +	b = dot (v.vector[0], v.vector[1], radial->c1.radius, +		 radial->delta.x, radial->delta.y, radial->delta.radius); +	db = dot (unit.vector[0], unit.vector[1], 0, +		  radial->delta.x, radial->delta.y, 0); + +	c = dot (v.vector[0], v.vector[1], +		 -((pixman_fixed_48_16_t) radial->c1.radius), +		 v.vector[0], v.vector[1], radial->c1.radius); +	dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0], +		  2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1], +		  0, +		  unit.vector[0], unit.vector[1], 0); +	ddc = 2 * dot (unit.vector[0], unit.vector[1], 0, +		       unit.vector[0], unit.vector[1], 0); + +	while (buffer < end) +	{ +	    if (!mask || *mask++) +	    { +		radial_write_color (radial->a, b, c, +				    radial->inva, +				    radial->delta.radius, +				    radial->mindr, +				    &walker, +				    image->common.repeat, +				    Bpp, +				    write_pixel, +				    buffer); +	    } + +	    b += db; +	    c += dc; +	    dc += ddc; +	    buffer += (Bpp / 4); +	} +    } +    else +    { +	/* projective */ +	/* Warning: +	 * error propagation guarantees are much looser than in the affine case +	 */ +	while (buffer < end) +	{ +	    if (!mask || *mask++) +	    { +		if (v.vector[2] != 0) +		{ +		    double pdx, pdy, invv2, b, c; + +		    invv2 = 1. * pixman_fixed_1 / v.vector[2]; + +		    pdx = v.vector[0] * invv2 - radial->c1.x; +		    /*    / pixman_fixed_1 */ + +		    pdy = v.vector[1] * invv2 - radial->c1.y; +		    /*    / pixman_fixed_1 */ + +		    b = fdot (pdx, pdy, radial->c1.radius, +			      radial->delta.x, radial->delta.y, +			      radial->delta.radius); +		    /*  / pixman_fixed_1 / pixman_fixed_1 */ + +		    c = fdot (pdx, pdy, -radial->c1.radius, +			      pdx, pdy, radial->c1.radius); +		    /*  / pixman_fixed_1 / pixman_fixed_1 */ + +		    radial_write_color (radial->a, b, c, +					radial->inva, +					radial->delta.radius, +					radial->mindr, +					&walker, +					image->common.repeat, +					Bpp, +					write_pixel, +					buffer); +		} +		else +		{ +		    memset (buffer, 0, Bpp); +		} +	    } + +	    buffer += (Bpp / 4); + +	    v.vector[0] += unit.vector[0]; +	    v.vector[1] += unit.vector[1]; +	    v.vector[2] += unit.vector[2]; +	} +    } + +    iter->y++; +    return iter->buffer; +} + +static uint32_t * +radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask) +{ +    return radial_get_scanline (iter, mask, 4, +				_pixman_gradient_walker_write_narrow); +} + +static uint32_t * +radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask) +{ +    return radial_get_scanline (iter, NULL, 16, +				_pixman_gradient_walker_write_wide); +} + +void +_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter) +{ +    if (iter->iter_flags & ITER_NARROW) +	iter->get_scanline = radial_get_scanline_narrow; +    else +	iter->get_scanline = radial_get_scanline_wide; +} + +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_radial_gradient (const pixman_point_fixed_t *  inner, +				     const pixman_point_fixed_t *  outer, +				     pixman_fixed_t                inner_radius, +				     pixman_fixed_t                outer_radius, +				     const pixman_gradient_stop_t *stops, +				     int                           n_stops) +{ +    pixman_image_t *image; +    radial_gradient_t *radial; + +    image = _pixman_image_allocate (); + +    if (!image) +	return NULL; + +    radial = &image->radial; + +    if (!_pixman_init_gradient 
(&radial->common, stops, n_stops)) +    { +	free (image); +	return NULL; +    } + +    image->type = RADIAL; + +    radial->c1.x = inner->x; +    radial->c1.y = inner->y; +    radial->c1.radius = inner_radius; +    radial->c2.x = outer->x; +    radial->c2.y = outer->y; +    radial->c2.radius = outer_radius; + +    /* warning: this computations may overflow */ +    radial->delta.x = radial->c2.x - radial->c1.x; +    radial->delta.y = radial->c2.y - radial->c1.y; +    radial->delta.radius = radial->c2.radius - radial->c1.radius; + +    /* computed exactly, then cast to double -> every bit of the double +       representation is correct (53 bits) */ +    radial->a = dot (radial->delta.x, radial->delta.y, -radial->delta.radius, +		     radial->delta.x, radial->delta.y, radial->delta.radius); +    if (radial->a != 0) +	radial->inva = 1. * pixman_fixed_1 / radial->a; + +    radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius; + +    return image; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-region.c b/libs/pixman-0.40.0/pixman/pixman-region.c new file mode 100644 index 0000000..59bc9c7 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-region.c @@ -0,0 +1,2792 @@ +/* + * Copyright 1987, 1988, 1989, 1998  The Open Group + *  + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation. + *  + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE + * OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + * Except as contained in this notice, the name of The Open Group shall not be + * used in advertising or otherwise to promote the sale, use or other dealings + * in this Software without prior written authorization from The Open Group. + *  + * Copyright 1987, 1988, 1989 by + * Digital Equipment Corporation, Maynard, Massachusetts. + *  + *                    All Rights Reserved + *  + * Permission to use, copy, modify, and distribute this software and its + * documentation for any purpose and without fee is hereby granted, + * provided that the above copyright notice appear in all copies and that + * both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of Digital not be + * used in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. + *  + * DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING + * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL + * DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, + * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + * Copyright © 1998 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include <stdlib.h> +#include <limits.h> +#include <string.h> +#include <stdio.h> +#include "pixman-private.h" + +#define PIXREGION_NIL(reg) ((reg)->data && !(reg)->data->numRects) +/* not a region */ +#define PIXREGION_NAR(reg)      ((reg)->data == pixman_broken_data) +#define PIXREGION_NUMRECTS(reg) ((reg)->data ? (reg)->data->numRects : 1) +#define PIXREGION_SIZE(reg) ((reg)->data ? (reg)->data->size : 0) +#define PIXREGION_RECTS(reg) \ +    ((reg)->data ? (box_type_t *)((reg)->data + 1) \ +     : &(reg)->extents) +#define PIXREGION_BOXPTR(reg) ((box_type_t *)((reg)->data + 1)) +#define PIXREGION_BOX(reg, i) (&PIXREGION_BOXPTR (reg)[i]) +#define PIXREGION_TOP(reg) PIXREGION_BOX (reg, (reg)->data->numRects) +#define PIXREGION_END(reg) PIXREGION_BOX (reg, (reg)->data->numRects - 1) + +#define GOOD_RECT(rect) ((rect)->x1 < (rect)->x2 && (rect)->y1 < (rect)->y2) +#define BAD_RECT(rect) ((rect)->x1 > (rect)->x2 || (rect)->y1 > (rect)->y2) + +#ifdef DEBUG + +#define GOOD(reg)							\ +    do									\ +    {									\ +	if (!PREFIX (_selfcheck (reg)))					\ +	    _pixman_log_error (FUNC, "Malformed region " # reg);	\ +    } while (0) + +#else + +#define GOOD(reg) + +#endif + +static const box_type_t PREFIX (_empty_box_) = { 0, 0, 0, 0 }; +static const region_data_type_t PREFIX (_empty_data_) = { 0, 0 }; +#if defined (__llvm__) && !defined (__clang__) +static const volatile region_data_type_t PREFIX (_broken_data_) = { 0, 0 }; +#else +static const region_data_type_t PREFIX (_broken_data_) = { 0, 0 }; +#endif + +static box_type_t *pixman_region_empty_box = +    (box_type_t *)&PREFIX (_empty_box_); +static region_data_type_t *pixman_region_empty_data = +    (region_data_type_t *)&PREFIX (_empty_data_); +static region_data_type_t *pixman_broken_data = +    (region_data_type_t *)&PREFIX (_broken_data_); + +static pixman_bool_t +pixman_break (region_type_t *region); + +/* + * The functions in this file implement the Region abstraction used extensively + * throughout the X11 sample server. A Region is simply a set of disjoint + * (non-overlapping) rectangles, plus an "extent" rectangle which is the + * smallest single rectangle that contains all the non-overlapping rectangles. + * + * A Region is implemented as a "y-x-banded" array of rectangles.  This array + * imposes two degrees of order.  
First, all rectangles are sorted by top side + * y coordinate first (y1), and then by left side x coordinate (x1). + * + * Furthermore, the rectangles are grouped into "bands".  Each rectangle in a + * band has the same top y coordinate (y1), and each has the same bottom y + * coordinate (y2).  Thus all rectangles in a band differ only in their left + * and right side (x1 and x2).  Bands are implicit in the array of rectangles: + * there is no separate list of band start pointers. + * + * The y-x band representation does not minimize rectangles.  In particular, + * if a rectangle vertically crosses a band (the rectangle has scanlines in + * the y1 to y2 area spanned by the band), then the rectangle may be broken + * down into two or more smaller rectangles stacked one atop the other. + * + *  -----------				    ----------- + *  |         |				    |         |		    band 0 + *  |         |  --------		    -----------  -------- + *  |         |  |      |  in y-x banded    |         |  |      |   band 1 + *  |         |  |      |  form is	    |         |  |      | + *  -----------  |      |		    -----------  -------- + *               |      |				 |      |   band 2 + *               --------				 -------- + * + * An added constraint on the rectangles is that they must cover as much + * horizontal area as possible: no two rectangles within a band are allowed + * to touch. + * + * Whenever possible, bands will be merged together to cover a greater vertical + * distance (and thus reduce the number of rectangles). Two bands can be merged + * only if the bottom of one touches the top of the other and they have + * rectangles in the same places (of the same width, of course). + * + * Adam de Boor wrote most of the original region code.  Joel McCormack + * substantially modified or rewrote most of the core arithmetic routines, and + * added pixman_region_validate in order to support several speed improvements + * to pixman_region_validate_tree.  Bob Scheifler changed the representation + * to be more compact when empty or a single rectangle, and did a bunch of + * gratuitous reformatting. Carl Worth did further gratuitous reformatting + * while re-merging the server and client region code into libpixregion. + * Soren Sandmann did even more gratuitous reformatting. 
+ */ + +/*  true iff two Boxes overlap */ +#define EXTENTCHECK(r1, r2)	   \ +    (!( ((r1)->x2 <= (r2)->x1)  || \ +        ((r1)->x1 >= (r2)->x2)  || \ +        ((r1)->y2 <= (r2)->y1)  || \ +        ((r1)->y1 >= (r2)->y2) ) ) + +/* true iff (x,y) is in Box */ +#define INBOX(r, x, y)	\ +    ( ((r)->x2 >  x) && \ +      ((r)->x1 <= x) && \ +      ((r)->y2 >  y) && \ +      ((r)->y1 <= y) ) + +/* true iff Box r1 contains Box r2 */ +#define SUBSUMES(r1, r2)	\ +    ( ((r1)->x1 <= (r2)->x1) && \ +      ((r1)->x2 >= (r2)->x2) && \ +      ((r1)->y1 <= (r2)->y1) && \ +      ((r1)->y2 >= (r2)->y2) ) + +static size_t +PIXREGION_SZOF (size_t n) +{ +    size_t size = n * sizeof(box_type_t); +     +    if (n > UINT32_MAX / sizeof(box_type_t)) +	return 0; + +    if (sizeof(region_data_type_t) > UINT32_MAX - size) +	return 0; + +    return size + sizeof(region_data_type_t); +} + +static region_data_type_t * +alloc_data (size_t n) +{ +    size_t sz = PIXREGION_SZOF (n); + +    if (!sz) +	return NULL; + +    return malloc (sz); +} + +#define FREE_DATA(reg) if ((reg)->data && (reg)->data->size) free ((reg)->data) + +#define RECTALLOC_BAIL(region, n, bail)					\ +    do									\ +    {									\ +	if (!(region)->data ||						\ +	    (((region)->data->numRects + (n)) > (region)->data->size))	\ +	{								\ +	    if (!pixman_rect_alloc (region, n))				\ +		goto bail;						\ +	}								\ +    } while (0) + +#define RECTALLOC(region, n)						\ +    do									\ +    {									\ +	if (!(region)->data ||						\ +	    (((region)->data->numRects + (n)) > (region)->data->size))	\ +	{								\ +	    if (!pixman_rect_alloc (region, n)) {			\ +		return FALSE;						\ +	    }								\ +	}								\ +    } while (0) + +#define ADDRECT(next_rect, nx1, ny1, nx2, ny2)      \ +    do						    \ +    {						    \ +	next_rect->x1 = nx1;                        \ +	next_rect->y1 = ny1;                        \ +	next_rect->x2 = nx2;                        \ +	next_rect->y2 = ny2;                        \ +	next_rect++;                                \ +    }						    \ +    while (0) + +#define NEWRECT(region, next_rect, nx1, ny1, nx2, ny2)			\ +    do									\ +    {									\ +	if (!(region)->data ||						\ +	    ((region)->data->numRects == (region)->data->size))		\ +	{								\ +	    if (!pixman_rect_alloc (region, 1))				\ +		return FALSE;						\ +	    next_rect = PIXREGION_TOP (region);				\ +	}								\ +	ADDRECT (next_rect, nx1, ny1, nx2, ny2);			\ +	region->data->numRects++;					\ +	critical_if_fail (region->data->numRects <= region->data->size);		\ +    } while (0) + +#define DOWNSIZE(reg, numRects)						\ +    do									\ +    {									\ +	if (((numRects) < ((reg)->data->size >> 1)) &&			\ +	    ((reg)->data->size > 50))					\ +	{								\ +	    region_data_type_t * new_data;				\ +	    size_t data_size = PIXREGION_SZOF (numRects);		\ +									\ +	    if (!data_size)						\ +	    {								\ +		new_data = NULL;					\ +	    }								\ +	    else							\ +	    {								\ +		new_data = (region_data_type_t *)			\ +		    realloc ((reg)->data, data_size);			\ +	    }								\ +									\ +	    if (new_data)						\ +	    {								\ +		new_data->size = (numRects);				\ +		(reg)->data = new_data;					\ +	    }								\ +	}								\ +    } while (0) + +PIXMAN_EXPORT pixman_bool_t +PREFIX (_equal) (region_type_t *reg1, region_type_t *reg2) +{ +    int i; +    box_type_t *rects1; +    box_type_t *rects2; + +    if (reg1->extents.x1 != reg2->extents.x1) +	return FALSE; +     +    if (reg1->extents.x2 != reg2->extents.x2) +	return 
FALSE;
+    
+    if (reg1->extents.y1 != reg2->extents.y1)
+	return FALSE;
+    
+    if (reg1->extents.y2 != reg2->extents.y2)
+	return FALSE;
+    
+    if (PIXREGION_NUMRECTS (reg1) != PIXREGION_NUMRECTS (reg2))
+	return FALSE;
+
+    rects1 = PIXREGION_RECTS (reg1);
+    rects2 = PIXREGION_RECTS (reg2);
+    
+    for (i = 0; i != PIXREGION_NUMRECTS (reg1); i++)
+    {
+	if (rects1[i].x1 != rects2[i].x1)
+	    return FALSE;
+	
+	if (rects1[i].x2 != rects2[i].x2)
+	    return FALSE;
+	
+	if (rects1[i].y1 != rects2[i].y1)
+	    return FALSE;
+	
+	if (rects1[i].y2 != rects2[i].y2)
+	    return FALSE;
+    }
+
+    return TRUE;
+}
+
+int
+PREFIX (_print) (region_type_t *rgn)
+{
+    int num, size;
+    int i;
+    box_type_t * rects;
+
+    num = PIXREGION_NUMRECTS (rgn);
+    size = PIXREGION_SIZE (rgn);
+    rects = PIXREGION_RECTS (rgn);
+
+    fprintf (stderr, "num: %d size: %d\n", num, size);
+    fprintf (stderr, "extents: %d %d %d %d\n",
+             rgn->extents.x1,
+	     rgn->extents.y1,
+	     rgn->extents.x2,
+	     rgn->extents.y2);
+    
+    for (i = 0; i < num; i++)
+    {
+	fprintf (stderr, "%d %d %d %d \n",
+	         rects[i].x1, rects[i].y1, rects[i].x2, rects[i].y2);
+    }
+    
+    fprintf (stderr, "\n");
+
+    return(num);
+}
+
+
+PIXMAN_EXPORT void
+PREFIX (_init) (region_type_t *region)
+{
+    region->extents = *pixman_region_empty_box;
+    region->data = pixman_region_empty_data;
+}
+
+PIXMAN_EXPORT void
+PREFIX (_init_rect) (region_type_t *	region,
+                     int		x,
+		     int		y,
+		     unsigned int	width,
+		     unsigned int	height)
+{
+    region->extents.x1 = x;
+    region->extents.y1 = y;
+    region->extents.x2 = x + width;
+    region->extents.y2 = y + height;
+
+    if (!GOOD_RECT (&region->extents))
+    {
+        if (BAD_RECT (&region->extents))
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
+        PREFIX (_init) (region);
+        return;
+    }
+
+    region->data = NULL;
+}
+
+PIXMAN_EXPORT void
+PREFIX (_init_with_extents) (region_type_t *region, box_type_t *extents)
+{
+    if (!GOOD_RECT (extents))
+    {
+        if (BAD_RECT (extents))
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
+        PREFIX (_init) (region);
+        return;
+    }
+    region->extents = *extents;
+
+    region->data = NULL;
+}
+
+PIXMAN_EXPORT void
+PREFIX (_fini) (region_type_t *region)
+{
+    GOOD (region);
+    FREE_DATA (region);
+}
+
+PIXMAN_EXPORT int
+PREFIX (_n_rects) (region_type_t *region)
+{
+    return PIXREGION_NUMRECTS (region);
+}
+
+PIXMAN_EXPORT box_type_t *
+PREFIX (_rectangles) (region_type_t *region,
+                      int               *n_rects)
+{
+    if (n_rects)
+	*n_rects = PIXREGION_NUMRECTS (region);
+
+    return PIXREGION_RECTS (region);
+}
+
+static pixman_bool_t
+pixman_break (region_type_t *region)
+{
+    FREE_DATA (region);
+
+    region->extents = *pixman_region_empty_box;
+    region->data = pixman_broken_data;
+
+    return FALSE;
+}
+
+static pixman_bool_t
+pixman_rect_alloc (region_type_t * region,
+                   int             n)
+{
+    region_data_type_t *data;
+
+    if (!region->data)
+    {
+	n++;
+	region->data = alloc_data (n);
+
+	if (!region->data)
+	    return pixman_break (region);
+
+	region->data->numRects = 1;
+	*PIXREGION_BOXPTR (region) = region->extents;
+    }
+    else if (!region->data->size)
+    {
+	region->data = alloc_data (n);
+
+	if (!region->data)
+	    return pixman_break (region);
+
+	region->data->numRects = 0;
+    }
+    else
+   
 { +	size_t data_size; + +	if (n == 1) +	{ +	    n = region->data->numRects; +	    if (n > 500) /* XXX pick numbers out of a hat */ +		n = 250; +	} + +	n += region->data->numRects; +	data_size = PIXREGION_SZOF (n); + +	if (!data_size) +	{ +	    data = NULL; +	} +	else +	{ +	    data = (region_data_type_t *) +		realloc (region->data, PIXREGION_SZOF (n)); +	} +	 +	if (!data) +	    return pixman_break (region); +	 +	region->data = data; +    } +     +    region->data->size = n; + +    return TRUE; +} + +PIXMAN_EXPORT pixman_bool_t +PREFIX (_copy) (region_type_t *dst, region_type_t *src) +{ +    GOOD (dst); +    GOOD (src); + +    if (dst == src) +	return TRUE; +     +    dst->extents = src->extents; + +    if (!src->data || !src->data->size) +    { +	FREE_DATA (dst); +	dst->data = src->data; +	return TRUE; +    } +     +    if (!dst->data || (dst->data->size < src->data->numRects)) +    { +	FREE_DATA (dst); + +	dst->data = alloc_data (src->data->numRects); + +	if (!dst->data) +	    return pixman_break (dst); + +	dst->data->size = src->data->numRects; +    } + +    dst->data->numRects = src->data->numRects; + +    memmove ((char *)PIXREGION_BOXPTR (dst), (char *)PIXREGION_BOXPTR (src), +             dst->data->numRects * sizeof(box_type_t)); + +    return TRUE; +} + +/*====================================================================== + *	    Generic Region Operator + *====================================================================*/ + +/*- + *----------------------------------------------------------------------- + * pixman_coalesce -- + *	Attempt to merge the boxes in the current band with those in the + *	previous one.  We are guaranteed that the current band extends to + *      the end of the rects array.  Used only by pixman_op. + * + * Results: + *	The new index for the previous band. + * + * Side Effects: + *	If coalescing takes place: + *	    - rectangles in the previous band will have their y2 fields + *	      altered. + *	    - region->data->numRects will be decreased. + * + *----------------------------------------------------------------------- + */ +static inline int +pixman_coalesce (region_type_t * region,      /* Region to coalesce		 */ +		 int             prev_start,  /* Index of start of previous band */ +		 int             cur_start)   /* Index of start of current band  */ +{ +    box_type_t *prev_box;       /* Current box in previous band	     */ +    box_type_t *cur_box;        /* Current box in current band       */ +    int numRects;               /* Number rectangles in both bands   */ +    int y2;                     /* Bottom of current band	     */ + +    /* +     * Figure out how many rectangles are in the band. +     */ +    numRects = cur_start - prev_start; +    critical_if_fail (numRects == region->data->numRects - cur_start); + +    if (!numRects) return cur_start; + +    /* +     * The bands may only be coalesced if the bottom of the previous +     * matches the top scanline of the current. +     */ +    prev_box = PIXREGION_BOX (region, prev_start); +    cur_box = PIXREGION_BOX (region, cur_start); +    if (prev_box->y2 != cur_box->y1) return cur_start; + +    /* +     * Make sure the bands have boxes in the same places. This +     * assumes that boxes have been added in such a way that they +     * cover the most area possible. I.e. two boxes in a band must +     * have some horizontal space between them. 
+     */ +    y2 = cur_box->y2; + +    do +    { +	if ((prev_box->x1 != cur_box->x1) || (prev_box->x2 != cur_box->x2)) +	    return (cur_start); +	 +	prev_box++; +	cur_box++; +	numRects--; +    } +    while (numRects); + +    /* +     * The bands may be merged, so set the bottom y of each box +     * in the previous band to the bottom y of the current band. +     */ +    numRects = cur_start - prev_start; +    region->data->numRects -= numRects; + +    do +    { +	prev_box--; +	prev_box->y2 = y2; +	numRects--; +    } +    while (numRects); + +    return prev_start; +} + +/* Quicky macro to avoid trivial reject procedure calls to pixman_coalesce */ + +#define COALESCE(new_reg, prev_band, cur_band)                          \ +    do									\ +    {									\ +	if (cur_band - prev_band == new_reg->data->numRects - cur_band)	\ +	    prev_band = pixman_coalesce (new_reg, prev_band, cur_band);	\ +	else								\ +	    prev_band = cur_band;					\ +    } while (0) + +/*- + *----------------------------------------------------------------------- + * pixman_region_append_non_o -- + *	Handle a non-overlapping band for the union and subtract operations. + *      Just adds the (top/bottom-clipped) rectangles into the region. + *      Doesn't have to check for subsumption or anything. + * + * Results: + *	None. + * + * Side Effects: + *	region->data->numRects is incremented and the rectangles overwritten + *	with the rectangles we're passed. + * + *----------------------------------------------------------------------- + */ +static inline pixman_bool_t +pixman_region_append_non_o (region_type_t * region, +			    box_type_t *    r, +			    box_type_t *    r_end, +			    int             y1, +			    int             y2) +{ +    box_type_t *next_rect; +    int new_rects; + +    new_rects = r_end - r; + +    critical_if_fail (y1 < y2); +    critical_if_fail (new_rects != 0); + +    /* Make sure we have enough space for all rectangles to be added */ +    RECTALLOC (region, new_rects); +    next_rect = PIXREGION_TOP (region); +    region->data->numRects += new_rects; + +    do +    { +	critical_if_fail (r->x1 < r->x2); +	ADDRECT (next_rect, r->x1, y1, r->x2, y2); +	r++; +    } +    while (r != r_end); + +    return TRUE; +} + +#define FIND_BAND(r, r_band_end, r_end, ry1)			     \ +    do								     \ +    {								     \ +	ry1 = r->y1;						     \ +	r_band_end = r + 1;					     \ +	while ((r_band_end != r_end) && (r_band_end->y1 == ry1)) {   \ +	    r_band_end++;					     \ +	}							     \ +    } while (0) + +#define APPEND_REGIONS(new_reg, r, r_end)				\ +    do									\ +    {									\ +	int new_rects;							\ +	if ((new_rects = r_end - r)) {					\ +	    RECTALLOC_BAIL (new_reg, new_rects, bail);			\ +	    memmove ((char *)PIXREGION_TOP (new_reg), (char *)r,	\ +		     new_rects * sizeof(box_type_t));			\ +	    new_reg->data->numRects += new_rects;			\ +	}								\ +    } while (0) + +/*- + *----------------------------------------------------------------------- + * pixman_op -- + *	Apply an operation to two regions. Called by pixman_region_union, pixman_region_inverse, + *	pixman_region_subtract, pixman_region_intersect....  Both regions MUST have at least one + *      rectangle, and cannot be the same object. + * + * Results: + *	TRUE if successful. + * + * Side Effects: + *	The new region is overwritten. + *	overlap set to TRUE if overlap_func ever returns TRUE. + * + * Notes: + *	The idea behind this function is to view the two regions as sets. 
+ *	Together they cover a rectangle of area that this function divides + *	into horizontal bands where points are covered only by one region + *	or by both. For the first case, the non_overlap_func is called with + *	each the band and the band's upper and lower extents. For the + *	second, the overlap_func is called to process the entire band. It + *	is responsible for clipping the rectangles in the band, though + *	this function provides the boundaries. + *	At the end of each band, the new region is coalesced, if possible, + *	to reduce the number of rectangles in the region. + * + *----------------------------------------------------------------------- + */ + +typedef pixman_bool_t (*overlap_proc_ptr) (region_type_t *region, +					   box_type_t *   r1, +					   box_type_t *   r1_end, +					   box_type_t *   r2, +					   box_type_t *   r2_end, +					   int            y1, +					   int            y2); + +static pixman_bool_t +pixman_op (region_type_t *  new_reg,               /* Place to store result	    */ +	   region_type_t *  reg1,                  /* First region in operation     */ +	   region_type_t *  reg2,                  /* 2d region in operation        */ +	   overlap_proc_ptr overlap_func,          /* Function to call for over- +						    * lapping bands		    */ +	   int              append_non1,           /* Append non-overlapping bands   +						    * in region 1 ? +						    */ +	   int              append_non2            /* Append non-overlapping bands +						    * in region 2 ? +						    */ +    ) +{ +    box_type_t *r1;                 /* Pointer into first region     */ +    box_type_t *r2;                 /* Pointer into 2d region	     */ +    box_type_t *r1_end;             /* End of 1st region	     */ +    box_type_t *r2_end;             /* End of 2d region		     */ +    int ybot;                       /* Bottom of intersection	     */ +    int ytop;                       /* Top of intersection	     */ +    region_data_type_t *old_data;   /* Old data for new_reg	     */ +    int prev_band;                  /* Index of start of +				     * previous band in new_reg       */ +    int cur_band;                   /* Index of start of current +				     * band in new_reg		     */ +    box_type_t * r1_band_end;       /* End of current band in r1     */ +    box_type_t * r2_band_end;       /* End of current band in r2     */ +    int top;                        /* Top of non-overlapping band   */ +    int bot;                        /* Bottom of non-overlapping band*/ +    int r1y1;                       /* Temps for r1->y1 and r2->y1   */ +    int r2y1; +    int new_size; +    int numRects; + +    /* +     * Break any region computed from a broken region +     */ +    if (PIXREGION_NAR (reg1) || PIXREGION_NAR (reg2)) +	return pixman_break (new_reg); + +    /* +     * Initialization: +     *	set r1, r2, r1_end and r2_end appropriately, save the rectangles +     * of the destination region until the end in case it's one of +     * the two source regions, then mark the "new" region empty, allocating +     * another array of rectangles for it to use. 
+     */ + +    r1 = PIXREGION_RECTS (reg1); +    new_size = PIXREGION_NUMRECTS (reg1); +    r1_end = r1 + new_size; + +    numRects = PIXREGION_NUMRECTS (reg2); +    r2 = PIXREGION_RECTS (reg2); +    r2_end = r2 + numRects; +     +    critical_if_fail (r1 != r1_end); +    critical_if_fail (r2 != r2_end); + +    old_data = (region_data_type_t *)NULL; + +    if (((new_reg == reg1) && (new_size > 1)) || +        ((new_reg == reg2) && (numRects > 1))) +    { +        old_data = new_reg->data; +        new_reg->data = pixman_region_empty_data; +    } + +    /* guess at new size */ +    if (numRects > new_size) +	new_size = numRects; + +    new_size <<= 1; + +    if (!new_reg->data) +	new_reg->data = pixman_region_empty_data; +    else if (new_reg->data->size) +	new_reg->data->numRects = 0; + +    if (new_size > new_reg->data->size) +    { +        if (!pixman_rect_alloc (new_reg, new_size)) +        { +            free (old_data); +            return FALSE; +	} +    } + +    /* +     * Initialize ybot. +     * In the upcoming loop, ybot and ytop serve different functions depending +     * on whether the band being handled is an overlapping or non-overlapping +     * band. +     *  In the case of a non-overlapping band (only one of the regions +     * has points in the band), ybot is the bottom of the most recent +     * intersection and thus clips the top of the rectangles in that band. +     * ytop is the top of the next intersection between the two regions and +     * serves to clip the bottom of the rectangles in the current band. +     *	For an overlapping band (where the two regions intersect), ytop clips +     * the top of the rectangles of both regions and ybot clips the bottoms. +     */ + +    ybot = MIN (r1->y1, r2->y1); + +    /* +     * prev_band serves to mark the start of the previous band so rectangles +     * can be coalesced into larger rectangles. qv. pixman_coalesce, above. +     * In the beginning, there is no previous band, so prev_band == cur_band +     * (cur_band is set later on, of course, but the first band will always +     * start at index 0). prev_band and cur_band must be indices because of +     * the possible expansion, and resultant moving, of the new region's +     * array of rectangles. +     */ +    prev_band = 0; + +    do +    { +        /* +	 * This algorithm proceeds one source-band (as opposed to a +	 * destination band, which is determined by where the two regions +	 * intersect) at a time. r1_band_end and r2_band_end serve to mark the +	 * rectangle after the last one in the current band for their +	 * respective regions. +	 */ +        critical_if_fail (r1 != r1_end); +        critical_if_fail (r2 != r2_end); + +        FIND_BAND (r1, r1_band_end, r1_end, r1y1); +        FIND_BAND (r2, r2_band_end, r2_end, r2y1); + +        /* +	 * First handle the band that doesn't intersect, if any. +	 * +	 * Note that attention is restricted to one band in the +	 * non-intersecting region at once, so if a region has n +	 * bands between the current position and the next place it overlaps +	 * the other, this entire loop will be passed through n times. 
+	 */ +        if (r1y1 < r2y1) +        { +            if (append_non1) +            { +                top = MAX (r1y1, ybot); +                bot = MIN (r1->y2, r2y1); +                if (top != bot) +                { +                    cur_band = new_reg->data->numRects; +                    if (!pixman_region_append_non_o (new_reg, r1, r1_band_end, top, bot)) +			goto bail; +                    COALESCE (new_reg, prev_band, cur_band); +		} +	    } +            ytop = r2y1; +	} +        else if (r2y1 < r1y1) +        { +            if (append_non2) +            { +                top = MAX (r2y1, ybot); +                bot = MIN (r2->y2, r1y1); +		 +                if (top != bot) +                { +                    cur_band = new_reg->data->numRects; + +                    if (!pixman_region_append_non_o (new_reg, r2, r2_band_end, top, bot)) +			goto bail; + +                    COALESCE (new_reg, prev_band, cur_band); +		} +	    } +            ytop = r1y1; +	} +        else +        { +            ytop = r1y1; +	} + +        /* +	 * Now see if we've hit an intersecting band. The two bands only +	 * intersect if ybot > ytop +	 */ +        ybot = MIN (r1->y2, r2->y2); +        if (ybot > ytop) +        { +            cur_band = new_reg->data->numRects; + +            if (!(*overlap_func)(new_reg, +                                 r1, r1_band_end, +                                 r2, r2_band_end, +                                 ytop, ybot)) +	    { +		goto bail; +	    } +	     +            COALESCE (new_reg, prev_band, cur_band); +	} + +        /* +	 * If we've finished with a band (y2 == ybot) we skip forward +	 * in the region to the next band. +	 */ +        if (r1->y2 == ybot) +	    r1 = r1_band_end; + +        if (r2->y2 == ybot) +	    r2 = r2_band_end; + +    } +    while (r1 != r1_end && r2 != r2_end); + +    /* +     * Deal with whichever region (if any) still has rectangles left. +     * +     * We only need to worry about banding and coalescing for the very first +     * band left.  After that, we can just group all remaining boxes, +     * regardless of how many bands, into one final append to the list. 
+     */ + +    if ((r1 != r1_end) && append_non1) +    { +        /* Do first non_overlap1Func call, which may be able to coalesce */ +        FIND_BAND (r1, r1_band_end, r1_end, r1y1); +	 +        cur_band = new_reg->data->numRects; +	 +        if (!pixman_region_append_non_o (new_reg, +                                         r1, r1_band_end, +                                         MAX (r1y1, ybot), r1->y2)) +	{ +	    goto bail; +	} +	 +        COALESCE (new_reg, prev_band, cur_band); + +        /* Just append the rest of the boxes  */ +        APPEND_REGIONS (new_reg, r1_band_end, r1_end); +    } +    else if ((r2 != r2_end) && append_non2) +    { +        /* Do first non_overlap2Func call, which may be able to coalesce */ +        FIND_BAND (r2, r2_band_end, r2_end, r2y1); + +	cur_band = new_reg->data->numRects; + +        if (!pixman_region_append_non_o (new_reg, +                                         r2, r2_band_end, +                                         MAX (r2y1, ybot), r2->y2)) +	{ +	    goto bail; +	} + +        COALESCE (new_reg, prev_band, cur_band); + +        /* Append rest of boxes */ +        APPEND_REGIONS (new_reg, r2_band_end, r2_end); +    } + +    free (old_data); + +    if (!(numRects = new_reg->data->numRects)) +    { +        FREE_DATA (new_reg); +        new_reg->data = pixman_region_empty_data; +    } +    else if (numRects == 1) +    { +        new_reg->extents = *PIXREGION_BOXPTR (new_reg); +        FREE_DATA (new_reg); +        new_reg->data = (region_data_type_t *)NULL; +    } +    else +    { +        DOWNSIZE (new_reg, numRects); +    } + +    return TRUE; + +bail: +    free (old_data); + +    return pixman_break (new_reg); +} + +/*- + *----------------------------------------------------------------------- + * pixman_set_extents -- + *	Reset the extents of a region to what they should be. Called by + *	pixman_region_subtract and pixman_region_intersect as they can't + *      figure it out along the way or do so easily, as pixman_region_union can. + * + * Results: + *	None. + * + * Side Effects: + *	The region's 'extents' structure is overwritten. + * + *----------------------------------------------------------------------- + */ +static void +pixman_set_extents (region_type_t *region) +{ +    box_type_t *box, *box_end; + +    if (!region->data) +	return; + +    if (!region->data->size) +    { +        region->extents.x2 = region->extents.x1; +        region->extents.y2 = region->extents.y1; +        return; +    } + +    box = PIXREGION_BOXPTR (region); +    box_end = PIXREGION_END (region); + +    /* +     * Since box is the first rectangle in the region, it must have the +     * smallest y1 and since box_end is the last rectangle in the region, +     * it must have the largest y2, because of banding. Initialize x1 and +     * x2 from  box and box_end, resp., as good things to initialize them +     * to... 
+     */ +    region->extents.x1 = box->x1; +    region->extents.y1 = box->y1; +    region->extents.x2 = box_end->x2; +    region->extents.y2 = box_end->y2; + +    critical_if_fail (region->extents.y1 < region->extents.y2); + +    while (box <= box_end) +    { +        if (box->x1 < region->extents.x1) +	    region->extents.x1 = box->x1; +        if (box->x2 > region->extents.x2) +	    region->extents.x2 = box->x2; +        box++; +    } + +    critical_if_fail (region->extents.x1 < region->extents.x2); +} + +/*====================================================================== + *	    Region Intersection + *====================================================================*/ +/*- + *----------------------------------------------------------------------- + * pixman_region_intersect_o -- + *	Handle an overlapping band for pixman_region_intersect. + * + * Results: + *	TRUE if successful. + * + * Side Effects: + *	Rectangles may be added to the region. + * + *----------------------------------------------------------------------- + */ +/*ARGSUSED*/ +static pixman_bool_t +pixman_region_intersect_o (region_type_t *region, +                           box_type_t *   r1, +                           box_type_t *   r1_end, +                           box_type_t *   r2, +                           box_type_t *   r2_end, +                           int            y1, +                           int            y2) +{ +    int x1; +    int x2; +    box_type_t *        next_rect; + +    next_rect = PIXREGION_TOP (region); + +    critical_if_fail (y1 < y2); +    critical_if_fail (r1 != r1_end && r2 != r2_end); + +    do +    { +        x1 = MAX (r1->x1, r2->x1); +        x2 = MIN (r1->x2, r2->x2); + +        /* +	 * If there's any overlap between the two rectangles, add that +	 * overlap to the new region. +	 */ +        if (x1 < x2) +	    NEWRECT (region, next_rect, x1, y1, x2, y2); + +        /* +	 * Advance the pointer(s) with the leftmost right side, since the next +	 * rectangle on that list may still overlap the other region's +	 * current rectangle. 
+	 */
+        if (r1->x2 == x2)
+        {
+            r1++;
+	}
+        if (r2->x2 == x2)
+        {
+            r2++;
+	}
+    }
+    while ((r1 != r1_end) && (r2 != r2_end));
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_intersect) (region_type_t *     new_reg,
+                     region_type_t *        reg1,
+                     region_type_t *        reg2)
+{
+    GOOD (reg1);
+    GOOD (reg2);
+    GOOD (new_reg);
+
+    /* check for trivial reject */
+    if (PIXREGION_NIL (reg1) || PIXREGION_NIL (reg2) ||
+        !EXTENTCHECK (&reg1->extents, &reg2->extents))
+    {
+        /* Covers about 20% of all cases */
+        FREE_DATA (new_reg);
+        new_reg->extents.x2 = new_reg->extents.x1;
+        new_reg->extents.y2 = new_reg->extents.y1;
+        if (PIXREGION_NAR (reg1) || PIXREGION_NAR (reg2))
+        {
+            new_reg->data = pixman_broken_data;
+            return FALSE;
+	}
+        else
+	{
+	    new_reg->data = pixman_region_empty_data;
+	}
+    }
+    else if (!reg1->data && !reg2->data)
+    {
+        /* Covers about 80% of cases that aren't trivially rejected */
+        new_reg->extents.x1 = MAX (reg1->extents.x1, reg2->extents.x1);
+        new_reg->extents.y1 = MAX (reg1->extents.y1, reg2->extents.y1);
+        new_reg->extents.x2 = MIN (reg1->extents.x2, reg2->extents.x2);
+        new_reg->extents.y2 = MIN (reg1->extents.y2, reg2->extents.y2);
+
+        FREE_DATA (new_reg);
+
+	new_reg->data = (region_data_type_t *)NULL;
+    }
+    else if (!reg2->data && SUBSUMES (&reg2->extents, &reg1->extents))
+    {
+        return PREFIX (_copy) (new_reg, reg1);
+    }
+    else if (!reg1->data && SUBSUMES (&reg1->extents, &reg2->extents))
+    {
+        return PREFIX (_copy) (new_reg, reg2);
+    }
+    else if (reg1 == reg2)
+    {
+        return PREFIX (_copy) (new_reg, reg1);
+    }
+    else
+    {
+        /* General purpose intersection */
+
+        if (!pixman_op (new_reg, reg1, reg2, pixman_region_intersect_o, FALSE, FALSE))
+	    return FALSE;
+	
+        pixman_set_extents (new_reg);
+    }
+
+    GOOD (new_reg);
+    return(TRUE);
+}
+
+#define MERGERECT(r)							\
+    do									\
+    {									\
+        if (r->x1 <= x2)						\
+	{								\
+            /* Merge with current rectangle */				\
+            if (x2 < r->x2)						\
+		x2 = r->x2;						\
+	}								\
+	else								\
+	{								\
+            /* Add current rectangle, start new one */			\
+            NEWRECT (region, next_rect, x1, y1, x2, y2);		\
+            x1 = r->x1;							\
+            x2 = r->x2;							\
+	}								\
+        r++;								\
+    } while (0)
+
+/*======================================================================
+ *	    Region Union
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_union_o --
+ *	Handle an overlapping band for the union operation. Picks the
+ *	left-most rectangle each time and merges it into the region.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	region is overwritten.
+ *	overlap is set to TRUE if any boxes overlap.
+ *
+ *-----------------------------------------------------------------------
+ */
+static pixman_bool_t
+pixman_region_union_o (region_type_t *region,
+		       box_type_t *   r1,
+		       box_type_t *   r1_end,
+		       box_type_t *   r2,
+		       box_type_t *   r2_end,
+		       int            y1,
+		       int            y2)
+{
+    box_type_t *next_rect;
+    int x1;            /* left and right side of current union */
+    int x2;
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
+
+    next_rect = PIXREGION_TOP (region);
+
+    /* Start off current rectangle */
+    if (r1->x1 < r2->x1)
+    {
+        x1 = r1->x1;
+        x2 = r1->x2;
+        r1++;
+    }
+    else
+    {
+        x1 = r2->x1;
+        x2 = r2->x2;
+        r2++;
+    }
+    while (r1 != r1_end && r2 != r2_end)
+    {
+        if (r1->x1 < r2->x1)
+	    MERGERECT (r1);
+	else
+	    MERGERECT (r2);
+    }
+
+    /* Finish off whoever (if any) is left */
+    if (r1 != r1_end)
+    {
+        do
+        {
+            MERGERECT (r1);
+	}
+        while (r1 != r1_end);
+    }
+    else if (r2 != r2_end)
+    {
+        do
+        {
+            MERGERECT (r2);
+	}
+        while (r2 != r2_end);
+    }
+
+    /* Add current rectangle */
+    NEWRECT (region, next_rect, x1, y1, x2, y2);
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX(_intersect_rect) (region_type_t *dest,
+			 region_type_t *source,
+			 int x, int y,
+			 unsigned int width,
+			 unsigned int height)
+{
+    region_type_t region;
+
+    region.data = NULL;
+    region.extents.x1 = x;
+    region.extents.y1 = y;
+    region.extents.x2 = x + width;
+    region.extents.y2 = y + height;
+
+    return PREFIX(_intersect) (dest, source, &region);
+}
+
+/* Convenience function for performing union of region with a
+ * single rectangle
+ */
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_union_rect) (region_type_t *dest,
+                      region_type_t *source,
+                      int            x,
+		      int            y,
+                      unsigned int   width,
+		      unsigned int   height)
+{
+    region_type_t region;
+
+    region.extents.x1 = x;
+    region.extents.y1 = y;
+    region.extents.x2 = x + width;
+    region.extents.y2 = y + height;
+
+    if (!GOOD_RECT (&region.extents))
+    {
+        if (BAD_RECT (&region.extents))
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
+	return PREFIX (_copy) (dest, source);
+    }
+
+    region.data = NULL;
+
+    return PREFIX (_union) (dest, source, &region);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_union) (region_type_t *new_reg,
+                 region_type_t *reg1,
+                 region_type_t *reg2)
+{
+    /* Return TRUE if some overlap
+     * between reg1, reg2
+     */
+    GOOD (reg1);
+    GOOD (reg2);
+    GOOD (new_reg);
+
+    /*  checks all the simple cases */
+
+    /*
+     * Region 1 and 2 are the same
+     */
+    if (reg1 == reg2)
+        return PREFIX (_copy) (new_reg, reg1);
+
+    /*
+     * Region 1 is empty
+     */
+    if (PIXREGION_NIL (reg1))
+    {
+        if (PIXREGION_NAR (reg1))
+	    return pixman_break (new_reg);
+
+        if (new_reg != reg2)
+	    return PREFIX (_copy) (new_reg, reg2);
+
+	return TRUE;
+    }
+
+    /*
+     * Region 2 is empty
+     */
+    if (PIXREGION_NIL (reg2))
+    {
+        if (PIXREGION_NAR (reg2))
+	    return pixman_break (new_reg);
+
+	if (new_reg != reg1)
+	    return PREFIX (_copy) (new_reg, reg1);
+
+	return TRUE;
+    }
+
+    /*
+     * Region 1 
completely subsumes region 2
+     */
+    if (!reg1->data && SUBSUMES (&reg1->extents, &reg2->extents))
+    {
+        if (new_reg != reg1)
+	    return PREFIX (_copy) (new_reg, reg1);
+
+	return TRUE;
+    }
+
+    /*
+     * Region 2 completely subsumes region 1
+     */
+    if (!reg2->data && SUBSUMES (&reg2->extents, &reg1->extents))
+    {
+        if (new_reg != reg2)
+	    return PREFIX (_copy) (new_reg, reg2);
+
+	return TRUE;
+    }
+
+    if (!pixman_op (new_reg, reg1, reg2, pixman_region_union_o, TRUE, TRUE))
+	return FALSE;
+
+    new_reg->extents.x1 = MIN (reg1->extents.x1, reg2->extents.x1);
+    new_reg->extents.y1 = MIN (reg1->extents.y1, reg2->extents.y1);
+    new_reg->extents.x2 = MAX (reg1->extents.x2, reg2->extents.x2);
+    new_reg->extents.y2 = MAX (reg1->extents.y2, reg2->extents.y2);
+    
+    GOOD (new_reg);
+
+    return TRUE;
+}
+
+/*======================================================================
+ *	    Batch Rectangle Union
+ *====================================================================*/
+
+#define EXCHANGE_RECTS(a, b)	\
+    {                           \
+        box_type_t t;		\
+        t = rects[a];           \
+        rects[a] = rects[b];    \
+        rects[b] = t;           \
+    }
+
+static void
+quick_sort_rects (
+    box_type_t rects[],
+    int        numRects)
+{
+    int y1;
+    int x1;
+    int i, j;
+    box_type_t *r;
+
+    /* Always called with numRects > 1 */
+
+    do
+    {
+        if (numRects == 2)
+        {
+            if (rects[0].y1 > rects[1].y1 ||
+                (rects[0].y1 == rects[1].y1 && rects[0].x1 > rects[1].x1))
+	    {
+		EXCHANGE_RECTS (0, 1);
+	    }
+
+            return;
+	}
+
+        /* Choose partition element, stick in location 0 */
+        EXCHANGE_RECTS (0, numRects >> 1);
+        y1 = rects[0].y1;
+        x1 = rects[0].x1;
+
+        /* Partition array */
+        i = 0;
+        j = numRects;
+
+        do
+        {
+            r = &(rects[i]);
+            do
+            {
+                r++;
+                i++;
+	    }
+	    while (i != numRects && (r->y1 < y1 || (r->y1 == y1 && r->x1 < x1)));
+
+	    r = &(rects[j]);
+            do
+            {
+                r--;
+                j--;
+	    }
+            while (y1 < r->y1 || (y1 == r->y1 && x1 < r->x1));
+	    
+            if (i < j)
+		EXCHANGE_RECTS (i, j);
+	}
+        while (i < j);
+
+        /* Move partition element back to middle */
+        EXCHANGE_RECTS (0, j);
+
+        /* Recurse */
+        if (numRects - j - 1 > 1)
+	    quick_sort_rects (&rects[j + 1], numRects - j - 1);
+
+        numRects = j;
+    }
+    while (numRects > 1);
+}
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_validate --
+ *
+ *      Take a ``region'' which is a non-y-x-banded random collection of
+ *      rectangles, and compute a nice region which is the union of all the
+ *      rectangles.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *      The passed-in ``region'' may be modified.
+ *	overlap set to TRUE if any rectangles overlapped,
+ *      else FALSE;
+ *
+ * Strategy:
+ *      Step 1. Sort the rectangles into ascending order with primary key y1
+ *		and secondary key x1.
+ *
+ *      Step 2. Split the rectangles into the minimum number of proper y-x
+ *		banded regions.  This may require horizontally merging
+ *		rectangles, and vertically coalescing bands.  
With any luck, + *		this step in an identity transformation (ala the Box widget), + *		or a coalescing into 1 box (ala Menus). + * + *	Step 3. Merge the separate regions down to a single region by calling + *		pixman_region_union.  Maximize the work each pixman_region_union call does by using + *		a binary merge. + * + *----------------------------------------------------------------------- + */ + +static pixman_bool_t +validate (region_type_t * badreg) +{ +    /* Descriptor for regions under construction  in Step 2. */ +    typedef struct +    { +        region_type_t reg; +        int prev_band; +        int cur_band; +    } region_info_t; + +    region_info_t stack_regions[64]; + +    int numRects;                   /* Original numRects for badreg	    */ +    region_info_t *ri;              /* Array of current regions		    */ +    int num_ri;                     /* Number of entries used in ri	    */ +    int size_ri;                    /* Number of entries available in ri    */ +    int i;                          /* Index into rects			    */ +    int j;                          /* Index into ri			    */ +    region_info_t *rit;             /* &ri[j]				    */ +    region_type_t *reg;             /* ri[j].reg			    */ +    box_type_t *box;                /* Current box in rects		    */ +    box_type_t *ri_box;             /* Last box in ri[j].reg		    */ +    region_type_t *hreg;            /* ri[j_half].reg			    */ +    pixman_bool_t ret = TRUE; + +    if (!badreg->data) +    { +        GOOD (badreg); +        return TRUE; +    } +     +    numRects = badreg->data->numRects; +    if (!numRects) +    { +        if (PIXREGION_NAR (badreg)) +	    return FALSE; +        GOOD (badreg); +        return TRUE; +    } +     +    if (badreg->extents.x1 < badreg->extents.x2) +    { +        if ((numRects) == 1) +        { +            FREE_DATA (badreg); +            badreg->data = (region_data_type_t *) NULL; +	} +        else +        { +            DOWNSIZE (badreg, numRects); +	} + +        GOOD (badreg); + +	return TRUE; +    } + +    /* Step 1: Sort the rects array into ascending (y1, x1) order */ +    quick_sort_rects (PIXREGION_BOXPTR (badreg), numRects); + +    /* Step 2: Scatter the sorted array into the minimum number of regions */ + +    /* Set up the first region to be the first rectangle in badreg */ +    /* Note that step 2 code will never overflow the ri[0].reg rects array */ +    ri = stack_regions; +    size_ri = sizeof (stack_regions) / sizeof (stack_regions[0]); +    num_ri = 1; +    ri[0].prev_band = 0; +    ri[0].cur_band = 0; +    ri[0].reg = *badreg; +    box = PIXREGION_BOXPTR (&ri[0].reg); +    ri[0].reg.extents = *box; +    ri[0].reg.data->numRects = 1; +    badreg->extents = *pixman_region_empty_box; +    badreg->data = pixman_region_empty_data; + +    /* Now scatter rectangles into the minimum set of valid regions.  If the +     * next rectangle to be added to a region would force an existing rectangle +     * in the region to be split up in order to maintain y-x banding, just +     * forget it.  Try the next region.  If it doesn't fit cleanly into any +     * region, make a new one. +     */ + +    for (i = numRects; --i > 0;) +    { +        box++; +        /* Look for a region to append box to */ +        for (j = num_ri, rit = ri; --j >= 0; rit++) +        { +            reg = &rit->reg; +            ri_box = PIXREGION_END (reg); + +            if (box->y1 == ri_box->y1 && box->y2 == ri_box->y2) +            { +                /* box is in same band as ri_box.  
Merge or append it */ +                if (box->x1 <= ri_box->x2) +                { +                    /* Merge it with ri_box */ +                    if (box->x2 > ri_box->x2) +			ri_box->x2 = box->x2; +		} +                else +                { +                    RECTALLOC_BAIL (reg, 1, bail); +                    *PIXREGION_TOP (reg) = *box; +                    reg->data->numRects++; +		} +		 +                goto next_rect;   /* So sue me */ +	    } +            else if (box->y1 >= ri_box->y2) +            { +                /* Put box into new band */ +                if (reg->extents.x2 < ri_box->x2) +		    reg->extents.x2 = ri_box->x2; +		 +                if (reg->extents.x1 > box->x1) +		    reg->extents.x1 = box->x1; +		 +                COALESCE (reg, rit->prev_band, rit->cur_band); +                rit->cur_band = reg->data->numRects; +                RECTALLOC_BAIL (reg, 1, bail); +                *PIXREGION_TOP (reg) = *box; +                reg->data->numRects++; + +                goto next_rect; +	    } +            /* Well, this region was inappropriate.  Try the next one. */ +	} /* for j */ + +        /* Uh-oh.  No regions were appropriate.  Create a new one. */ +        if (size_ri == num_ri) +        { +            size_t data_size; + +            /* Oops, allocate space for new region information */ +            size_ri <<= 1; + +            data_size = size_ri * sizeof(region_info_t); +            if (data_size / size_ri != sizeof(region_info_t)) +		goto bail; + +            if (ri == stack_regions) +            { +                rit = malloc (data_size); +                if (!rit) +		    goto bail; +                memcpy (rit, ri, num_ri * sizeof (region_info_t)); +	    } +            else +            { +                rit = (region_info_t *) realloc (ri, data_size); +                if (!rit) +		    goto bail; +	    } +            ri = rit; +            rit = &ri[num_ri]; +	} +        num_ri++; +        rit->prev_band = 0; +        rit->cur_band = 0; +        rit->reg.extents = *box; +        rit->reg.data = (region_data_type_t *)NULL; + +	/* MUST force allocation */ +        if (!pixman_rect_alloc (&rit->reg, (i + num_ri) / num_ri)) +	    goto bail; +	 +    next_rect: ; +    } /* for i */ + +    /* Make a final pass over each region in order to COALESCE and set +     * extents.x2 and extents.y2 +     */ +    for (j = num_ri, rit = ri; --j >= 0; rit++) +    { +        reg = &rit->reg; +        ri_box = PIXREGION_END (reg); +        reg->extents.y2 = ri_box->y2; + +        if (reg->extents.x2 < ri_box->x2) +	    reg->extents.x2 = ri_box->x2; +	 +        COALESCE (reg, rit->prev_band, rit->cur_band); + +	if (reg->data->numRects == 1) /* keep unions happy below */ +        { +            FREE_DATA (reg); +            reg->data = (region_data_type_t *)NULL; +	} +    } + +    /* Step 3: Union all regions into a single region */ +    while (num_ri > 1) +    { +        int half = num_ri / 2; +        for (j = num_ri & 1; j < (half + (num_ri & 1)); j++) +        { +            reg = &ri[j].reg; +            hreg = &ri[j + half].reg; + +            if (!pixman_op (reg, reg, hreg, pixman_region_union_o, TRUE, TRUE)) +		ret = FALSE; + +            if (hreg->extents.x1 < reg->extents.x1) +		reg->extents.x1 = hreg->extents.x1; + +            if (hreg->extents.y1 < reg->extents.y1) +		reg->extents.y1 = hreg->extents.y1; + +            if (hreg->extents.x2 > reg->extents.x2) +		reg->extents.x2 = hreg->extents.x2; + +            if (hreg->extents.y2 > reg->extents.y2) 
+		reg->extents.y2 = hreg->extents.y2; + +            FREE_DATA (hreg); +	} + +        num_ri -= half; + +	if (!ret) +	    goto bail; +    } + +    *badreg = ri[0].reg; + +    if (ri != stack_regions) +	free (ri); + +    GOOD (badreg); +    return ret; + +bail: +    for (i = 0; i < num_ri; i++) +	FREE_DATA (&ri[i].reg); + +    if (ri != stack_regions) +	free (ri); + +    return pixman_break (badreg); +} + +/*====================================================================== + *                Region Subtraction + *====================================================================*/ + +/*- + *----------------------------------------------------------------------- + * pixman_region_subtract_o -- + *	Overlapping band subtraction. x1 is the left-most point not yet + *	checked. + * + * Results: + *	TRUE if successful. + * + * Side Effects: + *	region may have rectangles added to it. + * + *----------------------------------------------------------------------- + */ +/*ARGSUSED*/ +static pixman_bool_t +pixman_region_subtract_o (region_type_t * region, +			  box_type_t *    r1, +			  box_type_t *    r1_end, +			  box_type_t *    r2, +			  box_type_t *    r2_end, +			  int             y1, +			  int             y2) +{ +    box_type_t *        next_rect; +    int x1; + +    x1 = r1->x1; + +    critical_if_fail (y1 < y2); +    critical_if_fail (r1 != r1_end && r2 != r2_end); + +    next_rect = PIXREGION_TOP (region); + +    do +    { +        if (r2->x2 <= x1) +        { +            /* +	     * Subtrahend entirely to left of minuend: go to next subtrahend. +	     */ +            r2++; +	} +        else if (r2->x1 <= x1) +        { +            /* +	     * Subtrahend precedes minuend: nuke left edge of minuend. +	     */ +            x1 = r2->x2; +            if (x1 >= r1->x2) +            { +                /* +		 * Minuend completely covered: advance to next minuend and +		 * reset left fence to edge of new minuend. +		 */ +                r1++; +                if (r1 != r1_end) +		    x1 = r1->x1; +	    } +            else +            { +                /* +		 * Subtrahend now used up since it doesn't extend beyond +		 * minuend +		 */ +                r2++; +	    } +	} +        else if (r2->x1 < r1->x2) +        { +            /* +	     * Left part of subtrahend covers part of minuend: add uncovered +	     * part of minuend to region and skip to next subtrahend. +	     */ +            critical_if_fail (x1 < r2->x1); +            NEWRECT (region, next_rect, x1, y1, r2->x1, y2); + +            x1 = r2->x2; +            if (x1 >= r1->x2) +            { +                /* +		 * Minuend used up: advance to new... +		 */ +                r1++; +                if (r1 != r1_end) +		    x1 = r1->x1; +	    } +            else +            { +                /* +		 * Subtrahend used up +		 */ +                r2++; +	    } +	} +        else +        { +            /* +	     * Minuend used up: add any remaining piece before advancing. +	     */ +            if (r1->x2 > x1) +		NEWRECT (region, next_rect, x1, y1, r1->x2, y2); + +            r1++; + +	    if (r1 != r1_end) +		x1 = r1->x1; +	} +    } +    while ((r1 != r1_end) && (r2 != r2_end)); + +    /* +     * Add remaining minuend rectangles to region. 
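+     * (Once the subtrahends are exhausted, everything from x1 to the right
+     * edge of each remaining minuend in this band is uncovered.)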
+     */
+    while (r1 != r1_end)
+    {
+        critical_if_fail (x1 < r1->x2);
+
+        NEWRECT (region, next_rect, x1, y1, r1->x2, y2);
+
+        r1++;
+        if (r1 != r1_end)
+            x1 = r1->x1;
+    }
+    return TRUE;
+}
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_subtract --
+ *	Subtract reg_s from reg_m and leave the result in reg_d.
+ *	S stands for subtrahend, M for minuend and D for difference.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	reg_d is overwritten.
+ *
+ *-----------------------------------------------------------------------
+ */
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_subtract) (region_type_t *reg_d,
+                    region_type_t *reg_m,
+                    region_type_t *reg_s)
+{
+    GOOD (reg_m);
+    GOOD (reg_s);
+    GOOD (reg_d);
+
+    /* check for trivial rejects */
+    if (PIXREGION_NIL (reg_m) || PIXREGION_NIL (reg_s) ||
+        !EXTENTCHECK (&reg_m->extents, &reg_s->extents))
+    {
+        if (PIXREGION_NAR (reg_s))
+            return pixman_break (reg_d);
+
+        return PREFIX (_copy) (reg_d, reg_m);
+    }
+    else if (reg_m == reg_s)
+    {
+        FREE_DATA (reg_d);
+        reg_d->extents.x2 = reg_d->extents.x1;
+        reg_d->extents.y2 = reg_d->extents.y1;
+        reg_d->data = pixman_region_empty_data;
+
+        return TRUE;
+    }
+
+    /* Add those rectangles in region 1 that aren't in region 2,
+       do yucky subtraction for overlaps, and
+       just throw away rectangles in region 2 that aren't in region 1 */
+    if (!pixman_op (reg_d, reg_m, reg_s, pixman_region_subtract_o, TRUE, FALSE))
+        return FALSE;
+
+    /*
+     * Can't alter reg_d's extents before we call pixman_op because
+     * it might be one of the source regions and pixman_op depends
+     * on the extents of those regions being unaltered. Besides, this
+     * way there's no checking against rectangles that will be nuked
+     * due to coalescing, so we have to examine fewer rectangles.
+     */
+    pixman_set_extents (reg_d);
+    GOOD (reg_d);
+    return TRUE;
+}
+
+/*======================================================================
+ *	    Region Inversion
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_inverse --
+ *	Take a region and a box and return a region that is everything
+ *	in the box but not in the region. The careful reader will note
+ *	that this is the same as subtracting the region from the box...
+ *
+ * Results:
+ *	TRUE.
+ *
+ * Side Effects:
+ *	new_reg is overwritten.
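+ *
+ *	For example, passing the region's own extents box as inv_rect
+ *	computes the gaps that the region leaves uncovered inside its
+ *	bounding box.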
+ *
+ *-----------------------------------------------------------------------
+ */
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_inverse) (region_type_t *new_reg,  /* Destination region */
+		   region_type_t *reg1,     /* Region to invert */
+		   box_type_t *   inv_rect) /* Bounding box for inversion */
+{
+    region_type_t inv_reg; /* Quick and dirty region made from the
+			    * bounding box */
+    GOOD (reg1);
+    GOOD (new_reg);
+
+    /* check for trivial rejects */
+    if (PIXREGION_NIL (reg1) || !EXTENTCHECK (inv_rect, &reg1->extents))
+    {
+        if (PIXREGION_NAR (reg1))
+            return pixman_break (new_reg);
+
+        new_reg->extents = *inv_rect;
+        FREE_DATA (new_reg);
+        new_reg->data = (region_data_type_t *)NULL;
+
+        return TRUE;
+    }
+
+    /* Add those rectangles in region 1 that aren't in region 2,
+     * do yucky subtraction for overlaps, and
+     * just throw away rectangles in region 2 that aren't in region 1
+     */
+    inv_reg.extents = *inv_rect;
+    inv_reg.data = (region_data_type_t *)NULL;
+    if (!pixman_op (new_reg, &inv_reg, reg1, pixman_region_subtract_o, TRUE, FALSE))
+        return FALSE;
+
+    /*
+     * Can't alter new_reg's extents before we call pixman_op because
+     * it might be one of the source regions and pixman_op depends
+     * on the extents of those regions being unaltered. Besides, this
+     * way there's no checking against rectangles that will be nuked
+     * due to coalescing, so we have to examine fewer rectangles.
+     */
+    pixman_set_extents (new_reg);
+    GOOD (new_reg);
+    return TRUE;
+}
+
+/* In time O(log n), locate the first box whose y2 is greater than y.
+ * Return @end if no such box exists.
+ */
+static box_type_t *
+find_box_for_y (box_type_t *begin, box_type_t *end, int y)
+{
+    box_type_t *mid;
+
+    if (end == begin)
+	return end;
+
+    if (end - begin == 1)
+    {
+	if (begin->y2 > y)
+	    return begin;
+	else
+	    return end;
+    }
+
+    mid = begin + (end - begin) / 2;
+    if (mid->y2 > y)
+    {
+	/* If no box is found in [begin, mid], the function
+	 * will return @mid, which is then known to be the
+	 * correct answer.
+	 */
+	return find_box_for_y (begin, mid, y);
+    }
+    else
+    {
+	return find_box_for_y (mid, end, y);
+    }
+}
+
+/*
+ *   rect_in(region, rect)
+ *   This routine takes a pointer to a region and a pointer to a box
+ *   and determines if the box is outside/inside/partly inside the region.
+ *
+ *   The idea is to travel through the list of rectangles trying to cover the
+ *   passed box with them. Anytime a piece of the rectangle isn't covered
+ *   by a band of rectangles, part_out is set TRUE. Any time a rectangle in
+ *   the region covers part of the box, part_in is set TRUE.
+ *   The process ends when either the box has been completely covered (we
+ *   reached a band that doesn't overlap the box, part_in is TRUE and
+ *   part_out is false), the box has been partially covered
+ *   (part_in == part_out == TRUE -- because of the banding, the first time
+ *   this is true we know the box is only partially in the region) or is
+ *   outside the region (we reached a band that doesn't overlap the box at
+ *   all and part_in is false)
+ */
+PIXMAN_EXPORT pixman_region_overlap_t
+PREFIX (_contains_rectangle) (region_type_t *  region,
+			      box_type_t *     prect)
+{
+    box_type_t *     pbox;
+    box_type_t *     pbox_end;
+    int part_in, part_out;
+    int numRects;
+    int x, y;
+
+    GOOD (region);
+
+    numRects = PIXREGION_NUMRECTS (region);
+
+    /* useful optimization */
+    if (!numRects || !EXTENTCHECK (&region->extents, prect))
+	return(PIXMAN_REGION_OUT);
+
+    if (numRects == 1)
+    {
+        /* We know that it must be PIXMAN_REGION_IN or PIXMAN_REGION_PART */
+        if (SUBSUMES (&region->extents, prect))
+	    return(PIXMAN_REGION_IN);
+        else
+	    return(PIXMAN_REGION_PART);
+    }
+
+    part_out = FALSE;
+    part_in = FALSE;
+
+    /* (x,y) starts at upper left of rect, moving to the right and down */
+    x = prect->x1;
+    y = prect->y1;
+
+    /* can stop when both part_out and part_in are TRUE, or we reach prect->y2 */
+    for (pbox = PIXREGION_BOXPTR (region), pbox_end = pbox + numRects;
+	 pbox != pbox_end;
+	 pbox++)
+    {
+	/* getting up to speed or skipping remainder of band */
+	if (pbox->y2 <= y)
+	{
+	    if ((pbox = find_box_for_y (pbox, pbox_end, y)) == pbox_end)
+		break;
+	}
+
+        if (pbox->y1 > y)
+        {
+            part_out = TRUE;     /* missed part of rectangle above */
+            if (part_in || (pbox->y1 >= prect->y2))
+		break;
+            y = pbox->y1;       /* x guaranteed to be == prect->x1 */
+	}
+
+        if (pbox->x2 <= x)
+	    continue;           /* not far enough over yet */
+
+        if (pbox->x1 > x)
+        {
+            part_out = TRUE;     /* missed part of rectangle to left */
+            if (part_in)
+		break;
+	}
+
+        if (pbox->x1 < prect->x2)
+        {
+            part_in = TRUE;      /* definitely overlap */
+            if (part_out)
+		break;
+	}
+
+        if (pbox->x2 >= prect->x2)
+        {
+            y = pbox->y2;       /* finished with this band */
+            if (y >= prect->y2)
+		break;
+            x = prect->x1;      /* reset x out to left again */
+	}
+        else
+        {
+            /*
+	     * Because boxes in a band are maximal width, if the first box
+	     * to overlap the rectangle doesn't completely cover it in that
+	     * band, the rectangle must be partially out, since some of it
+	     * will be uncovered in that band. part_in will have been set true
+	     * by now...
+	     */ +            part_out = TRUE; +            break; +	} +    } + +    if (part_in) +    { +        if (y < prect->y2) +	    return PIXMAN_REGION_PART; +        else +	    return PIXMAN_REGION_IN; +    } +    else +    { +        return PIXMAN_REGION_OUT; +    } +} + +/* PREFIX(_translate) (region, x, y) + * translates in place + */ + +PIXMAN_EXPORT void +PREFIX (_translate) (region_type_t *region, int x, int y) +{ +    overflow_int_t x1, x2, y1, y2; +    int nbox; +    box_type_t * pbox; + +    GOOD (region); +    region->extents.x1 = x1 = region->extents.x1 + x; +    region->extents.y1 = y1 = region->extents.y1 + y; +    region->extents.x2 = x2 = region->extents.x2 + x; +    region->extents.y2 = y2 = region->extents.y2 + y; +     +    if (((x1 - PIXMAN_REGION_MIN) | (y1 - PIXMAN_REGION_MIN) | (PIXMAN_REGION_MAX - x2) | (PIXMAN_REGION_MAX - y2)) >= 0) +    { +        if (region->data && (nbox = region->data->numRects)) +        { +            for (pbox = PIXREGION_BOXPTR (region); nbox--; pbox++) +            { +                pbox->x1 += x; +                pbox->y1 += y; +                pbox->x2 += x; +                pbox->y2 += y; +	    } +	} +        return; +    } + +    if (((x2 - PIXMAN_REGION_MIN) | (y2 - PIXMAN_REGION_MIN) | (PIXMAN_REGION_MAX - x1) | (PIXMAN_REGION_MAX - y1)) <= 0) +    { +        region->extents.x2 = region->extents.x1; +        region->extents.y2 = region->extents.y1; +        FREE_DATA (region); +        region->data = pixman_region_empty_data; +        return; +    } + +    if (x1 < PIXMAN_REGION_MIN) +	region->extents.x1 = PIXMAN_REGION_MIN; +    else if (x2 > PIXMAN_REGION_MAX) +	region->extents.x2 = PIXMAN_REGION_MAX; + +    if (y1 < PIXMAN_REGION_MIN) +	region->extents.y1 = PIXMAN_REGION_MIN; +    else if (y2 > PIXMAN_REGION_MAX) +	region->extents.y2 = PIXMAN_REGION_MAX; + +    if (region->data && (nbox = region->data->numRects)) +    { +        box_type_t * pbox_out; + +        for (pbox_out = pbox = PIXREGION_BOXPTR (region); nbox--; pbox++) +        { +            pbox_out->x1 = x1 = pbox->x1 + x; +            pbox_out->y1 = y1 = pbox->y1 + y; +            pbox_out->x2 = x2 = pbox->x2 + x; +            pbox_out->y2 = y2 = pbox->y2 + y; + +            if (((x2 - PIXMAN_REGION_MIN) | (y2 - PIXMAN_REGION_MIN) | +                 (PIXMAN_REGION_MAX - x1) | (PIXMAN_REGION_MAX - y1)) <= 0) +            { +                region->data->numRects--; +                continue; +	    } + +            if (x1 < PIXMAN_REGION_MIN) +		pbox_out->x1 = PIXMAN_REGION_MIN; +            else if (x2 > PIXMAN_REGION_MAX) +		pbox_out->x2 = PIXMAN_REGION_MAX; + +            if (y1 < PIXMAN_REGION_MIN) +		pbox_out->y1 = PIXMAN_REGION_MIN; +            else if (y2 > PIXMAN_REGION_MAX) +		pbox_out->y2 = PIXMAN_REGION_MAX; + +            pbox_out++; +	} + +        if (pbox_out != pbox) +        { +            if (region->data->numRects == 1) +            { +                region->extents = *PIXREGION_BOXPTR (region); +                FREE_DATA (region); +                region->data = (region_data_type_t *)NULL; +	    } +            else +	    { +		pixman_set_extents (region); +	    } +	} +    } + +    GOOD (region); +} + +PIXMAN_EXPORT void +PREFIX (_reset) (region_type_t *region, box_type_t *box) +{ +    GOOD (region); + +    critical_if_fail (GOOD_RECT (box)); + +    region->extents = *box; + +    FREE_DATA (region); + +    region->data = NULL; +} + +PIXMAN_EXPORT void +PREFIX (_clear) (region_type_t *region) +{ +    GOOD (region); +    FREE_DATA (region); + +    
+    region->extents = *pixman_region_empty_box;
+    region->data = pixman_region_empty_data;
+}
+
+/* box is "return" value */
+PIXMAN_EXPORT int
+PREFIX (_contains_point) (region_type_t * region,
+                          int x, int y,
+                          box_type_t * box)
+{
+    box_type_t *pbox, *pbox_end;
+    int numRects;
+
+    GOOD (region);
+    numRects = PIXREGION_NUMRECTS (region);
+
+    if (!numRects || !INBOX (&region->extents, x, y))
+	return(FALSE);
+
+    if (numRects == 1)
+    {
+        if (box)
+	    *box = region->extents;
+
+        return(TRUE);
+    }
+
+    pbox = PIXREGION_BOXPTR (region);
+    pbox_end = pbox + numRects;
+
+    pbox = find_box_for_y (pbox, pbox_end, y);
+
+    for (;pbox != pbox_end; pbox++)
+    {
+        if ((y < pbox->y1) || (x < pbox->x1))
+	    break;              /* missed it */
+
+        if (x >= pbox->x2)
+	    continue;           /* not there yet */
+
+        if (box)
+	    *box = *pbox;
+
+        return(TRUE);
+    }
+
+    return(FALSE);
+}
+
+PIXMAN_EXPORT int
+PREFIX (_not_empty) (region_type_t * region)
+{
+    GOOD (region);
+
+    return(!PIXREGION_NIL (region));
+}
+
+PIXMAN_EXPORT box_type_t *
+PREFIX (_extents) (region_type_t * region)
+{
+    GOOD (region);
+
+    return(&region->extents);
+}
+
+/*
+ * Clip a list of scanlines to a region.  The caller has allocated the
+ * space.  FSorted is non-zero if the scanline origins are in ascending order.
+ *
+ * returns the number of new, clipped scanlines.
+ */
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_selfcheck) (region_type_t *reg)
+{
+    int i, numRects;
+
+    if ((reg->extents.x1 > reg->extents.x2) ||
+        (reg->extents.y1 > reg->extents.y2))
+    {
+	return FALSE;
+    }
+
+    numRects = PIXREGION_NUMRECTS (reg);
+    if (!numRects)
+    {
+	return ((reg->extents.x1 == reg->extents.x2) &&
+	        (reg->extents.y1 == reg->extents.y2) &&
+	        (reg->data->size || (reg->data == pixman_region_empty_data)));
+    }
+    else if (numRects == 1)
+    {
+	return (!reg->data);
+    }
+    else
+    {
+        box_type_t * pbox_p, * pbox_n;
+        box_type_t box;
+
+        pbox_p = PIXREGION_RECTS (reg);
+        box = *pbox_p;
+        box.y2 = pbox_p[numRects - 1].y2;
+        pbox_n = pbox_p + 1;
+
+        for (i = numRects; --i > 0; pbox_p++, pbox_n++)
+        {
+            if ((pbox_n->x1 >= pbox_n->x2) ||
+                (pbox_n->y1 >= pbox_n->y2))
+	    {
+		return FALSE;
+	    }
+
+            if (pbox_n->x1 < box.x1)
+		box.x1 = pbox_n->x1;
+
+            if (pbox_n->x2 > box.x2)
+		box.x2 = pbox_n->x2;
+
+            if ((pbox_n->y1 < pbox_p->y1) ||
+                ((pbox_n->y1 == pbox_p->y1) &&
+                 ((pbox_n->x1 < pbox_p->x2) || (pbox_n->y2 != pbox_p->y2))))
+	    {
+		return FALSE;
+	    }
+	}
+
+        return ((box.x1 == reg->extents.x1) &&
+                (box.x2 == reg->extents.x2) &&
+                (box.y1 == reg->extents.y1) &&
+                (box.y2 == reg->extents.y2));
+    }
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_init_rects) (region_type_t *region,
+                      const box_type_t *boxes, int count)
+{
+    box_type_t *rects;
+    int displacement;
+    int i;
+
+    /* if it's 1, then we just want to set the extents, so call
+     * the existing method.
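+     * (A single box is trivially a valid y-x banded region, so the
+     * validate pass used for larger counts below is not needed.)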
*/ +    if (count == 1) +    { +        PREFIX (_init_rect) (region, +                             boxes[0].x1, +                             boxes[0].y1, +                             boxes[0].x2 - boxes[0].x1, +                             boxes[0].y2 - boxes[0].y1); +        return TRUE; +    } + +    PREFIX (_init) (region); + +    /* if it's 0, don't call pixman_rect_alloc -- 0 rectangles is +     * a special case, and causing pixman_rect_alloc would cause +     * us to leak memory (because the 0-rect case should be the +     * static pixman_region_empty_data data). +     */ +    if (count == 0) +	return TRUE; + +    if (!pixman_rect_alloc (region, count)) +	return FALSE; + +    rects = PIXREGION_RECTS (region); + +    /* Copy in the rects */ +    memcpy (rects, boxes, sizeof(box_type_t) * count); +    region->data->numRects = count; + +    /* Eliminate empty and malformed rectangles */ +    displacement = 0; + +    for (i = 0; i < count; ++i) +    { +        box_type_t *box = &rects[i]; + +        if (box->x1 >= box->x2 || box->y1 >= box->y2) +	    displacement++; +        else if (displacement) +	    rects[i - displacement] = rects[i]; +    } + +    region->data->numRects -= displacement; + +    /* If eliminating empty rectangles caused there +     * to be only 0 or 1 rectangles, deal with that. +     */ +    if (region->data->numRects == 0) +    { +        FREE_DATA (region); +        PREFIX (_init) (region); + +        return TRUE; +    } + +    if (region->data->numRects == 1) +    { +        region->extents = rects[0]; + +        FREE_DATA (region); +        region->data = NULL; + +        GOOD (region); + +        return TRUE; +    } + +    /* Validate */ +    region->extents.x1 = region->extents.x2 = 0; + +    return validate (region); +} + +#define READ(_ptr) (*(_ptr)) + +static inline box_type_t * +bitmap_addrect (region_type_t *reg, +                box_type_t *r, +                box_type_t **first_rect, +                int rx1, int ry1, +                int rx2, int ry2) +{ +    if ((rx1 < rx2) && (ry1 < ry2) && +	(!(reg->data->numRects && +	   ((r-1)->y1 == ry1) && ((r-1)->y2 == ry2) && +	   ((r-1)->x1 <= rx1) && ((r-1)->x2 >= rx2)))) +    { +	if (reg->data->numRects == reg->data->size) +	{ +	    if (!pixman_rect_alloc (reg, 1)) +		return NULL; +	    *first_rect = PIXREGION_BOXPTR(reg); +	    r = *first_rect + reg->data->numRects; +	} +	r->x1 = rx1; +	r->y1 = ry1; +	r->x2 = rx2; +	r->y2 = ry2; +	reg->data->numRects++; +	if (r->x1 < reg->extents.x1) +	    reg->extents.x1 = r->x1; +	if (r->x2 > reg->extents.x2) +	    reg->extents.x2 = r->x2; +	r++; +    } +    return r; +} + +/* Convert bitmap clip mask into clipping region. + * First, goes through each line and makes boxes by noting the transitions + * from 0 to 1 and 1 to 0. + * Then it coalesces the current line with the previous if they have boxes + * at the same X coordinates. + * Stride is in number of uint32_t per line. 
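+ *
+ * For illustration: a scanline whose first bits are 0 1 1 0 1 1 1 0 yields
+ * the boxes [1,3) and [4,7) for that row; if the next scanline yields boxes
+ * with identical x coordinates, the previous row's boxes simply get their
+ * y2 extended by one instead of new boxes being added.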
+ */ +PIXMAN_EXPORT void +PREFIX (_init_from_image) (region_type_t *region, +                           pixman_image_t *image) +{ +    uint32_t mask0 = 0xffffffff & ~SCREEN_SHIFT_RIGHT(0xffffffff, 1); +    box_type_t *first_rect, *rects, *prect_line_start; +    box_type_t *old_rect, *new_rect; +    uint32_t *pw, w, *pw_line, *pw_line_end; +    int	irect_prev_start, irect_line_start; +    int	h, base, rx1 = 0, crects; +    int	ib; +    pixman_bool_t in_box, same; +    int width, height, stride; + +    PREFIX(_init) (region); + +    critical_if_fail (region->data); + +    return_if_fail (image->type == BITS); +    return_if_fail (image->bits.format == PIXMAN_a1); + +    pw_line = pixman_image_get_data (image); +    width = pixman_image_get_width (image); +    height = pixman_image_get_height (image); +    stride = pixman_image_get_stride (image) / 4; + +    first_rect = PIXREGION_BOXPTR(region); +    rects = first_rect; + +    region->extents.x1 = width - 1; +    region->extents.x2 = 0; +    irect_prev_start = -1; +    for (h = 0; h < height; h++) +    { +        pw = pw_line; +        pw_line += stride; +        irect_line_start = rects - first_rect; + +        /* If the Screen left most bit of the word is set, we're starting in +         * a box */ +        if (READ(pw) & mask0) +        { +            in_box = TRUE; +            rx1 = 0; +        } +        else +        { +            in_box = FALSE; +        } + +        /* Process all words which are fully in the pixmap */ +        pw_line_end = pw + (width >> 5); +        for (base = 0; pw < pw_line_end; base += 32) +        { +            w = READ(pw++); +            if (in_box) +            { +                if (!~w) +                    continue; +            } +            else +            { +                if (!w) +                    continue; +            } +            for (ib = 0; ib < 32; ib++) +            { +                /* If the Screen left most bit of the word is set, we're +                 * starting a box */ +                if (w & mask0) +                { +                    if (!in_box) +                    { +                        rx1 = base + ib; +                        /* start new box */ +                        in_box = TRUE; +                    } +                } +                else +                { +                    if (in_box) +                    { +                        /* end box */ +                        rects = bitmap_addrect (region, rects, &first_rect, +                                                rx1, h, base + ib, h + 1); +                        if (rects == NULL) +                            goto error; +                        in_box = FALSE; +                    } +                } +                /* Shift the word VISUALLY left one. 
*/ +                w = SCREEN_SHIFT_LEFT(w, 1); +            } +        } + +        if (width & 31) +        { +            /* Process final partial word on line */ +             w = READ(pw++); +            for (ib = 0; ib < (width & 31); ib++) +            { +                /* If the Screen left most bit of the word is set, we're +                 * starting a box */ +                if (w & mask0) +                { +                    if (!in_box) +                    { +                        rx1 = base + ib; +                        /* start new box */ +                        in_box = TRUE; +                    } +                } +                else +                { +                    if (in_box) +                    { +                        /* end box */ +                        rects = bitmap_addrect(region, rects, &first_rect, +					       rx1, h, base + ib, h + 1); +			if (rects == NULL) +			    goto error; +                        in_box = FALSE; +                    } +                } +                /* Shift the word VISUALLY left one. */ +                w = SCREEN_SHIFT_LEFT(w, 1); +            } +        } +        /* If scanline ended with last bit set, end the box */ +        if (in_box) +        { +            rects = bitmap_addrect(region, rects, &first_rect, +				   rx1, h, base + (width & 31), h + 1); +	    if (rects == NULL) +		goto error; +        } +        /* if all rectangles on this line have the same x-coords as +         * those on the previous line, then add 1 to all the previous  y2s and +         * throw away all the rectangles from this line +         */ +        same = FALSE; +        if (irect_prev_start != -1) +        { +            crects = irect_line_start - irect_prev_start; +            if (crects != 0 && +                crects == ((rects - first_rect) - irect_line_start)) +            { +                old_rect = first_rect + irect_prev_start; +                new_rect = prect_line_start = first_rect + irect_line_start; +                same = TRUE; +                while (old_rect < prect_line_start) +                { +                    if ((old_rect->x1 != new_rect->x1) || +                        (old_rect->x2 != new_rect->x2)) +                    { +                          same = FALSE; +                          break; +                    } +                    old_rect++; +                    new_rect++; +                } +                if (same) +                { +                    old_rect = first_rect + irect_prev_start; +                    while (old_rect < prect_line_start) +                    { +                        old_rect->y2 += 1; +                        old_rect++; +                    } +                    rects -= crects; +                    region->data->numRects -= crects; +                } +            } +        } +        if(!same) +            irect_prev_start = irect_line_start; +    } +    if (!region->data->numRects) +    { +        region->extents.x1 = region->extents.x2 = 0; +    } +    else +    { +        region->extents.y1 = PIXREGION_BOXPTR(region)->y1; +        region->extents.y2 = PIXREGION_END(region)->y2; +        if (region->data->numRects == 1) +        { +            free (region->data); +            region->data = NULL; +        } +    } + + error: +    return; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-region16.c b/libs/pixman-0.40.0/pixman/pixman-region16.c new file mode 100644 index 0000000..d88d338 --- /dev/null +++ 
b/libs/pixman-0.40.0/pixman/pixman-region16.c @@ -0,0 +1,67 @@ +/* + * Copyright © 2008 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without + * fee, provided that the above copyright notice appear in all copies + * and that both that copyright notice and this permission notice + * appear in supporting documentation, and that the name of + * Red Hat, Inc. not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. Red Hat, Inc. makes no representations about the + * suitability of this software for any purpose.  It is provided "as + * is" without express or implied warranty. + * + * RED HAT, INC. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL RED HAT, INC. BE LIABLE FOR ANY SPECIAL, + * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER + * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR + * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Soren Sandmann <sandmann@redhat.com> + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#undef PIXMAN_DISABLE_DEPRECATED + +#include "pixman-private.h" + +#include <stdlib.h> + +typedef pixman_box16_t		box_type_t; +typedef pixman_region16_data_t	region_data_type_t; +typedef pixman_region16_t	region_type_t; +typedef int32_t                 overflow_int_t; + +typedef struct { +    int x, y; +} point_type_t; + +#define PREFIX(x) pixman_region##x + +#define PIXMAN_REGION_MAX INT16_MAX +#define PIXMAN_REGION_MIN INT16_MIN + +#include "pixman-region.c" + +/* This function exists only to make it possible to preserve the X ABI - + * it should go away at first opportunity. + * + * The problem is that the X ABI exports the three structs and has used + * them through macros. So the X server calls this function with + * the addresses of those structs which makes the existing code continue to + * work. + */ +PIXMAN_EXPORT void +pixman_region_set_static_pointers (pixman_box16_t *empty_box, +				   pixman_region16_data_t *empty_data, +				   pixman_region16_data_t *broken_data) +{ +    pixman_region_empty_box = empty_box; +    pixman_region_empty_data = empty_data; +    pixman_broken_data = broken_data; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-region32.c b/libs/pixman-0.40.0/pixman/pixman-region32.c new file mode 100644 index 0000000..abd6b1a --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-region32.c @@ -0,0 +1,47 @@ +/* + * Copyright © 2008 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without + * fee, provided that the above copyright notice appear in all copies + * and that both that copyright notice and this permission notice + * appear in supporting documentation, and that the name of + * Red Hat, Inc. not be used in advertising or publicity pertaining to + * distribution of the software without specific, written prior + * permission. Red Hat, Inc. makes no representations about the + * suitability of this software for any purpose.  It is provided "as + * is" without express or implied warranty. + * + * RED HAT, INC. 
DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL RED HAT, INC. BE LIABLE FOR ANY SPECIAL, + * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER + * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR + * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Soren Sandmann <sandmann@redhat.com> + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "pixman-private.h" + +#include <stdlib.h> + +typedef pixman_box32_t		box_type_t; +typedef pixman_region32_data_t	region_data_type_t; +typedef pixman_region32_t	region_type_t; +typedef int64_t                 overflow_int_t; + +typedef struct { +    int x, y; +} point_type_t; + +#define PREFIX(x) pixman_region32##x + +#define PIXMAN_REGION_MAX INT32_MAX +#define PIXMAN_REGION_MIN INT32_MIN + +#include "pixman-region.c" diff --git a/libs/pixman-0.40.0/pixman/pixman-solid-fill.c b/libs/pixman-0.40.0/pixman/pixman-solid-fill.c new file mode 100644 index 0000000..4694ebc --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-solid-fill.c @@ -0,0 +1,67 @@ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007, 2009 Red Hat, Inc. + * Copyright © 2009 Soren Sandmann + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  SuSE makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "pixman-private.h" + +static uint32_t +color_to_uint32 (const pixman_color_t *color) +{ +    return +        ((unsigned int) color->alpha >> 8 << 24) | +        ((unsigned int) color->red >> 8 << 16) | +        ((unsigned int) color->green & 0xff00) | +        ((unsigned int) color->blue >> 8); +} + +static argb_t +color_to_float (const pixman_color_t *color) +{ +    argb_t result; + +    result.a = pixman_unorm_to_float (color->alpha, 16); +    result.r = pixman_unorm_to_float (color->red, 16); +    result.g = pixman_unorm_to_float (color->green, 16); +    result.b = pixman_unorm_to_float (color->blue, 16); + +    return result; +} + +PIXMAN_EXPORT pixman_image_t * +pixman_image_create_solid_fill (const pixman_color_t *color) +{ +    pixman_image_t *img = _pixman_image_allocate (); + +    if (!img) +	return NULL; + +    img->type = SOLID; +    img->solid.color = *color; +    img->solid.color_32 = color_to_uint32 (color); +    img->solid.color_float = color_to_float (color); + +    return img; +} + diff --git a/libs/pixman-0.40.0/pixman/pixman-sse2.c b/libs/pixman-0.40.0/pixman/pixman-sse2.c new file mode 100644 index 0000000..2644b0a --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-sse2.c @@ -0,0 +1,6527 @@ +/* + * Copyright © 2008 Rodrigo Kumpera + * Copyright © 2008 André Tupinambá + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  Red Hat makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + * Author:  Rodrigo Kumpera (kumpera@gmail.com) + *          André Tupinambá (andrelrt@gmail.com) + * + * Based on work by Owen Taylor and Søren Sandmann + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */ +#define PSHUFD_IS_FAST 0 + +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ +#include <emmintrin.h> /* for SSE2 intrinsics */ +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-inlines.h" + +static __m128i mask_0080; +static __m128i mask_00ff; +static __m128i mask_0101; +static __m128i mask_ffff; +static __m128i mask_ff000000; +static __m128i mask_alpha; + +static __m128i mask_565_r; +static __m128i mask_565_g1, mask_565_g2; +static __m128i mask_565_b; +static __m128i mask_red; +static __m128i mask_green; +static __m128i mask_blue; + +static __m128i mask_565_fix_rb; +static __m128i mask_565_fix_g; + +static __m128i mask_565_rb; +static __m128i mask_565_pack_multiplier; + +static force_inline __m128i +unpack_32_1x128 (uint32_t data) +{ +    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); +} + +static force_inline void +unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) +{ +    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); +    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); +} + +static force_inline __m128i +unpack_565_to_8888 (__m128i lo) +{ +    __m128i r, g, b, rb, t; + +    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); +    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); +    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); + +    rb = _mm_or_si128 (r, b); +    t  = _mm_and_si128 (rb, mask_565_fix_rb); +    t  = _mm_srli_epi32 (t, 5); +    rb = _mm_or_si128 (rb, t); + +    t  = _mm_and_si128 (g, mask_565_fix_g); +    t  = _mm_srli_epi32 (t, 6); +    g  = _mm_or_si128 (g, t); + +    return _mm_or_si128 (rb, g); +} + +static force_inline void +unpack_565_128_4x128 (__m128i  data, +                      __m128i* data0, +                      __m128i* data1, +                      __m128i* data2, +                      __m128i* data3) +{ +    __m128i lo, hi; + +    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); +    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); + +    lo = unpack_565_to_8888 (lo); +    hi = unpack_565_to_8888 (hi); + +    unpack_128_2x128 (lo, data0, data1); +    unpack_128_2x128 (hi, data2, data3); +} + +static force_inline uint16_t +pack_565_32_16 (uint32_t pixel) +{ +    return (uint16_t) (((pixel >> 8) & 0xf800) | +		       ((pixel >> 5) & 0x07e0) | +		       ((pixel >> 3) & 0x001f)); +} + +static force_inline __m128i +pack_2x128_128 (__m128i lo, __m128i hi) +{ +    return _mm_packus_epi16 (lo, hi); +} + +static force_inline __m128i +pack_565_2packedx128_128 (__m128i lo, __m128i hi) +{ +    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb); +    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb); + +    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier); +    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier); + +    __m128i g0 = _mm_and_si128 (lo, mask_green); +    __m128i g1 = _mm_and_si128 (hi, mask_green); + +    t0 = _mm_or_si128 (t0, g0); +    t1 = _mm_or_si128 (t1, g1); + +    /* Simulates _mm_packus_epi32 */ +    t0 = _mm_slli_epi32 (t0, 16 - 5); +    t1 = _mm_slli_epi32 (t1, 16 - 5); +    t0 = _mm_srai_epi32 (t0, 16); +    t1 = _mm_srai_epi32 (t1, 16); +    return _mm_packs_epi32 (t0, t1); +} + +static 
force_inline __m128i +pack_565_2x128_128 (__m128i lo, __m128i hi) +{ +    __m128i data; +    __m128i r, g1, g2, b; + +    data = pack_2x128_128 (lo, hi); + +    r  = _mm_and_si128 (data, mask_565_r); +    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); +    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); +    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); + +    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); +} + +static force_inline __m128i +pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) +{ +    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), +			     pack_565_2x128_128 (*xmm2, *xmm3)); +} + +static force_inline int +is_opaque (__m128i x) +{ +    __m128i ffs = _mm_cmpeq_epi8 (x, x); + +    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; +} + +static force_inline int +is_zero (__m128i x) +{ +    return _mm_movemask_epi8 ( +	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; +} + +static force_inline int +is_transparent (__m128i x) +{ +    return (_mm_movemask_epi8 ( +		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; +} + +static force_inline __m128i +expand_pixel_32_1x128 (uint32_t data) +{ +    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); +} + +static force_inline __m128i +expand_alpha_1x128 (__m128i data) +{ +    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, +						     _MM_SHUFFLE (3, 3, 3, 3)), +				_MM_SHUFFLE (3, 3, 3, 3)); +} + +static force_inline void +expand_alpha_2x128 (__m128i  data_lo, +                    __m128i  data_hi, +                    __m128i* alpha_lo, +                    __m128i* alpha_hi) +{ +    __m128i lo, hi; + +    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); +    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); + +    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); +    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); +} + +static force_inline void +expand_alpha_rev_2x128 (__m128i  data_lo, +                        __m128i  data_hi, +                        __m128i* alpha_lo, +                        __m128i* alpha_hi) +{ +    __m128i lo, hi; + +    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); +    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); +    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); +    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline void +pix_multiply_2x128 (__m128i* data_lo, +                    __m128i* data_hi, +                    __m128i* alpha_lo, +                    __m128i* alpha_hi, +                    __m128i* ret_lo, +                    __m128i* ret_hi) +{ +    __m128i lo, hi; + +    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); +    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); +    lo = _mm_adds_epu16 (lo, mask_0080); +    hi = _mm_adds_epu16 (hi, mask_0080); +    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); +    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); +} + +static force_inline void +pix_add_multiply_2x128 (__m128i* src_lo, +                        __m128i* src_hi, +                        __m128i* alpha_dst_lo, +                        __m128i* alpha_dst_hi, +                        __m128i* dst_lo, +                        __m128i* dst_hi, +                        __m128i* alpha_src_lo, +                        __m128i* alpha_src_hi, +                        __m128i* ret_lo, +                        
__m128i* ret_hi) +{ +    __m128i t1_lo, t1_hi; +    __m128i t2_lo, t2_hi; + +    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); +    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); + +    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); +    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); +} + +static force_inline void +negate_2x128 (__m128i  data_lo, +              __m128i  data_hi, +              __m128i* neg_lo, +              __m128i* neg_hi) +{ +    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); +    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); +} + +static force_inline void +invert_colors_2x128 (__m128i  data_lo, +                     __m128i  data_hi, +                     __m128i* inv_lo, +                     __m128i* inv_hi) +{ +    __m128i lo, hi; + +    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); +    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); +    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); +    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); +} + +static force_inline void +over_2x128 (__m128i* src_lo, +            __m128i* src_hi, +            __m128i* alpha_lo, +            __m128i* alpha_hi, +            __m128i* dst_lo, +            __m128i* dst_hi) +{ +    __m128i t1, t2; + +    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); + +    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); + +    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); +    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); +} + +static force_inline void +over_rev_non_pre_2x128 (__m128i  src_lo, +                        __m128i  src_hi, +                        __m128i* dst_lo, +                        __m128i* dst_hi) +{ +    __m128i lo, hi; +    __m128i alpha_lo, alpha_hi; + +    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); + +    lo = _mm_or_si128 (alpha_lo, mask_alpha); +    hi = _mm_or_si128 (alpha_hi, mask_alpha); + +    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); + +    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); + +    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); +} + +static force_inline void +in_over_2x128 (__m128i* src_lo, +               __m128i* src_hi, +               __m128i* alpha_lo, +               __m128i* alpha_hi, +               __m128i* mask_lo, +               __m128i* mask_hi, +               __m128i* dst_lo, +               __m128i* dst_hi) +{ +    __m128i s_lo, s_hi; +    __m128i a_lo, a_hi; + +    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi); +    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); + +    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); +} + +/* load 4 pixels from a 16-byte boundary aligned address */ +static force_inline __m128i +load_128_aligned (__m128i* src) +{ +    return _mm_load_si128 (src); +} + +/* load 4 pixels from a unaligned address */ +static force_inline __m128i +load_128_unaligned (const __m128i* src) +{ +    return _mm_loadu_si128 (src); +} + +/* save 4 pixels using Write Combining memory on a 16-byte + * boundary aligned address + */ +static force_inline void +save_128_write_combining (__m128i* dst, +                          __m128i  data) +{ +    _mm_stream_si128 (dst, data); +} + +/* save 4 pixels on a 16-byte boundary aligned address */ +static force_inline void +save_128_aligned (__m128i* dst, +                  __m128i  data) +{ +    _mm_store_si128 (dst, data); +} + +/* save 4 pixels on a unaligned address */ 
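+/* Note: the combine loops below align the destination pointer to a 16-byte
+ * boundary before entering their vector loops, so destination stores use the
+ * aligned variant while source and mask pixels may be loaded unaligned.
+ */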
+static force_inline void +save_128_unaligned (__m128i* dst, +                    __m128i  data) +{ +    _mm_storeu_si128 (dst, data); +} + +static force_inline __m128i +load_32_1x128 (uint32_t data) +{ +    return _mm_cvtsi32_si128 (data); +} + +static force_inline __m128i +expand_alpha_rev_1x128 (__m128i data) +{ +    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline __m128i +expand_pixel_8_1x128 (uint8_t data) +{ +    return _mm_shufflelo_epi16 ( +	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); +} + +static force_inline __m128i +pix_multiply_1x128 (__m128i data, +		    __m128i alpha) +{ +    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), +					    mask_0080), +			    mask_0101); +} + +static force_inline __m128i +pix_add_multiply_1x128 (__m128i* src, +			__m128i* alpha_dst, +			__m128i* dst, +			__m128i* alpha_src) +{ +    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); +    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src); + +    return _mm_adds_epu8 (t1, t2); +} + +static force_inline __m128i +negate_1x128 (__m128i data) +{ +    return _mm_xor_si128 (data, mask_00ff); +} + +static force_inline __m128i +invert_colors_1x128 (__m128i data) +{ +    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); +} + +static force_inline __m128i +over_1x128 (__m128i src, __m128i alpha, __m128i dst) +{ +    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); +} + +static force_inline __m128i +in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst) +{ +    return over_1x128 (pix_multiply_1x128 (*src, *mask), +		       pix_multiply_1x128 (*alpha, *mask), +		       *dst); +} + +static force_inline __m128i +over_rev_non_pre_1x128 (__m128i src, __m128i dst) +{ +    __m128i alpha = expand_alpha_1x128 (src); + +    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), +					   _mm_or_si128 (alpha, mask_alpha)), +		       alpha, +		       dst); +} + +static force_inline uint32_t +pack_1x128_32 (__m128i data) +{ +    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ())); +} + +static force_inline __m128i +expand565_16_1x128 (uint16_t pixel) +{ +    __m128i m = _mm_cvtsi32_si128 (pixel); + +    m = unpack_565_to_8888 (m); + +    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); +} + +static force_inline uint32_t +core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) +{ +    uint8_t a; +    __m128i xmms; + +    a = src >> 24; + +    if (a == 0xff) +    { +	return src; +    } +    else if (src) +    { +	xmms = unpack_32_1x128 (src); +	return pack_1x128_32 ( +	    over_1x128 (xmms, expand_alpha_1x128 (xmms), +			unpack_32_1x128 (dst))); +    } + +    return dst; +} + +static force_inline uint32_t +combine1 (const uint32_t *ps, const uint32_t *pm) +{ +    uint32_t s; +    memcpy(&s, ps, sizeof(uint32_t)); + +    if (pm) +    { +	__m128i ms, mm; + +	mm = unpack_32_1x128 (*pm); +	mm = expand_alpha_1x128 (mm); + +	ms = unpack_32_1x128 (s); +	ms = pix_multiply_1x128 (ms, mm); + +	s = pack_1x128_32 (ms); +    } + +    return s; +} + +static force_inline __m128i +combine4 (const __m128i *ps, const __m128i *pm) +{ +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_msk_lo, xmm_msk_hi; +    __m128i s; + +    if (pm) +    { +	xmm_msk_lo = load_128_unaligned (pm); + +	if (is_transparent (xmm_msk_lo)) +	    return _mm_setzero_si128 (); +    } + +    s = load_128_unaligned (ps); + +    if (pm) +    { +	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); +	
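/* apply the mask: expand its alpha across all four channels and multiply it into the source (component-wise IN) */ +	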
unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); + +	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); + +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_msk_lo, &xmm_msk_hi, +			    &xmm_src_lo, &xmm_src_hi); + +	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); +    } + +    return s; +} + +static force_inline void +core_combine_over_u_sse2_mask (uint32_t *	  pd, +			       const uint32_t*    ps, +			       const uint32_t*    pm, +			       int                w) +{ +    uint32_t s, d; + +    /* Align dst on a 16-byte boundary */ +    while (w && ((uintptr_t)pd & 15)) +    { +	d = *pd; +	s = combine1 (ps, pm); + +	if (s) +	    *pd = core_combine_over_u_pixel_sse2 (s, d); +	pd++; +	ps++; +	pm++; +	w--; +    } + +    while (w >= 4) +    { +	__m128i mask = load_128_unaligned ((__m128i *)pm); + +	if (!is_zero (mask)) +	{ +	    __m128i src; +	    __m128i src_hi, src_lo; +	    __m128i mask_hi, mask_lo; +	    __m128i alpha_hi, alpha_lo; + +	    src = load_128_unaligned ((__m128i *)ps); + +	    if (is_opaque (_mm_and_si128 (src, mask))) +	    { +		save_128_aligned ((__m128i *)pd, src); +	    } +	    else +	    { +		__m128i dst = load_128_aligned ((__m128i *)pd); +		__m128i dst_hi, dst_lo; + +		unpack_128_2x128 (mask, &mask_lo, &mask_hi); +		unpack_128_2x128 (src, &src_lo, &src_hi); + +		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi); +		pix_multiply_2x128 (&src_lo, &src_hi, +				    &mask_lo, &mask_hi, +				    &src_lo, &src_hi); + +		unpack_128_2x128 (dst, &dst_lo, &dst_hi); + +		expand_alpha_2x128 (src_lo, src_hi, +				    &alpha_lo, &alpha_hi); + +		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, +			    &dst_lo, &dst_hi); + +		save_128_aligned ( +		    (__m128i *)pd, +		    pack_2x128_128 (dst_lo, dst_hi)); +	    } +	} + +	pm += 4; +	ps += 4; +	pd += 4; +	w -= 4; +    } +    while (w) +    { +	d = *pd; +	s = combine1 (ps, pm); + +	if (s) +	    *pd = core_combine_over_u_pixel_sse2 (s, d); +	pd++; +	ps++; +	pm++; + +	w--; +    } +} + +static force_inline void +core_combine_over_u_sse2_no_mask (uint32_t *	  pd, +				  const uint32_t*    ps, +				  int                w) +{ +    uint32_t s, d; + +    /* Align dst on a 16-byte boundary */ +    while (w && ((uintptr_t)pd & 15)) +    { +	d = *pd; +	s = *ps; + +	if (s) +	    *pd = core_combine_over_u_pixel_sse2 (s, d); +	pd++; +	ps++; +	w--; +    } + +    while (w >= 4) +    { +	__m128i src; +	__m128i src_hi, src_lo, dst_hi, dst_lo; +	__m128i alpha_hi, alpha_lo; + +	src = load_128_unaligned ((__m128i *)ps); + +	if (!is_zero (src)) +	{ +	    if (is_opaque (src)) +	    { +		save_128_aligned ((__m128i *)pd, src); +	    } +	    else +	    { +		__m128i dst = load_128_aligned ((__m128i *)pd); + +		unpack_128_2x128 (src, &src_lo, &src_hi); +		unpack_128_2x128 (dst, &dst_lo, &dst_hi); + +		expand_alpha_2x128 (src_lo, src_hi, +				    &alpha_lo, &alpha_hi); +		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, +			    &dst_lo, &dst_hi); + +		save_128_aligned ( +		    (__m128i *)pd, +		    pack_2x128_128 (dst_lo, dst_hi)); +	    } +	} + +	ps += 4; +	pd += 4; +	w -= 4; +    } +    while (w) +    { +	d = *pd; +	s = *ps; + +	if (s) +	    *pd = core_combine_over_u_pixel_sse2 (s, d); +	pd++; +	ps++; + +	w--; +    } +} + +static force_inline void +sse2_combine_over_u (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               pd, +                     const uint32_t *         ps, +                     const uint32_t *         pm, +                  
   int                      w) +{ +    if (pm) +	core_combine_over_u_sse2_mask (pd, ps, pm, w); +    else +	core_combine_over_u_sse2_no_mask (pd, ps, w); +} + +static void +sse2_combine_over_reverse_u (pixman_implementation_t *imp, +                             pixman_op_t              op, +                             uint32_t *               pd, +                             const uint32_t *         ps, +                             const uint32_t *         pm, +                             int                      w) +{ +    uint32_t s, d; + +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_alpha_lo, xmm_alpha_hi; + +    /* Align dst on a 16-byte boundary */ +    while (w && +           ((uintptr_t)pd & 15)) +    { +	d = *pd; +	s = combine1 (ps, pm); + +	*pd++ = core_combine_over_u_pixel_sse2 (d, s); +	w--; +	ps++; +	if (pm) +	    pm++; +    } + +    while (w >= 4) +    { +	/* I'm loading unaligned because I'm not sure +	 * about the address alignment. +	 */ +	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); +	xmm_dst_hi = load_128_aligned ((__m128i*) pd); + +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi); + +	over_2x128 (&xmm_dst_lo, &xmm_dst_hi, +		    &xmm_alpha_lo, &xmm_alpha_hi, +		    &xmm_src_lo, &xmm_src_hi); + +	/* rebuid the 4 pixel data and save*/ +	save_128_aligned ((__m128i*)pd, +			  pack_2x128_128 (xmm_src_lo, xmm_src_hi)); + +	w -= 4; +	ps += 4; +	pd += 4; + +	if (pm) +	    pm += 4; +    } + +    while (w) +    { +	d = *pd; +	s = combine1 (ps, pm); + +	*pd++ = core_combine_over_u_pixel_sse2 (d, s); +	ps++; +	w--; +	if (pm) +	    pm++; +    } +} + +static force_inline uint32_t +core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) +{ +    uint32_t maska = src >> 24; + +    if (maska == 0) +    { +	return 0; +    } +    else if (maska != 0xff) +    { +	return pack_1x128_32 ( +	    pix_multiply_1x128 (unpack_32_1x128 (dst), +				expand_alpha_1x128 (unpack_32_1x128 (src)))); +    } + +    return dst; +} + +static void +sse2_combine_in_u (pixman_implementation_t *imp, +                   pixman_op_t              op, +                   uint32_t *               pd, +                   const uint32_t *         ps, +                   const uint32_t *         pm, +                   int                      w) +{ +    uint32_t s, d; + +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; + +    while (w && ((uintptr_t)pd & 15)) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_in_u_pixel_sse2 (d, s); +	w--; +	ps++; +	if (pm) +	    pm++; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned ((__m128i*) pd); +	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_dst_lo, &xmm_dst_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ((__m128i*)pd, +			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	w -= 4; +	if (pm) +	    pm += 4; +    } + +    while (w) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_in_u_pixel_sse2 (d, s); +	w--; +	ps++; +	if (pm) +	    pm++; +    } +} + +static void +sse2_combine_in_reverse_u 
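/* IN_REVERSE: dest = dst * src.a */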
(pixman_implementation_t *imp, +                           pixman_op_t              op, +                           uint32_t *               pd, +                           const uint32_t *         ps, +                           const uint32_t *         pm, +                           int                      w) +{ +    uint32_t s, d; + +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; + +    while (w && ((uintptr_t)pd & 15)) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_in_u_pixel_sse2 (s, d); +	ps++; +	w--; +	if (pm) +	    pm++; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned ((__m128i*) pd); +	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); + +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, +			    &xmm_src_lo, &xmm_src_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	w -= 4; +	if (pm) +	    pm += 4; +    } + +    while (w) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_in_u_pixel_sse2 (s, d); +	w--; +	ps++; +	if (pm) +	    pm++; +    } +} + +static void +sse2_combine_out_reverse_u (pixman_implementation_t *imp, +                            pixman_op_t              op, +                            uint32_t *               pd, +                            const uint32_t *         ps, +                            const uint32_t *         pm, +                            int                      w) +{ +    while (w && ((uintptr_t)pd & 15)) +    { +	uint32_t s = combine1 (ps, pm); +	uint32_t d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		unpack_32_1x128 (d), negate_1x128 ( +		    expand_alpha_1x128 (unpack_32_1x128 (s))))); + +	if (pm) +	    pm++; +	ps++; +	w--; +    } + +    while (w >= 4) +    { +	__m128i xmm_src_lo, xmm_src_hi; +	__m128i xmm_dst_lo, xmm_dst_hi; + +	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); +	xmm_dst_hi = load_128_aligned ((__m128i*) pd); + +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, +			    &xmm_src_lo, &xmm_src_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	if (pm) +	    pm += 4; + +	w -= 4; +    } + +    while (w) +    { +	uint32_t s = combine1 (ps, pm); +	uint32_t d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		unpack_32_1x128 (d), negate_1x128 ( +		    expand_alpha_1x128 (unpack_32_1x128 (s))))); +	ps++; +	if (pm) +	    pm++; +	w--; +    } +} + +static void +sse2_combine_out_u (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               pd, +                    const uint32_t *         ps, +                    const uint32_t *         pm, +                    int                      w) +{ +    while (w && ((uintptr_t)pd & 15)) +    { +	uint32_t s = combine1 (ps, pm); +	uint32_t d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		unpack_32_1x128 (s), negate_1x128 ( +		    expand_alpha_1x128 
(unpack_32_1x128 (d))))); +	w--; +	ps++; +	if (pm) +	    pm++; +    } + +    while (w >= 4) +    { +	__m128i xmm_src_lo, xmm_src_hi; +	__m128i xmm_dst_lo, xmm_dst_hi; + +	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); +	xmm_dst_hi = load_128_aligned ((__m128i*) pd); + +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_dst_lo, &xmm_dst_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	w -= 4; +	if (pm) +	    pm += 4; +    } + +    while (w) +    { +	uint32_t s = combine1 (ps, pm); +	uint32_t d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		unpack_32_1x128 (s), negate_1x128 ( +		    expand_alpha_1x128 (unpack_32_1x128 (d))))); +	w--; +	ps++; +	if (pm) +	    pm++; +    } +} + +static force_inline uint32_t +core_combine_atop_u_pixel_sse2 (uint32_t src, +                                uint32_t dst) +{ +    __m128i s = unpack_32_1x128 (src); +    __m128i d = unpack_32_1x128 (dst); + +    __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); +    __m128i da = expand_alpha_1x128 (d); + +    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); +} + +static void +sse2_combine_atop_u (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               pd, +                     const uint32_t *         ps, +                     const uint32_t *         pm, +                     int                      w) +{ +    uint32_t s, d; + +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + +    while (w && ((uintptr_t)pd & 15)) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_atop_u_pixel_sse2 (s, d); +	w--; +	ps++; +	if (pm) +	    pm++; +    } + +    while (w >= 4) +    { +	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); +	xmm_dst_hi = load_128_aligned ((__m128i*) pd); + +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi); +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + +	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, +		      &xmm_alpha_src_lo, &xmm_alpha_src_hi); + +	pix_add_multiply_2x128 ( +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, +	    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	w -= 4; +	if (pm) +	    pm += 4; +    } + +    while (w) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_atop_u_pixel_sse2 (s, d); +	w--; +	ps++; +	if (pm) +	    pm++; +    } +} + +static force_inline uint32_t +core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, +                                        uint32_t dst) +{ +    __m128i s = unpack_32_1x128 (src); +    __m128i d = unpack_32_1x128 (dst); + +    __m128i sa = expand_alpha_1x128 (s); +    __m128i da = negate_1x128 (expand_alpha_1x128 (d)); + +    
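/* ATOP_REVERSE: dest = src * (1 - dst.a) + dst * src.a */ +    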
return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); +} + +static void +sse2_combine_atop_reverse_u (pixman_implementation_t *imp, +                             pixman_op_t              op, +                             uint32_t *               pd, +                             const uint32_t *         ps, +                             const uint32_t *         pm, +                             int                      w) +{ +    uint32_t s, d; + +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + +    while (w && ((uintptr_t)pd & 15)) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); +	ps++; +	w--; +	if (pm) +	    pm++; +    } + +    while (w >= 4) +    { +	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); +	xmm_dst_hi = load_128_aligned ((__m128i*) pd); + +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi); +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + +	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, +		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + +	pix_add_multiply_2x128 ( +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, +	    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	w -= 4; +	if (pm) +	    pm += 4; +    } + +    while (w) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); +	ps++; +	w--; +	if (pm) +	    pm++; +    } +} + +static force_inline uint32_t +core_combine_xor_u_pixel_sse2 (uint32_t src, +                               uint32_t dst) +{ +    __m128i s = unpack_32_1x128 (src); +    __m128i d = unpack_32_1x128 (dst); + +    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d)); +    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); + +    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); +} + +static void +sse2_combine_xor_u (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dst, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    int w = width; +    uint32_t s, d; +    uint32_t* pd = dst; +    const uint32_t* ps = src; +    const uint32_t* pm = mask; + +    __m128i xmm_src, xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + +    while (w && ((uintptr_t)pd & 15)) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_xor_u_pixel_sse2 (s, d); +	w--; +	ps++; +	if (pm) +	    pm++; +    } + +    while (w >= 4) +    { +	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); +	xmm_dst = load_128_aligned ((__m128i*) pd); + +	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi); +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + +	
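/* XOR: dest = src * (1 - dst.a) + dst * (1 - src.a); negate both expanded alphas before the add-multiply */ +	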
negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, +		      &xmm_alpha_src_lo, &xmm_alpha_src_hi); +	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, +		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + +	pix_add_multiply_2x128 ( +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, +	    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	w -= 4; +	if (pm) +	    pm += 4; +    } + +    while (w) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_xor_u_pixel_sse2 (s, d); +	w--; +	ps++; +	if (pm) +	    pm++; +    } +} + +static force_inline void +sse2_combine_add_u (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dst, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    int w = width; +    uint32_t s, d; +    uint32_t* pd = dst; +    const uint32_t* ps = src; +    const uint32_t* pm = mask; + +    while (w && (uintptr_t)pd & 15) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	ps++; +	if (pm) +	    pm++; +	*pd++ = _mm_cvtsi128_si32 ( +	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); +	w--; +    } + +    while (w >= 4) +    { +	__m128i s; + +	s = combine4 ((__m128i*)ps, (__m128i*)pm); + +	save_128_aligned ( +	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd))); + +	pd += 4; +	ps += 4; +	if (pm) +	    pm += 4; +	w -= 4; +    } + +    while (w--) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	ps++; +	*pd++ = _mm_cvtsi128_si32 ( +	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); +	if (pm) +	    pm++; +    } +} + +static force_inline uint32_t +core_combine_saturate_u_pixel_sse2 (uint32_t src, +                                    uint32_t dst) +{ +    __m128i ms = unpack_32_1x128 (src); +    __m128i md = unpack_32_1x128 (dst); +    uint32_t sa = src >> 24; +    uint32_t da = ~dst >> 24; + +    if (sa > da) +    { +	ms = pix_multiply_1x128 ( +	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); +    } + +    return pack_1x128_32 (_mm_adds_epu16 (md, ms)); +} + +static void +sse2_combine_saturate_u (pixman_implementation_t *imp, +                         pixman_op_t              op, +                         uint32_t *               pd, +                         const uint32_t *         ps, +                         const uint32_t *         pm, +                         int                      w) +{ +    uint32_t s, d; + +    uint32_t pack_cmp; +    __m128i xmm_src, xmm_dst; + +    while (w && (uintptr_t)pd & 15) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d); +	w--; +	ps++; +	if (pm) +	    pm++; +    } + +    while (w >= 4) +    { +	xmm_dst = load_128_aligned  ((__m128i*)pd); +	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); + +	pack_cmp = _mm_movemask_epi8 ( +	    _mm_cmpgt_epi32 ( +		_mm_srli_epi32 (xmm_src, 24), +		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); + +	/* if some alpha src is grater than respective ~alpha dst */ +	if (pack_cmp) +	{ +	    s = combine1 (ps++, pm); +	    d = *pd; +	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); +	    if (pm) +		pm++; + +	    s = combine1 (ps++, pm); +	    d = *pd; +	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); +	    if (pm) +		
pm++; + +	    s = combine1 (ps++, pm); +	    d = *pd; +	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); +	    if (pm) +		pm++; + +	    s = combine1 (ps++, pm); +	    d = *pd; +	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); +	    if (pm) +		pm++; +	} +	else +	{ +	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); + +	    pd += 4; +	    ps += 4; +	    if (pm) +		pm += 4; +	} + +	w -= 4; +    } + +    while (w--) +    { +	s = combine1 (ps, pm); +	d = *pd; + +	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d); +	ps++; +	if (pm) +	    pm++; +    } +} + +static void +sse2_combine_src_ca (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               pd, +                     const uint32_t *         ps, +                     const uint32_t *         pm, +                     int                      w) +{ +    uint32_t s, m; + +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); +	w--; +    } + +    while (w >= 4) +    { +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_mask_lo, &xmm_mask_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); +	w--; +    } +} + +static force_inline uint32_t +core_combine_over_ca_pixel_sse2 (uint32_t src, +                                 uint32_t mask, +                                 uint32_t dst) +{ +    __m128i s = unpack_32_1x128 (src); +    __m128i expAlpha = expand_alpha_1x128 (s); +    __m128i unpk_mask = unpack_32_1x128 (mask); +    __m128i unpk_dst  = unpack_32_1x128 (dst); + +    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); +} + +static void +sse2_combine_over_ca (pixman_implementation_t *imp, +                      pixman_op_t              op, +                      uint32_t *               pd, +                      const uint32_t *         ps, +                      const uint32_t *         pm, +                      int                      w) +{ +    uint32_t s, m, d; + +    __m128i xmm_alpha_lo, xmm_alpha_hi; +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); +	w--; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned ((__m128i*)pd); +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi); + +	in_over_2x128 (&xmm_src_lo, &xmm_src_hi, +		       &xmm_alpha_lo, &xmm_alpha_hi, +		    
   &xmm_mask_lo, &xmm_mask_hi, +		       &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); +	w--; +    } +} + +static force_inline uint32_t +core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, +                                         uint32_t mask, +                                         uint32_t dst) +{ +    __m128i d = unpack_32_1x128 (dst); + +    return pack_1x128_32 ( +	over_1x128 (d, expand_alpha_1x128 (d), +		    pix_multiply_1x128 (unpack_32_1x128 (src), +					unpack_32_1x128 (mask)))); +} + +static void +sse2_combine_over_reverse_ca (pixman_implementation_t *imp, +                              pixman_op_t              op, +                              uint32_t *               pd, +                              const uint32_t *         ps, +                              const uint32_t *         pm, +                              int                      w) +{ +    uint32_t s, m, d; + +    __m128i xmm_alpha_lo, xmm_alpha_hi; +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); +	w--; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned ((__m128i*)pd); +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi); +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_mask_lo, &xmm_mask_hi, +			    &xmm_mask_lo, &xmm_mask_hi); + +	over_2x128 (&xmm_dst_lo, &xmm_dst_hi, +		    &xmm_alpha_lo, &xmm_alpha_hi, +		    &xmm_mask_lo, &xmm_mask_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); +	w--; +    } +} + +static void +sse2_combine_in_ca (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               pd, +                    const uint32_t *         ps, +                    const uint32_t *         pm, +                    int                      w) +{ +    uint32_t s, m, d; + +    __m128i xmm_alpha_lo, xmm_alpha_hi; +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), +		expand_alpha_1x128 (unpack_32_1x128 (d)))); + +	w--; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned ((__m128i*)pd); +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	
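/* IN (component alpha): dest = (src * mask) * dst.a per channel */ +	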
expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi); + +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_mask_lo, &xmm_mask_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		pix_multiply_1x128 ( +		    unpack_32_1x128 (s), unpack_32_1x128 (m)), +		expand_alpha_1x128 (unpack_32_1x128 (d)))); + +	w--; +    } +} + +static void +sse2_combine_in_reverse_ca (pixman_implementation_t *imp, +                            pixman_op_t              op, +                            uint32_t *               pd, +                            const uint32_t *         ps, +                            const uint32_t *         pm, +                            int                      w) +{ +    uint32_t s, m, d; + +    __m128i xmm_alpha_lo, xmm_alpha_hi; +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		unpack_32_1x128 (d), +		pix_multiply_1x128 (unpack_32_1x128 (m), +				   expand_alpha_1x128 (unpack_32_1x128 (s))))); +	w--; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned ((__m128i*)pd); +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi); +	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi); + +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		unpack_32_1x128 (d), +		pix_multiply_1x128 (unpack_32_1x128 (m), +				   expand_alpha_1x128 (unpack_32_1x128 (s))))); +	w--; +    } +} + +static void +sse2_combine_out_ca (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               pd, +                     const uint32_t *         ps, +                     const uint32_t *         pm, +                     int                      w) +{ +    uint32_t s, m, d; + +    __m128i xmm_alpha_lo, xmm_alpha_hi; +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		pix_multiply_1x128 ( +		    unpack_32_1x128 (s), unpack_32_1x128 (m)), +		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); +	w--; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned ((__m128i*)pd); +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	
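/* OUT (component alpha): dest = (src * mask) * (1 - dst.a) per channel */ +	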
xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi); +	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, +		      &xmm_alpha_lo, &xmm_alpha_hi); + +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_mask_lo, &xmm_mask_hi, +			    &xmm_dst_lo, &xmm_dst_hi); +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		pix_multiply_1x128 ( +		    unpack_32_1x128 (s), unpack_32_1x128 (m)), +		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); + +	w--; +    } +} + +static void +sse2_combine_out_reverse_ca (pixman_implementation_t *imp, +                             pixman_op_t              op, +                             uint32_t *               pd, +                             const uint32_t *         ps, +                             const uint32_t *         pm, +                             int                      w) +{ +    uint32_t s, m, d; + +    __m128i xmm_alpha_lo, xmm_alpha_hi; +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		unpack_32_1x128 (d), +		negate_1x128 (pix_multiply_1x128 ( +				 unpack_32_1x128 (m), +				 expand_alpha_1x128 (unpack_32_1x128 (s)))))); +	w--; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned ((__m128i*)pd); +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi); + +	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, +			    &xmm_alpha_lo, &xmm_alpha_hi, +			    &xmm_mask_lo, &xmm_mask_hi); + +	negate_2x128 (xmm_mask_lo, xmm_mask_hi, +		      &xmm_mask_lo, &xmm_mask_hi); + +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, +			    &xmm_mask_lo, &xmm_mask_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = pack_1x128_32 ( +	    pix_multiply_1x128 ( +		unpack_32_1x128 (d), +		negate_1x128 (pix_multiply_1x128 ( +				 unpack_32_1x128 (m), +				 expand_alpha_1x128 (unpack_32_1x128 (s)))))); +	w--; +    } +} + +static force_inline uint32_t +core_combine_atop_ca_pixel_sse2 (uint32_t src, +                                 uint32_t mask, +                                 uint32_t dst) +{ +    __m128i m = unpack_32_1x128 (mask); +    __m128i s = unpack_32_1x128 (src); +    __m128i d = unpack_32_1x128 (dst); +    __m128i sa = expand_alpha_1x128 (s); +    __m128i da = expand_alpha_1x128 (d); + +    s = pix_multiply_1x128 (s, m); +    m = negate_1x128 (pix_multiply_1x128 (m, 
sa)); + +    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); +} + +static void +sse2_combine_atop_ca (pixman_implementation_t *imp, +                      pixman_op_t              op, +                      uint32_t *               pd, +                      const uint32_t *         ps, +                      const uint32_t *         pm, +                      int                      w) +{ +    uint32_t s, m, d; + +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); +	w--; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned ((__m128i*)pd); +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi); +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_mask_lo, &xmm_mask_hi, +			    &xmm_src_lo, &xmm_src_hi); +	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi, +			    &xmm_mask_lo, &xmm_mask_hi); + +	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	pix_add_multiply_2x128 ( +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, +	    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); +	w--; +    } +} + +static force_inline uint32_t +core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, +                                         uint32_t mask, +                                         uint32_t dst) +{ +    __m128i m = unpack_32_1x128 (mask); +    __m128i s = unpack_32_1x128 (src); +    __m128i d = unpack_32_1x128 (dst); + +    __m128i da = negate_1x128 (expand_alpha_1x128 (d)); +    __m128i sa = expand_alpha_1x128 (s); + +    s = pix_multiply_1x128 (s, m); +    m = pix_multiply_1x128 (m, sa); + +    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); +} + +static void +sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, +                              pixman_op_t              op, +                              uint32_t *               pd, +                              const uint32_t *         ps, +                              const uint32_t *         pm, +                              int                      w) +{ +    uint32_t s, m, d; + +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); +	w--; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned 
((__m128i*)pd); +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi); +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_mask_lo, &xmm_mask_hi, +			    &xmm_src_lo, &xmm_src_hi); +	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi, +			    &xmm_mask_lo, &xmm_mask_hi); + +	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, +		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + +	pix_add_multiply_2x128 ( +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, +	    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); +	w--; +    } +} + +static force_inline uint32_t +core_combine_xor_ca_pixel_sse2 (uint32_t src, +                                uint32_t mask, +                                uint32_t dst) +{ +    __m128i a = unpack_32_1x128 (mask); +    __m128i s = unpack_32_1x128 (src); +    __m128i d = unpack_32_1x128 (dst); + +    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( +				       a, expand_alpha_1x128 (s))); +    __m128i dest      = pix_multiply_1x128 (s, a); +    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); + +    return pack_1x128_32 (pix_add_multiply_1x128 (&d, +                                                &alpha_dst, +                                                &dest, +                                                &alpha_src)); +} + +static void +sse2_combine_xor_ca (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               pd, +                     const uint32_t *         ps, +                     const uint32_t *         pm, +                     int                      w) +{ +    uint32_t s, m, d; + +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); +	w--; +    } + +    while (w >= 4) +    { +	xmm_dst_hi = load_128_aligned ((__m128i*)pd); +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi); +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_mask_lo, &xmm_mask_hi, +			    &xmm_src_lo, &xmm_src_hi); +	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi, +			    
&xmm_mask_lo, &xmm_mask_hi); + +	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, +		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); +	negate_2x128 (xmm_mask_lo, xmm_mask_hi, +		      &xmm_mask_lo, &xmm_mask_hi); + +	pix_add_multiply_2x128 ( +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, +	    &xmm_dst_lo, &xmm_dst_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); +	w--; +    } +} + +static void +sse2_combine_add_ca (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               pd, +                     const uint32_t *         ps, +                     const uint32_t *         pm, +                     int                      w) +{ +    uint32_t s, m, d; + +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask_lo, xmm_mask_hi; + +    while (w && (uintptr_t)pd & 15) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = pack_1x128_32 ( +	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), +					       unpack_32_1x128 (m)), +			   unpack_32_1x128 (d))); +	w--; +    } + +    while (w >= 4) +    { +	xmm_src_hi = load_128_unaligned ((__m128i*)ps); +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm); +	xmm_dst_hi = load_128_aligned ((__m128i*)pd); + +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +			    &xmm_mask_lo, &xmm_mask_hi, +			    &xmm_src_lo, &xmm_src_hi); + +	save_128_aligned ( +	    (__m128i*)pd, pack_2x128_128 ( +		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), +		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); + +	ps += 4; +	pd += 4; +	pm += 4; +	w -= 4; +    } + +    while (w) +    { +	s = *ps++; +	m = *pm++; +	d = *pd; + +	*pd++ = pack_1x128_32 ( +	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), +					       unpack_32_1x128 (m)), +			   unpack_32_1x128 (d))); +	w--; +    } +} + +static force_inline __m128i +create_mask_16_128 (uint16_t mask) +{ +    return _mm_set1_epi16 (mask); +} + +/* Work around a code generation bug in Sun Studio 12. 
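For that compiler, create_mask_2x32_128 is defined as a macro rather than a force_inline function.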
*/ +#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) +# define create_mask_2x32_128(mask0, mask1)				\ +    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) +#else +static force_inline __m128i +create_mask_2x32_128 (uint32_t mask0, +                      uint32_t mask1) +{ +    return _mm_set_epi32 (mask0, mask1, mask0, mask1); +} +#endif + +static void +sse2_composite_over_n_8888 (pixman_implementation_t *imp, +                            pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint32_t    *dst_line, *dst, d; +    int32_t w; +    int dst_stride; +    __m128i xmm_src, xmm_alpha; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + +    xmm_src = expand_pixel_32_1x128 (src); +    xmm_alpha = expand_alpha_1x128 (xmm_src); + +    while (height--) +    { +	dst = dst_line; + +	dst_line += dst_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    d = *dst; +	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src, +						xmm_alpha, +						unpack_32_1x128 (d))); +	    w--; +	} + +	while (w >= 4) +	{ +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +	    over_2x128 (&xmm_src, &xmm_src, +			&xmm_alpha, &xmm_alpha, +			&xmm_dst_lo, &xmm_dst_hi); + +	    /* rebuid the 4 pixel data and save*/ +	    save_128_aligned ( +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	    w -= 4; +	    dst += 4; +	} + +	while (w) +	{ +	    d = *dst; +	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src, +						xmm_alpha, +						unpack_32_1x128 (d))); +	    w--; +	} + +    } +} + +static void +sse2_composite_over_n_0565 (pixman_implementation_t *imp, +                            pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint16_t    *dst_line, *dst, d; +    int32_t w; +    int dst_stride; +    __m128i xmm_src, xmm_alpha; +    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + +    xmm_src = expand_pixel_32_1x128 (src); +    xmm_alpha = expand_alpha_1x128 (xmm_src); + +    while (height--) +    { +	dst = dst_line; + +	dst_line += dst_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    d = *dst; + +	    *dst++ = pack_565_32_16 ( +		pack_1x128_32 (over_1x128 (xmm_src, +					   xmm_alpha, +					   expand565_16_1x128 (d)))); +	    w--; +	} + +	while (w >= 8) +	{ +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    unpack_565_128_4x128 (xmm_dst, +				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + +	    over_2x128 (&xmm_src, &xmm_src, +			&xmm_alpha, &xmm_alpha, +			&xmm_dst0, &xmm_dst1); +	    over_2x128 (&xmm_src, &xmm_src, +			&xmm_alpha, &xmm_alpha, +			&xmm_dst2, &xmm_dst3); + +	    xmm_dst = pack_565_4x128_128 ( +		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + +	    save_128_aligned ((__m128i*)dst, xmm_dst); + +	    dst += 8; +	    w -= 8; +	} + +	while (w--) +	{ +	    d = *dst; +	    *dst++ = pack_565_32_16 ( +		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, +					   expand565_16_1x128 (d)))); +	} +    } + +} + +static void +sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, +				   
pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint32_t    *dst_line, d; +    uint32_t    *mask_line, m; +    uint32_t pack_cmp; +    int dst_stride, mask_stride; + +    __m128i xmm_src; +    __m128i xmm_dst; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + +    __m128i mmx_src, mmx_mask, mmx_dest; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + +    xmm_src = _mm_unpacklo_epi8 ( +	create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); +    mmx_src   = xmm_src; + +    while (height--) +    { +	int w = width; +	const uint32_t *pm = (uint32_t *)mask_line; +	uint32_t *pd = (uint32_t *)dst_line; + +	dst_line += dst_stride; +	mask_line += mask_stride; + +	while (w && (uintptr_t)pd & 15) +	{ +	    m = *pm++; + +	    if (m) +	    { +		d = *pd; + +		mmx_mask = unpack_32_1x128 (m); +		mmx_dest = unpack_32_1x128 (d); + +		*pd = pack_1x128_32 ( +		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), +				   mmx_dest)); +	    } + +	    pd++; +	    w--; +	} + +	while (w >= 4) +	{ +	    xmm_mask = load_128_unaligned ((__m128i*)pm); + +	    pack_cmp = +		_mm_movemask_epi8 ( +		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + +	    /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ +	    if (pack_cmp != 0xffff) +	    { +		xmm_dst = load_128_aligned ((__m128i*)pd); + +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + +		pix_multiply_2x128 (&xmm_src, &xmm_src, +				    &xmm_mask_lo, &xmm_mask_hi, +				    &xmm_mask_lo, &xmm_mask_hi); +		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); + +		save_128_aligned ( +		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); +	    } + +	    pd += 4; +	    pm += 4; +	    w -= 4; +	} + +	while (w) +	{ +	    m = *pm++; + +	    if (m) +	    { +		d = *pd; + +		mmx_mask = unpack_32_1x128 (m); +		mmx_dest = unpack_32_1x128 (d); + +		*pd = pack_1x128_32 ( +		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), +				   mmx_dest)); +	    } + +	    pd++; +	    w--; +	} +    } + +} + +static void +sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, +                                    pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint32_t    *dst_line, d; +    uint32_t    *mask_line, m; +    uint32_t pack_cmp; +    int dst_stride, mask_stride; + +    __m128i xmm_src, xmm_alpha; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + +    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + +    xmm_src = _mm_unpacklo_epi8 ( +	create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); +    xmm_alpha = expand_alpha_1x128 (xmm_src); +    mmx_src   = xmm_src; +    mmx_alpha = xmm_alpha; + +    while (height--) +    { +	int w = width; +	const uint32_t *pm = (uint32_t *)mask_line; +	uint32_t *pd = (uint32_t *)dst_line; + +	dst_line += dst_stride; +	mask_line += mask_stride; + +	while (w && (uintptr_t)pd & 15) +	{ +	    m = *pm++; + +	  
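/* A zero mask leaves dst unchanged under OVER, so skip the pixel */ +	  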
  if (m) +	    { +		d = *pd; +		mmx_mask = unpack_32_1x128 (m); +		mmx_dest = unpack_32_1x128 (d); + +		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src, +		                                  &mmx_alpha, +		                                  &mmx_mask, +		                                  &mmx_dest)); +	    } + +	    pd++; +	    w--; +	} + +	while (w >= 4) +	{ +	    xmm_mask = load_128_unaligned ((__m128i*)pm); + +	    pack_cmp = +		_mm_movemask_epi8 ( +		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + +	    /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ +	    if (pack_cmp != 0xffff) +	    { +		xmm_dst = load_128_aligned ((__m128i*)pd); + +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +		in_over_2x128 (&xmm_src, &xmm_src, +			       &xmm_alpha, &xmm_alpha, +			       &xmm_mask_lo, &xmm_mask_hi, +			       &xmm_dst_lo, &xmm_dst_hi); + +		save_128_aligned ( +		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	    } + +	    pd += 4; +	    pm += 4; +	    w -= 4; +	} + +	while (w) +	{ +	    m = *pm++; + +	    if (m) +	    { +		d = *pd; +		mmx_mask = unpack_32_1x128 (m); +		mmx_dest = unpack_32_1x128 (d); + +		*pd = pack_1x128_32 ( +		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); +	    } + +	    pd++; +	    w--; +	} +    } + +} + +static void +sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, +                                 pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    uint32_t mask; +    int32_t w; +    int dst_stride, src_stride; + +    __m128i xmm_mask; +    __m128i xmm_src, xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_alpha_lo, xmm_alpha_hi; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); + +    xmm_mask = create_mask_16_128 (mask >> 24); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    uint32_t s = *src++; + +	    if (s) +	    { +		uint32_t d = *dst; +		 +		__m128i ms = unpack_32_1x128 (s); +		__m128i alpha    = expand_alpha_1x128 (ms); +		__m128i dest     = xmm_mask; +		__m128i alpha_dst = unpack_32_1x128 (d); +		 +		*dst = pack_1x128_32 ( +		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); +	    } +	    dst++; +	    w--; +	} + +	while (w >= 4) +	{ +	    xmm_src = load_128_unaligned ((__m128i*)src); + +	    if (!is_zero (xmm_src)) +	    { +		xmm_dst = load_128_aligned ((__m128i*)dst); +		 +		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); +		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +				    &xmm_alpha_lo, &xmm_alpha_hi); +		 +		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, +			       &xmm_alpha_lo, &xmm_alpha_hi, +			       &xmm_mask, &xmm_mask, +			       &xmm_dst_lo, &xmm_dst_hi); +		 +		save_128_aligned ( +		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	    } +		 +	    dst += 4; +	    src += 4; +	    w -= 4; +	} + +	while (w) +	{ +	    uint32_t s = *src++; + +	    if (s) +	    { +		uint32_t d = *dst; +		 +		__m128i ms = unpack_32_1x128 (s); +		__m128i alpha = expand_alpha_1x128 (ms); +		__m128i 
mask  = xmm_mask; +		__m128i dest  = unpack_32_1x128 (d); +		 +		*dst = pack_1x128_32 ( +		    in_over_1x128 (&ms, &alpha, &mask, &dest)); +	    } + +	    dst++; +	    w--; +	} +    } + +} + +static void +sse2_composite_src_x888_0565 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint16_t    *dst_line, *dst; +    uint32_t    *src_line, *src, s; +    int dst_stride, src_stride; +    int32_t w; + +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    s = *src++; +	    *dst = convert_8888_to_0565 (s); +	    dst++; +	    w--; +	} + +	while (w >= 8) +	{ +	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0); +	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1); + +	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1)); + +	    w -= 8; +	    src += 8; +	    dst += 8; +	} + +	while (w) +	{ +	    s = *src++; +	    *dst = convert_8888_to_0565 (s); +	    dst++; +	    w--; +	} +    } +} + +static void +sse2_composite_src_x888_8888 (pixman_implementation_t *imp, +			      pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    int32_t w; +    int dst_stride, src_stride; + + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    *dst++ = *src++ | 0xff000000; +	    w--; +	} + +	while (w >= 16) +	{ +	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; +	     +	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0); +	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1); +	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2); +	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3); +	     +	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); +	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); +	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); +	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); +	     +	    dst += 16; +	    src += 16; +	    w -= 16; +	} + +	while (w) +	{ +	    *dst++ = *src++ | 0xff000000; +	    w--; +	} +    } + +} + +static void +sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, +                                 pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    uint32_t mask; +    int dst_stride, src_stride; +    int32_t w; + +    __m128i xmm_mask, xmm_alpha; +    __m128i xmm_src, xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); + +    xmm_mask = 
create_mask_16_128 (mask >> 24); +    xmm_alpha = mask_00ff; + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    uint32_t s = (*src++) | 0xff000000; +	    uint32_t d = *dst; + +	    __m128i src   = unpack_32_1x128 (s); +	    __m128i alpha = xmm_alpha; +	    __m128i mask  = xmm_mask; +	    __m128i dest  = unpack_32_1x128 (d); + +	    *dst++ = pack_1x128_32 ( +		in_over_1x128 (&src, &alpha, &mask, &dest)); + +	    w--; +	} + +	while (w >= 4) +	{ +	    xmm_src = _mm_or_si128 ( +		load_128_unaligned ((__m128i*)src), mask_ff000000); +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, +			   &xmm_alpha, &xmm_alpha, +			   &xmm_mask, &xmm_mask, +			   &xmm_dst_lo, &xmm_dst_hi); + +	    save_128_aligned ( +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	    dst += 4; +	    src += 4; +	    w -= 4; + +	} + +	while (w) +	{ +	    uint32_t s = (*src++) | 0xff000000; +	    uint32_t d = *dst; + +	    __m128i src  = unpack_32_1x128 (s); +	    __m128i alpha = xmm_alpha; +	    __m128i mask  = xmm_mask; +	    __m128i dest  = unpack_32_1x128 (d); + +	    *dst++ = pack_1x128_32 ( +		in_over_1x128 (&src, &alpha, &mask, &dest)); + +	    w--; +	} +    } + +} + +static void +sse2_composite_over_8888_8888 (pixman_implementation_t *imp, +                               pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    int dst_stride, src_stride; +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    dst = dst_line; +    src = src_line; + +    while (height--) +    { +	sse2_combine_over_u (imp, op, dst, src, NULL, width); + +	dst += dst_stride; +	src += src_stride; +    } +} + +static force_inline uint16_t +composite_over_8888_0565pixel (uint32_t src, uint16_t dst) +{ +    __m128i ms; + +    ms = unpack_32_1x128 (src); +    return pack_565_32_16 ( +	pack_1x128_32 ( +	    over_1x128 ( +		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); +} + +static void +sse2_composite_over_8888_0565 (pixman_implementation_t *imp, +                               pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint16_t    *dst_line, *dst, d; +    uint32_t    *src_line, *src, s; +    int dst_stride, src_stride; +    int32_t w; + +    __m128i xmm_alpha_lo, xmm_alpha_hi; +    __m128i xmm_src, xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	dst = dst_line; +	src = src_line; + +	dst_line += dst_stride; +	src_line += src_stride; +	w = width; + +	/* Align dst on a 16-byte boundary */ +	while (w && +	       ((uintptr_t)dst & 15)) +	{ +	    s = *src++; +	    d = *dst; + +	    *dst++ = composite_over_8888_0565pixel (s, d); +	    w--; +	} + +	/* It's a 8 pixel loop */ +	while (w >= 8) +	{ +	    /* I'm loading unaligned because I'm not sure +	     * about the address alignment. 
+	     */ +	    xmm_src = load_128_unaligned ((__m128i*) src); +	    xmm_dst = load_128_aligned ((__m128i*) dst); + +	    /* Unpacking */ +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +	    unpack_565_128_4x128 (xmm_dst, +				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); +	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +				&xmm_alpha_lo, &xmm_alpha_hi); + +	    /* I'm loading next 4 pixels from memory +	     * before to optimze the memory read. +	     */ +	    xmm_src = load_128_unaligned ((__m128i*) (src + 4)); + +	    over_2x128 (&xmm_src_lo, &xmm_src_hi, +			&xmm_alpha_lo, &xmm_alpha_hi, +			&xmm_dst0, &xmm_dst1); + +	    /* Unpacking */ +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +				&xmm_alpha_lo, &xmm_alpha_hi); + +	    over_2x128 (&xmm_src_lo, &xmm_src_hi, +			&xmm_alpha_lo, &xmm_alpha_hi, +			&xmm_dst2, &xmm_dst3); + +	    save_128_aligned ( +		(__m128i*)dst, pack_565_4x128_128 ( +		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + +	    w -= 8; +	    dst += 8; +	    src += 8; +	} + +	while (w--) +	{ +	    s = *src++; +	    d = *dst; + +	    *dst++ = composite_over_8888_0565pixel (s, d); +	} +    } + +} + +static void +sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca; +    uint32_t *dst_line, *dst; +    uint8_t *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    uint32_t m, d; + +    __m128i xmm_src, xmm_alpha, xmm_def; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + +    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    xmm_def = create_mask_2x32_128 (src, src); +    xmm_src = expand_pixel_32_1x128 (src); +    xmm_alpha = expand_alpha_1x128 (xmm_src); +    mmx_src   = xmm_src; +    mmx_alpha = xmm_alpha; + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    uint8_t m = *mask++; + +	    if (m) +	    { +		d = *dst; +		mmx_mask = expand_pixel_8_1x128 (m); +		mmx_dest = unpack_32_1x128 (d); + +		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src, +		                                   &mmx_alpha, +		                                   &mmx_mask, +		                                   &mmx_dest)); +	    } + +	    w--; +	    dst++; +	} + +	while (w >= 4) +	{ +            memcpy(&m, mask, sizeof(uint32_t)); + +	    if (srca == 0xff && m == 0xffffffff) +	    { +		save_128_aligned ((__m128i*)dst, xmm_def); +	    } +	    else if (m) +	    { +		xmm_dst = load_128_aligned ((__m128i*) dst); +		xmm_mask = unpack_32_1x128 (m); +		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + +		/* Unpacking */ +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, +					&xmm_mask_lo, &xmm_mask_hi); + +		in_over_2x128 (&xmm_src, &xmm_src, +			       &xmm_alpha, &xmm_alpha, +			       &xmm_mask_lo, &xmm_mask_hi, +			       &xmm_dst_lo, &xmm_dst_hi); 
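The in_over_2x128 call just above computes (solid src IN a8 mask) OVER dest for four pixels at once: every source channel and the source alpha are first scaled by the 8-bit mask coverage, then the usual OVER is applied against the destination. A scalar sketch of the same composite, under the premultiplied-alpha convention and with an illustrative rounded /255 helper:

#include <stdint.h>

static uint8_t
mul_un8 (uint8_t a, uint8_t b)                   /* rounded (a * b) / 255 */
{
    uint16_t t = (uint16_t) a * b + 0x80;
    return (uint8_t) ((t + (t >> 8)) >> 8);
}

/* (solid src IN 8-bit mask) OVER dest, one a8r8g8b8 pixel */
static uint32_t
in_over_pixel (uint32_t src, uint8_t m, uint32_t dst)
{
    uint8_t  sa = mul_un8 ((uint8_t) (src >> 24), m);   /* masked source alpha */
    uint32_t result = 0;
    int      shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint8_t  s = mul_un8 ((uint8_t) (src >> shift), m);  /* src IN mask */
        uint8_t  d = (uint8_t) (dst >> shift);
        uint16_t c = s + mul_un8 (d, 0xff - sa);              /* OVER dest */

        result |= (uint32_t) (c > 0xff ? 0xff : c) << shift;
    }
    return result;
}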
+ +		save_128_aligned ( +		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	    } + +	    w -= 4; +	    dst += 4; +	    mask += 4; +	} + +	while (w) +	{ +	    uint8_t m = *mask++; + +	    if (m) +	    { +		d = *dst; +		mmx_mask = expand_pixel_8_1x128 (m); +		mmx_dest = unpack_32_1x128 (d); + +		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src, +		                                   &mmx_alpha, +		                                   &mmx_mask, +		                                   &mmx_dest)); +	    } + +	    w--; +	    dst++; +	} +    } + +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +static pixman_bool_t +sse2_fill (pixman_implementation_t *imp, +           uint32_t *               bits, +           int                      stride, +           int                      bpp, +           int                      x, +           int                      y, +           int                      width, +           int                      height, +           uint32_t		    filler) +{ +    uint32_t byte_width; +    uint8_t *byte_line; + +    __m128i xmm_def; + +    if (bpp == 8) +    { +	uint32_t b; +	uint32_t w; + +	stride = stride * (int) sizeof (uint32_t) / 1; +	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); +	byte_width = width; +	stride *= 1; + +	b = filler & 0xff; +	w = (b << 8) | b; +	filler = (w << 16) | w; +    } +    else if (bpp == 16) +    { +	stride = stride * (int) sizeof (uint32_t) / 2; +	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); +	byte_width = 2 * width; +	stride *= 2; + +        filler = (filler & 0xffff) * 0x00010001; +    } +    else if (bpp == 32) +    { +	stride = stride * (int) sizeof (uint32_t) / 4; +	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); +	byte_width = 4 * width; +	stride *= 4; +    } +    else +    { +	return FALSE; +    } + +    xmm_def = create_mask_2x32_128 (filler, filler); + +    while (height--) +    { +	int w; +	uint8_t *d = byte_line; +	byte_line += stride; +	w = byte_width; + +	if (w >= 1 && ((uintptr_t)d & 1)) +	{ +	    *(uint8_t *)d = filler; +	    w -= 1; +	    d += 1; +	} + +	while (w >= 2 && ((uintptr_t)d & 3)) +	{ +	    *(uint16_t *)d = filler; +	    w -= 2; +	    d += 2; +	} + +	while (w >= 4 && ((uintptr_t)d & 15)) +	{ +	    *(uint32_t *)d = filler; + +	    w -= 4; +	    d += 4; +	} + +	while (w >= 128) +	{ +	    save_128_aligned ((__m128i*)(d),     xmm_def); +	    save_128_aligned ((__m128i*)(d + 16),  xmm_def); +	    save_128_aligned ((__m128i*)(d + 32),  xmm_def); +	    save_128_aligned ((__m128i*)(d + 48),  xmm_def); +	    save_128_aligned ((__m128i*)(d + 64),  xmm_def); +	    save_128_aligned ((__m128i*)(d + 80),  xmm_def); +	    save_128_aligned ((__m128i*)(d + 96),  xmm_def); +	    save_128_aligned ((__m128i*)(d + 112), xmm_def); + +	    d += 128; +	    w -= 128; +	} + +	if (w >= 64) +	{ +	    save_128_aligned ((__m128i*)(d),     xmm_def); +	    save_128_aligned ((__m128i*)(d + 16),  xmm_def); +	    save_128_aligned ((__m128i*)(d + 32),  xmm_def); +	    save_128_aligned ((__m128i*)(d + 48),  xmm_def); + +	    d += 64; +	    w -= 64; +	} + +	if (w >= 32) +	{ +	    save_128_aligned ((__m128i*)(d),     xmm_def); +	    save_128_aligned ((__m128i*)(d + 16),  xmm_def); + +	    d += 32; +	    w -= 32; +	} + +	if (w >= 16) +	{ +	    save_128_aligned ((__m128i*)(d),     xmm_def); + +	    d += 16; +	    w -= 16; +	} + +	while (w >= 4) +	{ +	    *(uint32_t *)d = filler; + +	    w -= 4; +	    d += 4; +	} + +	if (w 
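sse2_fill above widens an 8- or 16-bpp filler into a full 32-bit pattern so that one 128-bit store can cover 16 or 8 pixels at a time; the widening itself is plain shifting and OR-ing. A small standalone sketch of that replication step (not pixman's exact helper):

#include <stdint.h>

/* Replicate a narrow filler value across a 32-bit pattern. */
static uint32_t
replicate_filler (uint32_t filler, int bpp)
{
    if (bpp == 8)
    {
        filler &= 0xff;
        filler |= filler << 8;                   /* 0x000000ab -> 0x0000abab */
        filler |= filler << 16;                  /* 0x0000abab -> 0xabababab */
    }
    else if (bpp == 16)
    {
        filler = (filler & 0xffff) * 0x00010001; /* 0x1234 -> 0x12341234 */
    }
    /* bpp == 32: already full width */
    return filler;
}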
>= 2) +	{ +	    *(uint16_t *)d = filler; +	    w -= 2; +	    d += 2; +	} + +	if (w >= 1) +	{ +	    *(uint8_t *)d = filler; +	    w -= 1; +	    d += 1; +	} +    } + +    return TRUE; +} + +static void +sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, +                             pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca; +    uint32_t    *dst_line, *dst; +    uint8_t     *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    uint32_t m; + +    __m128i xmm_src, xmm_def; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = src >> 24; +    if (src == 0) +    { +	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, +		   PIXMAN_FORMAT_BPP (dest_image->bits.format), +		   dest_x, dest_y, width, height, 0); +	return; +    } + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    xmm_def = create_mask_2x32_128 (src, src); +    xmm_src = expand_pixel_32_1x128 (src); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    uint8_t m = *mask++; + +	    if (m) +	    { +		*dst = pack_1x128_32 ( +		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); +	    } +	    else +	    { +		*dst = 0; +	    } + +	    w--; +	    dst++; +	} + +	while (w >= 4) +	{ +            memcpy(&m, mask, sizeof(uint32_t)); + +	    if (srca == 0xff && m == 0xffffffff) +	    { +		save_128_aligned ((__m128i*)dst, xmm_def); +	    } +	    else if (m) +	    { +		xmm_mask = unpack_32_1x128 (m); +		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + +		/* Unpacking */ +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, +					&xmm_mask_lo, &xmm_mask_hi); + +		pix_multiply_2x128 (&xmm_src, &xmm_src, +				    &xmm_mask_lo, &xmm_mask_hi, +				    &xmm_mask_lo, &xmm_mask_hi); + +		save_128_aligned ( +		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); +	    } +	    else +	    { +		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); +	    } + +	    w -= 4; +	    dst += 4; +	    mask += 4; +	} + +	while (w) +	{ +	    uint8_t m = *mask++; + +	    if (m) +	    { +		*dst = pack_1x128_32 ( +		    pix_multiply_1x128 ( +			xmm_src, expand_pixel_8_1x128 (m))); +	    } +	    else +	    { +		*dst = 0; +	    } + +	    w--; +	    dst++; +	} +    } + +} + +static void +sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint16_t    *dst_line, *dst, d; +    uint8_t     *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    uint32_t m; +    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + +    __m128i xmm_src, xmm_alpha; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; +    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    xmm_src = 
expand_pixel_32_1x128 (src); +    xmm_alpha = expand_alpha_1x128 (xmm_src); +    mmx_src = xmm_src; +    mmx_alpha = xmm_alpha; + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    m = *mask++; + +	    if (m) +	    { +		d = *dst; +		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); +		mmx_dest = expand565_16_1x128 (d); + +		*dst = pack_565_32_16 ( +		    pack_1x128_32 ( +			in_over_1x128 ( +			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); +	    } + +	    w--; +	    dst++; +	} + +	while (w >= 8) +	{ +	    xmm_dst = load_128_aligned ((__m128i*) dst); +	    unpack_565_128_4x128 (xmm_dst, +				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + +            memcpy(&m, mask, sizeof(uint32_t)); +	    mask += 4; + +	    if (m) +	    { +		xmm_mask = unpack_32_1x128 (m); +		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + +		/* Unpacking */ +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, +					&xmm_mask_lo, &xmm_mask_hi); + +		in_over_2x128 (&xmm_src, &xmm_src, +			       &xmm_alpha, &xmm_alpha, +			       &xmm_mask_lo, &xmm_mask_hi, +			       &xmm_dst0, &xmm_dst1); +	    } + +            memcpy(&m, mask, sizeof(uint32_t)); +	    mask += 4; + +	    if (m) +	    { +		xmm_mask = unpack_32_1x128 (m); +		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + +		/* Unpacking */ +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, +					&xmm_mask_lo, &xmm_mask_hi); +		in_over_2x128 (&xmm_src, &xmm_src, +			       &xmm_alpha, &xmm_alpha, +			       &xmm_mask_lo, &xmm_mask_hi, +			       &xmm_dst2, &xmm_dst3); +	    } + +	    save_128_aligned ( +		(__m128i*)dst, pack_565_4x128_128 ( +		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + +	    w -= 8; +	    dst += 8; +	} + +	while (w) +	{ +	    m = *mask++; + +	    if (m) +	    { +		d = *dst; +		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); +		mmx_dest = expand565_16_1x128 (d); + +		*dst = pack_565_32_16 ( +		    pack_1x128_32 ( +			in_over_1x128 ( +			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); +	    } + +	    w--; +	    dst++; +	} +    } + +} + +static void +sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, +                                 pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint16_t    *dst_line, *dst, d; +    uint32_t    *src_line, *src, s; +    int dst_stride, src_stride; +    int32_t w; +    uint32_t opaque, zero; + +    __m128i ms; +    __m128i xmm_src, xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    s = *src++; +	    d = *dst; + +	    ms = unpack_32_1x128 (s); + +	    *dst++ = pack_565_32_16 ( +		pack_1x128_32 ( +		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); +	    w--; +	} + +	while (w >= 8) +	{ +	    /* First round */ +	    xmm_src = load_128_unaligned ((__m128i*)src); +	    xmm_dst = load_128_aligned  ((__m128i*)dst); + +	    opaque = is_opaque (xmm_src); +	    zero = is_zero 
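The 0565 destinations in the paths above are first widened to 8 bits per channel (expand565_16_1x128), blended, and then re-packed. The usual widening replicates the high bits of each channel into the freed low bits, so that 0x1f maps back to 0xff rather than 0xf8; a scalar sketch with an illustrative helper name:

#include <stdint.h>

/* Widen one r5g6b5 pixel to x8r8g8b8 with bit replication. */
static uint32_t
expand_0565_to_8888 (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >>  5) & 0x3f;
    uint32_t b =  p        & 0x1f;

    r = (r << 3) | (r >> 2);   /* 5 -> 8 bits */
    g = (g << 2) | (g >> 4);   /* 6 -> 8 bits */
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}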
(xmm_src); + +	    unpack_565_128_4x128 (xmm_dst, +				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + +	    /* preload next round*/ +	    xmm_src = load_128_unaligned ((__m128i*)(src + 4)); + +	    if (opaque) +	    { +		invert_colors_2x128 (xmm_src_lo, xmm_src_hi, +				     &xmm_dst0, &xmm_dst1); +	    } +	    else if (!zero) +	    { +		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, +					&xmm_dst0, &xmm_dst1); +	    } + +	    /* Second round */ +	    opaque = is_opaque (xmm_src); +	    zero = is_zero (xmm_src); + +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + +	    if (opaque) +	    { +		invert_colors_2x128 (xmm_src_lo, xmm_src_hi, +				     &xmm_dst2, &xmm_dst3); +	    } +	    else if (!zero) +	    { +		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, +					&xmm_dst2, &xmm_dst3); +	    } + +	    save_128_aligned ( +		(__m128i*)dst, pack_565_4x128_128 ( +		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + +	    w -= 8; +	    src += 8; +	    dst += 8; +	} + +	while (w) +	{ +	    s = *src++; +	    d = *dst; + +	    ms = unpack_32_1x128 (s); + +	    *dst++ = pack_565_32_16 ( +		pack_1x128_32 ( +		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); +	    w--; +	} +    } + +} + +static void +sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, +                                 pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst, d; +    uint32_t    *src_line, *src, s; +    int dst_stride, src_stride; +    int32_t w; +    uint32_t opaque, zero; + +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst_lo, xmm_dst_hi; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    s = *src++; +	    d = *dst; + +	    *dst++ = pack_1x128_32 ( +		over_rev_non_pre_1x128 ( +		    unpack_32_1x128 (s), unpack_32_1x128 (d))); + +	    w--; +	} + +	while (w >= 4) +	{ +	    xmm_src_hi = load_128_unaligned ((__m128i*)src); + +	    opaque = is_opaque (xmm_src_hi); +	    zero = is_zero (xmm_src_hi); + +	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + +	    if (opaque) +	    { +		invert_colors_2x128 (xmm_src_lo, xmm_src_hi, +				     &xmm_dst_lo, &xmm_dst_hi); + +		save_128_aligned ( +		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	    } +	    else if (!zero) +	    { +		xmm_dst_hi = load_128_aligned  ((__m128i*)dst); + +		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + +		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, +					&xmm_dst_lo, &xmm_dst_hi); + +		save_128_aligned ( +		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	    } + +	    w -= 4; +	    dst += 4; +	    src += 4; +	} + +	while (w) +	{ +	    s = *src++; +	    d = *dst; + +	    *dst++ = pack_1x128_32 ( +		over_rev_non_pre_1x128 ( +		    unpack_32_1x128 (s), unpack_32_1x128 (d))); + +	    w--; +	} +    } + +} + +static void +sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, +                                    pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint16_t    *dst_line, *dst, d; +    uint32_t    *mask_line, *mask, m; +    int dst_stride, mask_stride; +    int w; +    uint32_t 
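Both pixbuf paths above classify a whole block of four source pixels before blending: fully opaque blocks can be converted and stored directly, fully transparent blocks skipped. A self-contained sketch of such block tests with SSE2 compare-and-movemask; these are stand-ins, not pixman's is_opaque/is_zero helpers:

#include <stdint.h>
#include <emmintrin.h>

/* All four a8r8g8b8 pixels fully opaque? */
static int
block_is_opaque (__m128i px)
{
    __m128i amask = _mm_set1_epi32 ((int) 0xff000000);
    __m128i alpha = _mm_and_si128 (px, amask);

    return _mm_movemask_epi8 (_mm_cmpeq_epi8 (alpha, amask)) == 0xffff;
}

/* All four pixels zero? */
static int
block_is_zero (__m128i px)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (px, _mm_setzero_si128 ())) == 0xffff;
}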
pack_cmp; + +    __m128i xmm_src, xmm_alpha; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; +    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + +    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + +    xmm_src = expand_pixel_32_1x128 (src); +    xmm_alpha = expand_alpha_1x128 (xmm_src); +    mmx_src = xmm_src; +    mmx_alpha = xmm_alpha; + +    while (height--) +    { +	w = width; +	mask = mask_line; +	dst = dst_line; +	mask_line += mask_stride; +	dst_line += dst_stride; + +	while (w && ((uintptr_t)dst & 15)) +	{ +	    m = *(uint32_t *) mask; + +	    if (m) +	    { +		d = *dst; +		mmx_mask = unpack_32_1x128 (m); +		mmx_dest = expand565_16_1x128 (d); + +		*dst = pack_565_32_16 ( +		    pack_1x128_32 ( +			in_over_1x128 ( +			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); +	    } + +	    w--; +	    dst++; +	    mask++; +	} + +	while (w >= 8) +	{ +	    /* First round */ +	    xmm_mask = load_128_unaligned ((__m128i*)mask); +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    pack_cmp = _mm_movemask_epi8 ( +		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + +	    unpack_565_128_4x128 (xmm_dst, +				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); +	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + +	    /* preload next round */ +	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); + +	    /* preload next round */ +	    if (pack_cmp != 0xffff) +	    { +		in_over_2x128 (&xmm_src, &xmm_src, +			       &xmm_alpha, &xmm_alpha, +			       &xmm_mask_lo, &xmm_mask_hi, +			       &xmm_dst0, &xmm_dst1); +	    } + +	    /* Second round */ +	    pack_cmp = _mm_movemask_epi8 ( +		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + +	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + +	    if (pack_cmp != 0xffff) +	    { +		in_over_2x128 (&xmm_src, &xmm_src, +			       &xmm_alpha, &xmm_alpha, +			       &xmm_mask_lo, &xmm_mask_hi, +			       &xmm_dst2, &xmm_dst3); +	    } + +	    save_128_aligned ( +		(__m128i*)dst, pack_565_4x128_128 ( +		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + +	    w -= 8; +	    dst += 8; +	    mask += 8; +	} + +	while (w) +	{ +	    m = *(uint32_t *) mask; + +	    if (m) +	    { +		d = *dst; +		mmx_mask = unpack_32_1x128 (m); +		mmx_dest = expand565_16_1x128 (d); + +		*dst = pack_565_32_16 ( +		    pack_1x128_32 ( +			in_over_1x128 ( +			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); +	    } + +	    w--; +	    dst++; +	    mask++; +	} +    } + +} + +static void +sse2_composite_in_n_8_8 (pixman_implementation_t *imp, +                         pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint8_t     *mask_line, *mask; +    int dst_stride, mask_stride; +    uint32_t d, m; +    uint32_t src; +    int32_t w; + +    __m128i xmm_alpha; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 
(src)); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w && ((uintptr_t)dst & 15)) +	{ +	    m = (uint32_t) *mask++; +	    d = (uint32_t) *dst; + +	    *dst++ = (uint8_t) pack_1x128_32 ( +		pix_multiply_1x128 ( +		    pix_multiply_1x128 (xmm_alpha, +				       unpack_32_1x128 (m)), +		    unpack_32_1x128 (d))); +	    w--; +	} + +	while (w >= 16) +	{ +	    xmm_mask = load_128_unaligned ((__m128i*)mask); +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, +				&xmm_mask_lo, &xmm_mask_hi, +				&xmm_mask_lo, &xmm_mask_hi); + +	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, +				&xmm_dst_lo, &xmm_dst_hi, +				&xmm_dst_lo, &xmm_dst_hi); + +	    save_128_aligned ( +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	    mask += 16; +	    dst += 16; +	    w -= 16; +	} + +	while (w) +	{ +	    m = (uint32_t) *mask++; +	    d = (uint32_t) *dst; + +	    *dst++ = (uint8_t) pack_1x128_32 ( +		pix_multiply_1x128 ( +		    pix_multiply_1x128 ( +			xmm_alpha, unpack_32_1x128 (m)), +		    unpack_32_1x128 (d))); +	    w--; +	} +    } + +} + +static void +sse2_composite_in_n_8 (pixman_implementation_t *imp, +		       pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    int dst_stride; +    uint32_t d; +    uint32_t src; +    int32_t w; + +    __m128i xmm_alpha; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + +    src = src >> 24; + +    if (src == 0xff) +	return; + +    if (src == 0x00) +    { +	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, +		     8, dest_x, dest_y, width, height, src); + +	return; +    } + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	w = width; + +	while (w && ((uintptr_t)dst & 15)) +	{ +	    d = (uint32_t) *dst; + +	    *dst++ = (uint8_t) pack_1x128_32 ( +		pix_multiply_1x128 ( +		    xmm_alpha, +		    unpack_32_1x128 (d))); +	    w--; +	} + +	while (w >= 16) +	{ +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); +	     +	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, +				&xmm_dst_lo, &xmm_dst_hi, +				&xmm_dst_lo, &xmm_dst_hi); + +	    save_128_aligned ( +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	    dst += 16; +	    w -= 16; +	} + +	while (w) +	{ +	    d = (uint32_t) *dst; + +	    *dst++ = (uint8_t) pack_1x128_32 ( +		pix_multiply_1x128 ( +		    xmm_alpha, +		    unpack_32_1x128 (d))); +	    w--; +	} +    } + +} + +static void +sse2_composite_in_8_8 (pixman_implementation_t *imp, +                       pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint8_t     *src_line, *src; +    int src_stride, dst_stride; +    int32_t w; +    uint32_t s, d; + +    __m128i xmm_src, xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint8_t, src_stride, src_line, 
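The IN paths above (sse2_composite_in_n_8_8 and in_n_8) reduce to one operation: every destination byte is multiplied by a coverage byte with the rounded divide by 255. A sketch of that multiply done 16 bytes at a time by unpacking to 16-bit lanes; this is an illustrative routine, not pixman's pix_multiply_2x128:

#include <emmintrin.h>

/* Per-byte rounded (a * b) / 255 on two registers of 16 bytes. */
static __m128i
mul_un8x16 (__m128i a, __m128i b)
{
    __m128i zero = _mm_setzero_si128 ();
    __m128i half = _mm_set1_epi16 (0x80);
    __m128i alo = _mm_unpacklo_epi8 (a, zero), ahi = _mm_unpackhi_epi8 (a, zero);
    __m128i blo = _mm_unpacklo_epi8 (b, zero), bhi = _mm_unpackhi_epi8 (b, zero);
    __m128i tlo = _mm_add_epi16 (_mm_mullo_epi16 (alo, blo), half);
    __m128i thi = _mm_add_epi16 (_mm_mullo_epi16 (ahi, bhi), half);

    /* t = (t + (t >> 8)) >> 8, the exact rounded /255 for products <= 255*255 */
    tlo = _mm_srli_epi16 (_mm_add_epi16 (tlo, _mm_srli_epi16 (tlo, 8)), 8);
    thi = _mm_srli_epi16 (_mm_add_epi16 (thi, _mm_srli_epi16 (thi, 8)), 8);

    return _mm_packus_epi16 (tlo, thi);
}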
1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && ((uintptr_t)dst & 15)) +	{ +	    s = (uint32_t) *src++; +	    d = (uint32_t) *dst; + +	    *dst++ = (uint8_t) pack_1x128_32 ( +		pix_multiply_1x128 ( +		    unpack_32_1x128 (s), unpack_32_1x128 (d))); +	    w--; +	} + +	while (w >= 16) +	{ +	    xmm_src = load_128_unaligned ((__m128i*)src); +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, +				&xmm_dst_lo, &xmm_dst_hi, +				&xmm_dst_lo, &xmm_dst_hi); + +	    save_128_aligned ( +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	    src += 16; +	    dst += 16; +	    w -= 16; +	} + +	while (w) +	{ +	    s = (uint32_t) *src++; +	    d = (uint32_t) *dst; + +	    *dst++ = (uint8_t) pack_1x128_32 ( +		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d))); +	    w--; +	} +    } + +} + +static void +sse2_composite_add_n_8_8 (pixman_implementation_t *imp, +			  pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint8_t     *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    uint32_t src; +    uint32_t m, d; + +    __m128i xmm_alpha; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w && ((uintptr_t)dst & 15)) +	{ +	    m = (uint32_t) *mask++; +	    d = (uint32_t) *dst; + +	    *dst++ = (uint8_t) pack_1x128_32 ( +		_mm_adds_epu16 ( +		    pix_multiply_1x128 ( +			xmm_alpha, unpack_32_1x128 (m)), +		    unpack_32_1x128 (d))); +	    w--; +	} + +	while (w >= 16) +	{ +	    xmm_mask = load_128_unaligned ((__m128i*)mask); +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, +				&xmm_mask_lo, &xmm_mask_hi, +				&xmm_mask_lo, &xmm_mask_hi); + +	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); +	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); + +	    save_128_aligned ( +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + +	    mask += 16; +	    dst += 16; +	    w -= 16; +	} + +	while (w) +	{ +	    m = (uint32_t) *mask++; +	    d = (uint32_t) *dst; + +	    *dst++ = (uint8_t) pack_1x128_32 ( +		_mm_adds_epu16 ( +		    pix_multiply_1x128 ( +			xmm_alpha, unpack_32_1x128 (m)), +		    unpack_32_1x128 (d))); + +	    w--; +	} +    } + +} + +static void +sse2_composite_add_n_8 (pixman_implementation_t *imp, +			pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    int dst_stride; +    int32_t w; +    uint32_t src; + +    __m128i xmm_src; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + +    src = _pixman_image_get_solid 
(imp, src_image, dest_image->bits.format); + +    src >>= 24; + +    if (src == 0x00) +	return; + +    if (src == 0xff) +    { +	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, +		     8, dest_x, dest_y, width, height, 0xff); + +	return; +    } + +    src = (src << 24) | (src << 16) | (src << 8) | src; +    xmm_src = _mm_set_epi32 (src, src, src, src); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	w = width; + +	while (w && ((uintptr_t)dst & 15)) +	{ +	    *dst = (uint8_t)_mm_cvtsi128_si32 ( +		_mm_adds_epu8 ( +		    xmm_src, +		    _mm_cvtsi32_si128 (*dst))); + +	    w--; +	    dst++; +	} + +	while (w >= 16) +	{ +	    save_128_aligned ( +		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst))); + +	    dst += 16; +	    w -= 16; +	} + +	while (w) +	{ +	    *dst = (uint8_t)_mm_cvtsi128_si32 ( +		_mm_adds_epu8 ( +		    xmm_src, +		    _mm_cvtsi32_si128 (*dst))); + +	    w--; +	    dst++; +	} +    } + +} + +static void +sse2_composite_add_8_8 (pixman_implementation_t *imp, +			pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint8_t     *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; +    uint16_t t; + +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	src = src_line; + +	dst_line += dst_stride; +	src_line += src_stride; +	w = width; + +	/* Small head */ +	while (w && (uintptr_t)dst & 3) +	{ +	    t = (*dst) + (*src++); +	    *dst++ = t | (0 - (t >> 8)); +	    w--; +	} + +	sse2_combine_add_u (imp, op, +			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); + +	/* Small tail */ +	dst += w & 0xfffc; +	src += w & 0xfffc; + +	w &= 3; + +	while (w) +	{ +	    t = (*dst) + (*src++); +	    *dst++ = t | (0 - (t >> 8)); +	    w--; +	} +    } + +} + +static void +sse2_composite_add_8888_8888 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    int dst_stride, src_stride; + +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; + +	sse2_combine_add_u (imp, op, dst, src, NULL, width); +    } +} + +static void +sse2_composite_add_n_8888 (pixman_implementation_t *imp, +			   pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t *dst_line, *dst, src; +    int dst_stride; + +    __m128i xmm_src; + +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); +    if (src == 0) +	return; + +    if (src == ~0) +    { +	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32, +		     dest_x, dest_y, width, height, ~0); + +	return; +    } + +    xmm_src = _mm_set_epi32 (src, src, src, src); +    while (height--) +    { +	int w = width; +	uint32_t d; + +	dst = dst_line; +	dst_line += dst_stride; + +	while (w && (uintptr_t)dst & 15) +	{ +	    d = *dst; +	    *dst++ = +		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d))); +	    w--; +	} 
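The scalar head and tail of sse2_composite_add_8_8 above clamp with a branch-free trick: when the 8-bit sum overflows, t >> 8 becomes 1, so 0 - (t >> 8) is all ones and OR-ing saturates the result to 255. The vector body of sse2_composite_add_n_8 gets the same effect from the hardware saturating add. A compact standalone sketch of both forms:

#include <stdint.h>
#include <emmintrin.h>

static uint8_t
add_sat_u8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) a + b;

    return (uint8_t) (t | (0 - (t >> 8)));   /* sums >= 0x100 become 0xff */
}

static __m128i
add_sat_u8x16 (__m128i a, __m128i b)
{
    return _mm_adds_epu8 (a, b);             /* saturating add, 16 bytes at once */
}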
+ +	while (w >= 4) +	{ +	    save_128_aligned +		((__m128i*)dst, +		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); + +	    dst += 4; +	    w -= 4; +	} + +	while (w--) +	{ +	    d = *dst; +	    *dst++ = +		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, +						  _mm_cvtsi32_si128 (d))); +	} +    } +} + +static void +sse2_composite_add_n_8_8888 (pixman_implementation_t *imp, +			     pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t     *dst_line, *dst; +    uint8_t     *mask_line, *mask; +    int dst_stride, mask_stride; +    int32_t w; +    uint32_t src; + +    __m128i xmm_src; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); +    if (src == 0) +	return; +    xmm_src = expand_pixel_32_1x128 (src); + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	mask = mask_line; +	mask_line += mask_stride; +	w = width; + +	while (w && ((uintptr_t)dst & 15)) +	{ +	    uint8_t m = *mask++; +	    if (m) +	    { +		*dst = pack_1x128_32 +		    (_mm_adds_epu16 +		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), +		      unpack_32_1x128 (*dst))); +	    } +	    dst++; +	    w--; +	} + +	while (w >= 4) +	{ +	    uint32_t m; +            memcpy(&m, mask, sizeof(uint32_t)); + +	    if (m) +	    { +		__m128i xmm_mask_lo, xmm_mask_hi; +		__m128i xmm_dst_lo, xmm_dst_hi; + +		__m128i xmm_dst = load_128_aligned ((__m128i*)dst); +		__m128i xmm_mask = +		    _mm_unpacklo_epi8 (unpack_32_1x128(m), +				       _mm_setzero_si128 ()); + +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, +					&xmm_mask_lo, &xmm_mask_hi); + +		pix_multiply_2x128 (&xmm_src, &xmm_src, +				    &xmm_mask_lo, &xmm_mask_hi, +				    &xmm_mask_lo, &xmm_mask_hi); + +		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); +		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); + +		save_128_aligned ( +		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	    } + +	    w -= 4; +	    dst += 4; +	    mask += 4; +	} + +	while (w) +	{ +	    uint8_t m = *mask++; +	    if (m) +	    { +		*dst = pack_1x128_32 +		    (_mm_adds_epu16 +		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), +		      unpack_32_1x128 (*dst))); +	    } +	    dst++; +	    w--; +	} +    } +} + +static pixman_bool_t +sse2_blt (pixman_implementation_t *imp, +          uint32_t *               src_bits, +          uint32_t *               dst_bits, +          int                      src_stride, +          int                      dst_stride, +          int                      src_bpp, +          int                      dst_bpp, +          int                      src_x, +          int                      src_y, +          int                      dest_x, +          int                      dest_y, +          int                      width, +          int                      height) +{ +    uint8_t *   src_bytes; +    uint8_t *   dst_bytes; +    int byte_width; + +    if (src_bpp != dst_bpp) +	return FALSE; + +    if (src_bpp == 16) +    { +	src_stride = src_stride * (int) sizeof (uint32_t) / 2; +	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; +	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); +	
dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); +	byte_width = 2 * width; +	src_stride *= 2; +	dst_stride *= 2; +    } +    else if (src_bpp == 32) +    { +	src_stride = src_stride * (int) sizeof (uint32_t) / 4; +	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; +	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); +	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); +	byte_width = 4 * width; +	src_stride *= 4; +	dst_stride *= 4; +    } +    else +    { +	return FALSE; +    } + +    while (height--) +    { +	int w; +	uint8_t *s = src_bytes; +	uint8_t *d = dst_bytes; +	src_bytes += src_stride; +	dst_bytes += dst_stride; +	w = byte_width; + +	while (w >= 2 && ((uintptr_t)d & 3)) +	{ +            memmove(d, s, 2); +	    w -= 2; +	    s += 2; +	    d += 2; +	} + +	while (w >= 4 && ((uintptr_t)d & 15)) +	{ +            memmove(d, s, 4); + +	    w -= 4; +	    s += 4; +	    d += 4; +	} + +	while (w >= 64) +	{ +	    __m128i xmm0, xmm1, xmm2, xmm3; + +	    xmm0 = load_128_unaligned ((__m128i*)(s)); +	    xmm1 = load_128_unaligned ((__m128i*)(s + 16)); +	    xmm2 = load_128_unaligned ((__m128i*)(s + 32)); +	    xmm3 = load_128_unaligned ((__m128i*)(s + 48)); + +	    save_128_aligned ((__m128i*)(d),    xmm0); +	    save_128_aligned ((__m128i*)(d + 16), xmm1); +	    save_128_aligned ((__m128i*)(d + 32), xmm2); +	    save_128_aligned ((__m128i*)(d + 48), xmm3); + +	    s += 64; +	    d += 64; +	    w -= 64; +	} + +	while (w >= 16) +	{ +	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); + +	    w -= 16; +	    d += 16; +	    s += 16; +	} + +	while (w >= 4) +	{ +            memmove(d, s, 4); + +	    w -= 4; +	    s += 4; +	    d += 4; +	} + +	if (w >= 2) +	{ +            memmove(d, s, 2); +	    w -= 2; +	    s += 2; +	    d += 2; +	} +    } + +    return TRUE; +} + +static void +sse2_composite_copy_area (pixman_implementation_t *imp, +                          pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    sse2_blt (imp, src_image->bits.bits, +	      dest_image->bits.bits, +	      src_image->bits.rowstride, +	      dest_image->bits.rowstride, +	      PIXMAN_FORMAT_BPP (src_image->bits.format), +	      PIXMAN_FORMAT_BPP (dest_image->bits.format), +	      src_x, src_y, dest_x, dest_y, width, height); +} + +static void +sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, +                                 pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *src, *src_line, s; +    uint32_t    *dst, *dst_line, d; +    uint8_t         *mask, *mask_line; +    uint32_t m; +    int src_stride, mask_stride, dst_stride; +    int32_t w; +    __m128i ms; + +    __m128i xmm_src, xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +        src = src_line; +        src_line += src_stride; +        dst = dst_line; +        dst_line += dst_stride; +        mask = mask_line; +        mask_line += mask_stride; + +        w = width; + +        while (w && (uintptr_t)dst & 15) +        { +            s = 0xff000000 | *src++; +            memcpy(&m, mask++, 
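sse2_blt above receives strides in uint32_t units (how pixman stores rowstrides), converts them to element and then byte strides, and computes the first byte of the copied region from (x, y) and the bytes-per-pixel. A simplified scalar stand-in showing the same addressing with a plain per-row copy; the real loop replaces the copy with aligned 16-byte stores:

#include <stdint.h>
#include <string.h>

/* Copy a width x height block; strides are in uint32_t units, bpp is 16 or 32. */
static void
blt_rows (uint32_t *src_bits, uint32_t *dst_bits,
          int src_stride, int dst_stride, int bpp,
          int src_x, int src_y, int dest_x, int dest_y,
          int width, int height)
{
    int bytes_pp         = bpp / 8;
    int src_stride_bytes = src_stride * (int) sizeof (uint32_t);
    int dst_stride_bytes = dst_stride * (int) sizeof (uint32_t);
    uint8_t *s = (uint8_t *) src_bits + src_y * src_stride_bytes + src_x * bytes_pp;
    uint8_t *d = (uint8_t *) dst_bits + dest_y * dst_stride_bytes + dest_x * bytes_pp;

    while (height--)
    {
        memmove (d, s, (size_t) width * bytes_pp);   /* one row */
        s += src_stride_bytes;
        d += dst_stride_bytes;
    }
}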
sizeof(uint32_t)); +            d = *dst; +            ms = unpack_32_1x128 (s); + +            if (m != 0xff) +            { +		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); +		__m128i md = unpack_32_1x128 (d); + +                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); +            } + +            *dst++ = pack_1x128_32 (ms); +            w--; +        } + +        while (w >= 4) +        { +            memcpy(&m, mask, sizeof(uint32_t)); +            xmm_src = _mm_or_si128 ( +		load_128_unaligned ((__m128i*)src), mask_ff000000); + +            if (m == 0xffffffff) +            { +                save_128_aligned ((__m128i*)dst, xmm_src); +            } +            else +            { +                xmm_dst = load_128_aligned ((__m128i*)dst); + +                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + +                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); +                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +                expand_alpha_rev_2x128 ( +		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, +			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, +			       &xmm_dst_lo, &xmm_dst_hi); + +                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +            } + +            src += 4; +            dst += 4; +            mask += 4; +            w -= 4; +        } + +        while (w) +        { +            memcpy(&m, mask++, sizeof(uint32_t)); + +            if (m) +            { +                s = 0xff000000 | *src; + +                if (m == 0xff) +                { +                    *dst = s; +                } +                else +                { +		    __m128i ma, md, ms; + +                    d = *dst; + +		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); +		    md = unpack_32_1x128 (d); +		    ms = unpack_32_1x128 (s); + +                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); +                } + +            } + +            src++; +            dst++; +            w--; +        } +    } + +} + +static void +sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, +                                 pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *src, *src_line, s; +    uint32_t    *dst, *dst_line, d; +    uint8_t         *mask, *mask_line; +    uint32_t m; +    int src_stride, mask_stride, dst_stride; +    int32_t w; + +    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +        src = src_line; +        src_line += src_stride; +        dst = dst_line; +        dst_line += dst_stride; +        mask = mask_line; +        mask_line += mask_stride; + +        w = width; + +        while (w && (uintptr_t)dst & 15) +        { +	    uint32_t sa; + +            s = *src++; +            m = (uint32_t) *mask++; +            d = *dst; + +	    sa = s >> 24; + +	    if (m) +	    { +		if (sa == 0xff && 
m == 0xff) +		{ +		    *dst = s; +		} +		else +		{ +		    __m128i ms, md, ma, msa; + +		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); +		    ms = unpack_32_1x128 (s); +		    md = unpack_32_1x128 (d); + +		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + +		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); +		} +	    } + +	    dst++; +            w--; +        } + +        while (w >= 4) +        { +            memcpy(&m, mask, sizeof(uint32_t)); + +	    if (m) +	    { +		xmm_src = load_128_unaligned ((__m128i*)src); + +		if (m == 0xffffffff && is_opaque (xmm_src)) +		{ +		    save_128_aligned ((__m128i *)dst, xmm_src); +		} +		else +		{ +		    xmm_dst = load_128_aligned ((__m128i *)dst); + +		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + +		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); +		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); +		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, +				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + +		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +		} +	    } + +            src += 4; +            dst += 4; +            mask += 4; +            w -= 4; +        } + +        while (w) +        { +	    uint32_t sa; + +            s = *src++; +            m = (uint32_t) *mask++; +            d = *dst; + +	    sa = s >> 24; + +	    if (m) +	    { +		if (sa == 0xff && m == 0xff) +		{ +		    *dst = s; +		} +		else +		{ +		    __m128i ms, md, ma, msa; + +		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); +		    ms = unpack_32_1x128 (s); +		    md = unpack_32_1x128 (d); + +		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + +		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); +		} +	    } + +	    dst++; +            w--; +        } +    } + +} + +static void +sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, +				    pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src; +    uint32_t    *dst_line, *dst; +    __m128i xmm_src; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_dsta_hi, xmm_dsta_lo; +    int dst_stride; +    int32_t w; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + +    xmm_src = expand_pixel_32_1x128 (src); + +    while (height--) +    { +	dst = dst_line; + +	dst_line += dst_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    __m128i vd; + +	    vd = unpack_32_1x128 (*dst); + +	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), +					      xmm_src)); +	    w--; +	    dst++; +	} + +	while (w >= 4) +	{ +	    __m128i tmp_lo, tmp_hi; + +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); +	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); + +	    tmp_lo = xmm_src; +	    tmp_hi = xmm_src; + +	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi, +			&xmm_dsta_lo, &xmm_dsta_hi, +			&tmp_lo, &tmp_hi); + +	    save_128_aligned ( +		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); + +	    w -= 4; +	    dst += 4; +	} + +	
while (w) +	{ +	    __m128i vd; + +	    vd = unpack_32_1x128 (*dst); + +	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), +					      xmm_src)); +	    w--; +	    dst++; +	} + +    } + +} + +static void +sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, +				    pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *src, *src_line, s; +    uint32_t    *dst, *dst_line, d; +    uint32_t    *mask, *mask_line; +    uint32_t    m; +    int src_stride, mask_stride, dst_stride; +    int32_t w; + +    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +        src = src_line; +        src_line += src_stride; +        dst = dst_line; +        dst_line += dst_stride; +        mask = mask_line; +        mask_line += mask_stride; + +        w = width; + +        while (w && (uintptr_t)dst & 15) +        { +	    uint32_t sa; + +            s = *src++; +            m = (*mask++) >> 24; +            d = *dst; + +	    sa = s >> 24; + +	    if (m) +	    { +		if (sa == 0xff && m == 0xff) +		{ +		    *dst = s; +		} +		else +		{ +		    __m128i ms, md, ma, msa; + +		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); +		    ms = unpack_32_1x128 (s); +		    md = unpack_32_1x128 (d); + +		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + +		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); +		} +	    } + +	    dst++; +            w--; +        } + +        while (w >= 4) +        { +	    xmm_mask = load_128_unaligned ((__m128i*)mask); + +	    if (!is_transparent (xmm_mask)) +	    { +		xmm_src = load_128_unaligned ((__m128i*)src); + +		if (is_opaque (xmm_mask) && is_opaque (xmm_src)) +		{ +		    save_128_aligned ((__m128i *)dst, xmm_src); +		} +		else +		{ +		    xmm_dst = load_128_aligned ((__m128i *)dst); + +		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); +		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); +		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, +				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + +		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +		} +	    } + +            src += 4; +            dst += 4; +            mask += 4; +            w -= 4; +        } + +        while (w) +        { +	    uint32_t sa; + +            s = *src++; +            m = (*mask++) >> 24; +            d = *dst; + +	    sa = s >> 24; + +	    if (m) +	    { +		if (sa == 0xff && m == 0xff) +		{ +		    *dst = s; +		} +		else +		{ +		    __m128i ms, md, ma, msa; + +		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); +		    ms = unpack_32_1x128 (s); +		    md = unpack_32_1x128 (d); + +		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + +		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); +		} +	    } + +	    dst++; +            w--; +        } +    } + +} + +/* A variant of 
'sse2_combine_over_u' with minor tweaks */ +static force_inline void +scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd, +                                             const uint32_t* ps, +                                             int32_t         w, +                                             pixman_fixed_t  vx, +                                             pixman_fixed_t  unit_x, +                                             pixman_fixed_t  src_width_fixed, +                                             pixman_bool_t   fully_transparent_src) +{ +    uint32_t s, d; +    const uint32_t* pm = NULL; + +    __m128i xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_src_lo, xmm_src_hi; +    __m128i xmm_alpha_lo, xmm_alpha_hi; + +    if (fully_transparent_src) +	return; + +    /* Align dst on a 16-byte boundary */ +    while (w && ((uintptr_t)pd & 15)) +    { +	d = *pd; +	s = combine1 (ps + pixman_fixed_to_int (vx), pm); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	*pd++ = core_combine_over_u_pixel_sse2 (s, d); +	if (pm) +	    pm++; +	w--; +    } + +    while (w >= 4) +    { +	__m128i tmp; +	uint32_t tmp1, tmp2, tmp3, tmp4; + +	tmp1 = *(ps + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; +	tmp2 = *(ps + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; +	tmp3 = *(ps + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; +	tmp4 = *(ps + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + +	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); + +	if (is_opaque (xmm_src_hi)) +	{ +	    save_128_aligned ((__m128i*)pd, xmm_src_hi); +	} +	else if (!is_zero (xmm_src_hi)) +	{ +	    xmm_dst_hi = load_128_aligned ((__m128i*) pd); + +	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); +	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + +	    expand_alpha_2x128 ( +		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); + +	    over_2x128 (&xmm_src_lo, &xmm_src_hi, +			&xmm_alpha_lo, &xmm_alpha_hi, +			&xmm_dst_lo, &xmm_dst_hi); + +	    /* rebuid the 4 pixel data and save*/ +	    save_128_aligned ((__m128i*)pd, +			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	} + +	w -= 4; +	pd += 4; +	if (pm) +	    pm += 4; +    } + +    while (w) +    { +	d = *pd; +	s = combine1 (ps + pixman_fixed_to_int (vx), pm); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	*pd++ = core_combine_over_u_pixel_sse2 (s, d); +	if (pm) +	    pm++; + +	w--; +    } +} + +FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, +		       scaled_nearest_scanline_sse2_8888_8888_OVER, +		       uint32_t, uint32_t, COVER) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, +		       scaled_nearest_scanline_sse2_8888_8888_OVER, +		       uint32_t, uint32_t, NONE) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, +		       scaled_nearest_scanline_sse2_8888_8888_OVER, +		       uint32_t, uint32_t, PAD) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER, +		       scaled_nearest_scanline_sse2_8888_8888_OVER, +		       uint32_t, uint32_t, NORMAL) + +static force_inline void +scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, +					       uint32_t *       dst, +					       const uint32_t * src, +					       int32_t          w, +					       pixman_fixed_t   vx, +					       pixman_fixed_t   unit_x, +					       pixman_fixed_t   src_width_fixed, +					       
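The nearest-neighbour scanline above walks the source with 16.16 fixed-point coordinates: each output pixel reads ps[pixman_fixed_to_int (vx)], then vx advances by unit_x and is wrapped by src_width_fixed for the repeating case (the loop's wrap test against zero relies on how the caller biases vx and the source pointer). A simplified sketch of the addressing that wraps into [0, src_width) directly and copies instead of blending; the fixed-point type is declared locally to keep it standalone:

#include <stdint.h>

#define FIXED_TO_INT(f)  ((int) ((f) >> 16))    /* 16.16 fixed point -> integer */

static void
nearest_scanline (uint32_t *dst, const uint32_t *src,
                  int src_width, int dst_width,
                  int32_t vx, int32_t unit_x)    /* vx, unit_x in 16.16 */
{
    int32_t src_width_fixed = (int32_t) src_width << 16;
    int     i;

    for (i = 0; i < dst_width; i++)
    {
        dst[i] = src[FIXED_TO_INT (vx)];
        vx += unit_x;
        while (vx >= src_width_fixed)            /* NORMAL repeat: wrap around */
            vx -= src_width_fixed;
    }
}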
pixman_bool_t    zero_src) +{ +    __m128i xmm_mask; +    __m128i xmm_src, xmm_src_lo, xmm_src_hi; +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +    __m128i xmm_alpha_lo, xmm_alpha_hi; + +    if (zero_src || (*mask >> 24) == 0) +	return; + +    xmm_mask = create_mask_16_128 (*mask >> 24); + +    while (w && (uintptr_t)dst & 15) +    { +	uint32_t s = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	if (s) +	{ +	    uint32_t d = *dst; + +	    __m128i ms = unpack_32_1x128 (s); +	    __m128i alpha     = expand_alpha_1x128 (ms); +	    __m128i dest      = xmm_mask; +	    __m128i alpha_dst = unpack_32_1x128 (d); + +	    *dst = pack_1x128_32 ( +		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); +	} +	dst++; +	w--; +    } + +    while (w >= 4) +    { +	uint32_t tmp1, tmp2, tmp3, tmp4; + +	tmp1 = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; +	tmp2 = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; +	tmp3 = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; +	tmp4 = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + +	if (!is_zero (xmm_src)) +	{ +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); +	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +			        &xmm_alpha_lo, &xmm_alpha_hi); + +	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, +			   &xmm_alpha_lo, &xmm_alpha_hi, +			   &xmm_mask, &xmm_mask, +			   &xmm_dst_lo, &xmm_dst_hi); + +	    save_128_aligned ( +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	} + +	dst += 4; +	w -= 4; +    } + +    while (w) +    { +	uint32_t s = *(src + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	if (s) +	{ +	    uint32_t d = *dst; + +	    __m128i ms = unpack_32_1x128 (s); +	    __m128i alpha = expand_alpha_1x128 (ms); +	    __m128i mask  = xmm_mask; +	    __m128i dest  = unpack_32_1x128 (d); + +	    *dst = pack_1x128_32 ( +		in_over_1x128 (&ms, &alpha, &mask, &dest)); +	} + +	dst++; +	w--; +    } + +} + +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, +			      scaled_nearest_scanline_sse2_8888_n_8888_OVER, +			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, +			      scaled_nearest_scanline_sse2_8888_n_8888_OVER, +			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, +			      scaled_nearest_scanline_sse2_8888_n_8888_OVER, +			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, +			      scaled_nearest_scanline_sse2_8888_n_8888_OVER, +			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) + +#if PSHUFD_IS_FAST + +/***********************************************************************************/ + +# define BILINEAR_DECLARE_VARIABLES						\ +    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\ +    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\ +    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\ +    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\ +					   unit_x, -unit_x, unit_x, 
-unit_x);	\ +    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\ +					   unit_x * 4, -unit_x * 4,		\ +					   unit_x * 4, -unit_x * 4,		\ +					   unit_x * 4, -unit_x * 4);		\ +    const __m128i xmm_zero = _mm_setzero_si128 ();				\ +    __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3,	\ +				   vx + unit_x * 2, -(vx + 1) - unit_x * 2,	\ +				   vx + unit_x * 1, -(vx + 1) - unit_x * 1,	\ +				   vx + unit_x * 0, -(vx + 1) - unit_x * 0);	\ +    __m128i xmm_wh_state; + +#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_)			\ +do {										\ +    int phase = phase_;								\ +    __m128i xmm_wh, xmm_a, xmm_b;						\ +    /* fetch 2x2 pixel block into sse2 registers */				\ +    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\ +    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\ +    vx += unit_x;								\ +    /* vertical interpolation */						\ +    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\ +    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\ +    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);						\ +    /* calculate horizontal weights */						\ +    if (phase <= 0)								\ +    {										\ +	xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\ +					16 - BILINEAR_INTERPOLATION_BITS));	\ +	xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4);		\ +	phase = 0;								\ +    }										\ +    xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase,	\ +							   phase, phase));	\ +    /* horizontal interpolation */						\ +    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\ +		xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh);		\ +    /* shift the result */							\ +    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\ +} while (0) + +#else /************************************************************************/ + +# define BILINEAR_DECLARE_VARIABLES						\ +    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\ +    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\ +    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\ +    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\ +					  unit_x, -unit_x, unit_x, -unit_x);	\ +    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\ +					   unit_x * 4, -unit_x * 4,		\ +					   unit_x * 4, -unit_x * 4,		\ +					   unit_x * 4, -unit_x * 4);		\ +    const __m128i xmm_zero = _mm_setzero_si128 ();				\ +    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\ +				   vx, -(vx + 1), vx, -(vx + 1)) + +#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase)			\ +do {										\ +    __m128i xmm_wh, xmm_a, xmm_b;						\ +    /* fetch 2x2 pixel block into sse2 registers */				\ +    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\ +    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\ +    (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */		\ +    vx += unit_x;								\ +    /* vertical interpolation */						\ +    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\ +    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\ +    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\ +    /* calculate horizontal weights */						\ +    xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,			\ +					16 - BILINEAR_INTERPOLATION_BITS));	\ +    xmm_x = _mm_add_epi16 
(xmm_x, xmm_ux1);					\ +    /* horizontal interpolation */						\ +    xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a);	\ +    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh);		\ +    /* shift the result */							\ +    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\ +} while (0) + +/***********************************************************************************/ + +#endif + +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix);					\ +do {										\ +	__m128i xmm_pix;							\ +	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1);			\ +	xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix);				\ +	xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix);				\ +	pix = _mm_cvtsi128_si32 (xmm_pix);					\ +} while(0) + +#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix);					\ +do {										\ +	__m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4;				\ +	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0);			\ +	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1);			\ +	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2);			\ +	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3);			\ +	xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2);			\ +	xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4);			\ +	pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3);				\ +} while(0) + +#define BILINEAR_SKIP_ONE_PIXEL()						\ +do {										\ +    vx += unit_x;								\ +    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\ +} while(0) + +#define BILINEAR_SKIP_FOUR_PIXELS()						\ +do {										\ +    vx += unit_x * 4;								\ +    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4);					\ +} while(0) + +/***********************************************************************************/ + +static force_inline void +scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst, +					     const uint32_t * mask, +					     const uint32_t * src_top, +					     const uint32_t * src_bottom, +					     int32_t          w, +					     int              wt, +					     int              wb, +					     pixman_fixed_t   vx_, +					     pixman_fixed_t   unit_x_, +					     pixman_fixed_t   max_vx, +					     pixman_bool_t    zero_src) +{ +    intptr_t vx = vx_; +    intptr_t unit_x = unit_x_; +    BILINEAR_DECLARE_VARIABLES; +    uint32_t pix1, pix2; + +    while (w && ((uintptr_t)dst & 15)) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); +	*dst++ = pix1; +	w--; +    } + +    while ((w -= 4) >= 0) { +	__m128i xmm_src; +	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); +	_mm_store_si128 ((__m128i *)dst, xmm_src); +	dst += 4; +    } + +    if (w & 2) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); +	*dst++ = pix1; +	*dst++ = pix2; +    } + +    if (w & 1) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); +	*dst = pix1; +    } + +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       NORMAL, FLAG_NONE) + +static force_inline void 
+scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst, +					     const uint32_t * mask, +					     const uint32_t * src_top, +					     const uint32_t * src_bottom, +					     int32_t          w, +					     int              wt, +					     int              wb, +					     pixman_fixed_t   vx_, +					     pixman_fixed_t   unit_x_, +					     pixman_fixed_t   max_vx, +					     pixman_bool_t    zero_src) +{ +    intptr_t vx = vx_; +    intptr_t unit_x = unit_x_; +    BILINEAR_DECLARE_VARIABLES; +    uint32_t pix1, pix2; + +    while (w && ((uintptr_t)dst & 15)) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); +	*dst++ = pix1 | 0xFF000000; +	w--; +    } + +    while ((w -= 4) >= 0) { +	__m128i xmm_src; +	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); +	_mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000)); +	dst += 4; +    } + +    if (w & 2) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); +	*dst++ = pix1 | 0xFF000000; +	*dst++ = pix2 | 0xFF000000; +    } + +    if (w & 1) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); +	*dst = pix1 | 0xFF000000; +    } +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC, +			       scaled_bilinear_scanline_sse2_x888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC, +			       scaled_bilinear_scanline_sse2_x888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC, +			       scaled_bilinear_scanline_sse2_x888_8888_SRC, +			       uint32_t, uint32_t, uint32_t, +			       NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst, +					      const uint32_t * mask, +					      const uint32_t * src_top, +					      const uint32_t * src_bottom, +					      int32_t          w, +					      int              wt, +					      int              wb, +					      pixman_fixed_t   vx_, +					      pixman_fixed_t   unit_x_, +					      pixman_fixed_t   max_vx, +					      pixman_bool_t    zero_src) +{ +    intptr_t vx = vx_; +    intptr_t unit_x = unit_x_; +    BILINEAR_DECLARE_VARIABLES; +    uint32_t pix1, pix2; + +    while (w && ((uintptr_t)dst & 15)) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + +	if (pix1) +	{ +	    pix2 = *dst; +	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); +	} + +	w--; +	dst++; +    } + +    while (w  >= 4) +    { +	__m128i xmm_src; +	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo; +	__m128i xmm_alpha_hi, xmm_alpha_lo; + +	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); + +	if (!is_zero (xmm_src)) +	{ +	    if (is_opaque (xmm_src)) +	    { +		save_128_aligned ((__m128i *)dst, xmm_src); +	    } +	    else +	    { +		__m128i xmm_dst = load_128_aligned ((__m128i *)dst); + +		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); +		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, +			    &xmm_dst_lo, &xmm_dst_hi); + +		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	    } +	} + +	w -= 4; +	dst += 4; +    } + +    while (w) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + +	if (pix1) +	{ +	    pix2 = *dst; +	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); +	} + +	w--; +	dst++; +    } +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, +		
	       scaled_bilinear_scanline_sse2_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst, +						const uint8_t  * mask, +						const uint32_t * src_top, +						const uint32_t * src_bottom, +						int32_t          w, +						int              wt, +						int              wb, +						pixman_fixed_t   vx_, +						pixman_fixed_t   unit_x_, +						pixman_fixed_t   max_vx, +						pixman_bool_t    zero_src) +{ +    intptr_t vx = vx_; +    intptr_t unit_x = unit_x_; +    BILINEAR_DECLARE_VARIABLES; +    uint32_t pix1, pix2; +    uint32_t m; + +    while (w && ((uintptr_t)dst & 15)) +    { +	uint32_t sa; + +	m = (uint32_t) *mask++; + +	if (m) +	{ +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); +	    sa = pix1 >> 24; + +	    if (sa == 0xff && m == 0xff) +	    { +		*dst = pix1; +	    } +	    else +	    { +		__m128i ms, md, ma, msa; + +		pix2 = *dst; +		ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); +		ms = unpack_32_1x128 (pix1); +		md = unpack_32_1x128 (pix2); + +		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + +		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); +	    } +	} +	else +	{ +	    BILINEAR_SKIP_ONE_PIXEL (); +	} + +	w--; +	dst++; +    } + +    while (w >= 4) +    { +	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; +	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + +        memcpy(&m, mask, sizeof(uint32_t)); + +	if (m) +	{ +	    BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); + +	    if (m == 0xffffffff && is_opaque (xmm_src)) +	    { +		save_128_aligned ((__m128i *)dst, xmm_src); +	    } +	    else +	    { +		xmm_dst = load_128_aligned ((__m128i *)dst); + +		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + +		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + +		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + +		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, +			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + +		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	    } +	} +	else +	{ +	    BILINEAR_SKIP_FOUR_PIXELS (); +	} + +	w -= 4; +	dst += 4; +	mask += 4; +    } + +    while (w) +    { +	uint32_t sa; + +	m = (uint32_t) *mask++; + +	if (m) +	{ +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); +	    sa = pix1 >> 24; + +	    if (sa == 0xff && m == 0xff) +	    { +		*dst = pix1; +	    } +	    else +	    { +		__m128i ms, md, ma, msa; + +		pix2 = *dst; +		ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); +		ms = unpack_32_1x128 (pix1); +		md = unpack_32_1x128 (pix2); + +		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + +		*dst = 
pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); +	    } +	} +	else +	{ +	    BILINEAR_SKIP_ONE_PIXEL (); +	} + +	w--; +	dst++; +    } +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       COVER, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       PAD, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       NONE, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, +			       uint32_t, uint8_t, uint32_t, +			       NORMAL, FLAG_HAVE_NON_SOLID_MASK) + +static force_inline void +scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst, +						const uint32_t * mask, +						const uint32_t * src_top, +						const uint32_t * src_bottom, +						int32_t          w, +						int              wt, +						int              wb, +						pixman_fixed_t   vx_, +						pixman_fixed_t   unit_x_, +						pixman_fixed_t   max_vx, +						pixman_bool_t    zero_src) +{ +    intptr_t vx = vx_; +    intptr_t unit_x = unit_x_; +    BILINEAR_DECLARE_VARIABLES; +    uint32_t pix1; +    __m128i xmm_mask; + +    if (zero_src || (*mask >> 24) == 0) +	return; + +    xmm_mask = create_mask_16_128 (*mask >> 24); + +    while (w && ((uintptr_t)dst & 15)) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); +	if (pix1) +	{ +		uint32_t d = *dst; + +		__m128i ms = unpack_32_1x128 (pix1); +		__m128i alpha     = expand_alpha_1x128 (ms); +		__m128i dest      = xmm_mask; +		__m128i alpha_dst = unpack_32_1x128 (d); + +		*dst = pack_1x128_32 +			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); +	} + +	dst++; +	w--; +    } + +    while (w >= 4) +    { +	__m128i xmm_src; +	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src); + +	if (!is_zero (xmm_src)) +	{ +	    __m128i xmm_src_lo, xmm_src_hi; +	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; +	    __m128i xmm_alpha_lo, xmm_alpha_hi; + +	    xmm_dst = load_128_aligned ((__m128i*)dst); + +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); +	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, +				&xmm_alpha_lo, &xmm_alpha_hi); + +	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, +			   &xmm_alpha_lo, &xmm_alpha_hi, +			   &xmm_mask, &xmm_mask, +			   &xmm_dst_lo, &xmm_dst_hi); + +	    save_128_aligned +		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); +	} + +	dst += 4; +	w -= 4; +    } + +    while (w) +    { +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); +	if (pix1) +	{ +		uint32_t d = *dst; + +		__m128i ms = unpack_32_1x128 (pix1); +		__m128i alpha     = expand_alpha_1x128 (ms); +		__m128i dest      = xmm_mask; +		__m128i alpha_dst = unpack_32_1x128 (d); + +		*dst = pack_1x128_32 +			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); +	} + +	dst++; +	w--; +    } +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, +			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       COVER, FLAG_HAVE_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, +			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       PAD, 
FLAG_HAVE_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, +			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       NONE, FLAG_HAVE_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, +			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, +			       uint32_t, uint32_t, uint32_t, +			       NORMAL, FLAG_HAVE_SOLID_MASK) + +static const pixman_fast_path_t sse2_fast_paths[] = +{ +    /* PIXMAN_OP_OVER */ +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, 
sse2_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), +    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), +    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888), +    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888), +    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888), +    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565), +    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565), +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), +     +    /* PIXMAN_OP_OVER_REVERSE */ +    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888), +    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888), + +    /* PIXMAN_OP_ADD */ +    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), +    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), +    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), +    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), +    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888), +    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888), +    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888), +    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888), +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888), +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888), +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888), +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888), + +    /* PIXMAN_OP_SRC */ +    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888), +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, 
sse2_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area), +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), +    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area), +    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area), + +    /* PIXMAN_OP_IN */ +    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8), +    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8), +    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8), + +    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888), + +    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888), +    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888), +    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888), +    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888), +    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888), +    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888), + +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888), +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888), + +    { PIXMAN_OP_NONE }, +}; + 
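+/*
+ * Editorial sketch (not part of upstream pixman): the sse2_fast_paths table
+ * above is only used when a composite request matches one of its rows
+ * (operator plus source, mask and destination formats and flags).  The
+ * fragment below shows, using only the public pixman API, the kind of call
+ * the "OVER, a8r8g8b8, null, x8r8g8b8" entry is meant to accelerate.  It is
+ * wrapped in "#if 0" so it has no effect on this file; the image size, the
+ * pixel values and the helper name composite_example are arbitrary choices
+ * made for the illustration.
+ */
+#if 0
+#include <stdlib.h>
+#include <stdint.h>
+#include "pixman.h"
+
+static void
+composite_example (void)
+{
+    int width = 64, height = 64;
+    int stride = width * 4;	/* rowstride in bytes, 4 bytes per a8r8g8b8 pixel */
+    int i;
+    uint32_t *src_bits = calloc (height, stride);
+    uint32_t *dst_bits = calloc (height, stride);	/* destination starts out black */
+    pixman_image_t *src, *dst;
+
+    if (!src_bits || !dst_bits)
+    {
+	free (src_bits);
+	free (dst_bits);
+	return;
+    }
+
+    /* Premultiplied translucent red source (alpha 0x80, red 0x80). */
+    for (i = 0; i < width * height; i++)
+	src_bits[i] = 0x80800000;
+
+    src = pixman_image_create_bits (PIXMAN_a8r8g8b8, width, height,
+				    src_bits, stride);
+    dst = pixman_image_create_bits (PIXMAN_x8r8g8b8, width, height,
+				    dst_bits, stride);
+
+    /* With no transform and no mask image, this request corresponds to the
+     * "OVER, a8r8g8b8, null, x8r8g8b8" row above and can therefore be
+     * dispatched to sse2_composite_over_8888_8888 when SSE2 is in use.
+     */
+    pixman_image_composite32 (PIXMAN_OP_OVER, src, NULL, dst,
+			      0, 0, 0, 0, 0, 0, width, height);
+
+    pixman_image_unref (src);
+    pixman_image_unref (dst);
+    free (src_bits);
+    free (dst_bits);
+}
+#endif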
+static uint32_t * +sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) +{ +    int w = iter->width; +    __m128i ff000000 = mask_ff000000; +    uint32_t *dst = iter->buffer; +    uint32_t *src = (uint32_t *)iter->bits; + +    iter->bits += iter->stride; + +    while (w && ((uintptr_t)dst) & 0x0f) +    { +	*dst++ = (*src++) | 0xff000000; +	w--; +    } + +    while (w >= 4) +    { +	save_128_aligned ( +	    (__m128i *)dst, _mm_or_si128 ( +		load_128_unaligned ((__m128i *)src), ff000000)); + +	dst += 4; +	src += 4; +	w -= 4; +    } + +    while (w) +    { +	*dst++ = (*src++) | 0xff000000; +	w--; +    } + +    return iter->buffer; +} + +static uint32_t * +sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) +{ +    int w = iter->width; +    uint32_t *dst = iter->buffer; +    uint16_t *src = (uint16_t *)iter->bits; +    __m128i ff000000 = mask_ff000000; + +    iter->bits += iter->stride; + +    while (w && ((uintptr_t)dst) & 0x0f) +    { +	uint16_t s = *src++; + +	*dst++ = convert_0565_to_8888 (s); +	w--; +    } + +    while (w >= 8) +    { +	__m128i lo, hi, s; + +	s = _mm_loadu_si128 ((__m128i *)src); + +	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ())); +	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ())); + +	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000)); +	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000)); + +	dst += 8; +	src += 8; +	w -= 8; +    } + +    while (w) +    { +	uint16_t s = *src++; + +	*dst++ = convert_0565_to_8888 (s); +	w--; +    } + +    return iter->buffer; +} + +static uint32_t * +sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) +{ +    int w = iter->width; +    uint32_t *dst = iter->buffer; +    uint8_t *src = iter->bits; +    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + +    iter->bits += iter->stride; + +    while (w && (((uintptr_t)dst) & 15)) +    { +        *dst++ = (uint32_t)(*(src++)) << 24; +        w--; +    } + +    while (w >= 16) +    { +	xmm0 = _mm_loadu_si128((__m128i *)src); + +	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0); +	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0); +	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1); +	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1); +	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2); +	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2); + +	_mm_store_si128(((__m128i *)(dst +  0)), xmm3); +	_mm_store_si128(((__m128i *)(dst +  4)), xmm4); +	_mm_store_si128(((__m128i *)(dst +  8)), xmm5); +	_mm_store_si128(((__m128i *)(dst + 12)), xmm6); + +	dst += 16; +	src += 16; +	w -= 16; +    } + +    while (w) +    { +	*dst++ = (uint32_t)(*(src++)) << 24; +	w--; +    } + +    return iter->buffer; +} + +#define IMAGE_FLAGS							\ +    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\ +     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) + +static const pixman_iter_info_t sse2_iters[] =  +{ +    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, +      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL +    }, +    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW, +      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL +    }, +    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW, +      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL +    }, +    { PIXMAN_null }, +}; + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +pixman_implementation_t * +_pixman_implementation_create_sse2 
(pixman_implementation_t *fallback) +{ +    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths); + +    /* SSE2 constants */ +    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000); +    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); +    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); +    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f); +    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000); +    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); +    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8); +    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); +    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000); +    mask_0080 = create_mask_16_128 (0x0080); +    mask_00ff = create_mask_16_128 (0x00ff); +    mask_0101 = create_mask_16_128 (0x0101); +    mask_ffff = create_mask_16_128 (0xffff); +    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); +    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); +    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8); +    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004); + +    /* Set up function pointers */ +    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; +    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; +    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; +    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; +    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; +    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; +    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; +    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; +    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; +    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; + +    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; + +    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; +    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; +    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; +    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; +    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; +    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; +    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; + +    imp->blt = sse2_blt; +    imp->fill = sse2_fill; + +    imp->iter_info = sse2_iters; + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-ssse3.c b/libs/pixman-0.40.0/pixman/pixman-ssse3.c new file mode 100644 index 0000000..680d6b9 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-ssse3.c @@ -0,0 +1,351 @@ +/* + * Copyright © 2013 Soren Sandmann Pedersen + * Copyright © 2013 Red Hat, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Soren Sandmann (soren.sandmann@gmail.com) + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdlib.h> +#include <mmintrin.h> +#include <xmmintrin.h> +#include <emmintrin.h> +#include <tmmintrin.h> +#include "pixman-private.h" +#include "pixman-inlines.h" + +typedef struct +{ +    int		y; +    uint64_t *	buffer; +} line_t; + +typedef struct +{ +    line_t		lines[2]; +    pixman_fixed_t	y; +    pixman_fixed_t	x; +    uint64_t		data[1]; +} bilinear_info_t; + +static void +ssse3_fetch_horizontal (bits_image_t *image, line_t *line, +			int y, pixman_fixed_t x, pixman_fixed_t ux, int n) +{ +    uint32_t *bits = image->bits + y * image->rowstride; +    __m128i vx = _mm_set_epi16 ( +	- (x + 1), x, - (x + 1), x, +	- (x + ux + 1), x + ux,  - (x + ux + 1), x + ux); +    __m128i vux = _mm_set_epi16 ( +	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux, +	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux); +    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0); +    __m128i *b = (__m128i *)line->buffer; +    __m128i vrl0, vrl1; + +    while ((n -= 2) >= 0) +    { +	__m128i vw, vr, s; + +	vrl1 = _mm_loadl_epi64 ( +	    (__m128i *)(bits + pixman_fixed_to_int (x + ux))); +	/* vrl1: R1, L1 */ + +    final_pixel: +	vrl0 = _mm_loadl_epi64 ( +	    (__m128i *)(bits + pixman_fixed_to_int (x))); +	/* vrl0: R0, L0 */ + +	/* The weights are based on vx which is a vector of  +	 * +	 *    - (x + 1), x, - (x + 1), x, +	 *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux +	 * +	 * so the 16 bit weights end up like this: +	 * +	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1 +	 * +	 * and after shifting and packing, we get these bytes: +	 * +	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1, +	 *        iw0, w0, iw0, w0, iw1, w1, iw1, w1, +	 * +	 * which means the first and the second input pixel  +	 * have to be interleaved like this: +	 * +	 *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, +	 *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 +	 * +	 * before maddubsw can be used. 
+	 */ + +	vw = _mm_add_epi16 ( +	    vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS)); +	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1 +	 */ + +	vw = _mm_packus_epi16 (vw, vw); +	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1, +	 *         iw0, w0, iw0, w0, iw1, w1, iw1, w1 +	 */ +	vx = _mm_add_epi16 (vx, vux); + +	x += 2 * ux; + +	vr = _mm_unpacklo_epi16 (vrl1, vrl0); +	/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */ + +	s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2)); +	/* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */ + +	vr = _mm_unpackhi_epi8 (vr, s); +	/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, +	 *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 +	 */ + +	vr = _mm_maddubs_epi16 (vr, vw); + +	/* When the weight is 0, the inverse weight is +	 * 128 which can't be represented in a signed byte. +	 * As a result maddubsw computes the following: +	 * +	 *     r = l * -128 + r * 0 +	 * +	 * rather than the desired +	 * +	 *     r = l * 128 + r * 0 +	 * +	 * We fix this by taking the absolute value of the +	 * result. +	 */ +	vr = _mm_abs_epi16 (vr); + +	/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */ +	_mm_store_si128 (b++, vr); +    } + +    if (n == -1) +    { +	vrl1 = _mm_setzero_si128(); +	goto final_pixel; +    } + +    line->y = y; +} + +static uint32_t * +ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask) +{ +    pixman_fixed_t fx, ux; +    bilinear_info_t *info = iter->data; +    line_t *line0, *line1; +    int y0, y1; +    int32_t dist_y; +    __m128i vw; +    int i; + +    fx = info->x; +    ux = iter->image->common.transform->matrix[0][0]; + +    y0 = pixman_fixed_to_int (info->y); +    y1 = y0 + 1; + +    line0 = &info->lines[y0 & 0x01]; +    line1 = &info->lines[y1 & 0x01]; + +    if (line0->y != y0) +    { +	ssse3_fetch_horizontal ( +	    &iter->image->bits, line0, y0, fx, ux, iter->width); +    } + +    if (line1->y != y1) +    { +	ssse3_fetch_horizontal ( +	    &iter->image->bits, line1, y1, fx, ux, iter->width); +    } + +    dist_y = pixman_fixed_to_bilinear_weight (info->y); +    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS); + +    vw = _mm_set_epi16 ( +	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y); + +    for (i = 0; i + 3 < iter->width; i += 4) +    { +	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); +	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); +	__m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2)); +	__m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2)); +	__m128i r0, r1, tmp, p; + +	r0 = _mm_mulhi_epu16 ( +	    _mm_sub_epi16 (bot0, top0), vw); +	tmp = _mm_cmplt_epi16 (bot0, top0); +	tmp = _mm_and_si128 (tmp, vw); +	r0 = _mm_sub_epi16 (r0, tmp); +	r0 = _mm_add_epi16 (r0, top0); +	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); +	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */ +	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); +	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */ + +	r1 = _mm_mulhi_epu16 ( +	    _mm_sub_epi16 (bot1, top1), vw); +	tmp = _mm_cmplt_epi16 (bot1, top1); +	tmp = _mm_and_si128 (tmp, vw); +	r1 = _mm_sub_epi16 (r1, tmp); +	r1 = _mm_add_epi16 (r1, top1); +	r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS); +	r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1)); +	/* r1: A3 R3 G3 B3 A2 R2 G2 B2 */ + +	p = _mm_packus_epi16 (r0, r1); + +	_mm_storeu_si128 ((__m128i *)(iter->buffer + i), p); +    } + +    while (i < iter->width) +    { +	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); +	__m128i bot0 = _mm_load_si128 
((__m128i *)(line1->buffer + i)); +	__m128i r0, tmp, p; + +	r0 = _mm_mulhi_epu16 ( +	    _mm_sub_epi16 (bot0, top0), vw); +	tmp = _mm_cmplt_epi16 (bot0, top0); +	tmp = _mm_and_si128 (tmp, vw); +	r0 = _mm_sub_epi16 (r0, tmp); +	r0 = _mm_add_epi16 (r0, top0); +	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); +	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */ +	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); +	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */ + +	p = _mm_packus_epi16 (r0, r0); + +	if (iter->width - i == 1) +	{ +	    *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p); +	    i++; +	} +	else +	{ +	    _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p); +	    i += 2; +	} +    } +     +    info->y += iter->image->common.transform->matrix[1][1]; + +    return iter->buffer; +} + +static void +ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter) +{ +    free (iter->data); +} + +static void +ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info) +{ +    int width = iter->width; +    bilinear_info_t *info; +    pixman_vector_t v; + +    /* Reference point is the center of the pixel */ +    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2; +    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2; +    v.vector[2] = pixman_fixed_1; + +    if (!pixman_transform_point_3d (iter->image->common.transform, &v)) +	goto fail; + +    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64); +    if (!info) +	goto fail; + +    info->x = v.vector[0] - pixman_fixed_1 / 2; +    info->y = v.vector[1] - pixman_fixed_1 / 2; + +#define ALIGN(addr)							\ +    ((void *)((((uintptr_t)(addr)) + 15) & (~15))) + +    /* It is safe to set the y coordinates to -1 initially +     * because COVER_CLIP_BILINEAR ensures that we will only +     * be asked to fetch lines in the [0, height) interval +     */ +    info->lines[0].y = -1; +    info->lines[0].buffer = ALIGN (&(info->data[0])); +    info->lines[1].y = -1; +    info->lines[1].buffer = ALIGN (info->lines[0].buffer + width); + +    iter->get_scanline = ssse3_fetch_bilinear_cover; +    iter->fini = ssse3_bilinear_cover_iter_fini; + +    iter->data = info; +    return; + +fail: +    /* Something went wrong, either a bad matrix or OOM; in such cases, +     * we don't guarantee any particular rendering. +     */ +    _pixman_log_error ( +	FUNC, "Allocation failure or bad matrix, skipping rendering\n"); +     +    iter->get_scanline = _pixman_iter_get_scanline_noop; +    iter->fini = NULL; +} + +static const pixman_iter_info_t ssse3_iters[] =  +{ +    { PIXMAN_a8r8g8b8, +      (FAST_PATH_STANDARD_FLAGS			| +       FAST_PATH_SCALE_TRANSFORM		| +       FAST_PATH_BILINEAR_FILTER		| +       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR), +      ITER_NARROW | ITER_SRC, +      ssse3_bilinear_cover_iter_init, +      NULL, NULL +    }, + +    { PIXMAN_null }, +}; + +static const pixman_fast_path_t ssse3_fast_paths[] = +{ +    { PIXMAN_OP_NONE }, +}; + +pixman_implementation_t * +_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback) +{ +    pixman_implementation_t *imp = +	_pixman_implementation_create (fallback, ssse3_fast_paths); + +    imp->iter_info = ssse3_iters; + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-timer.c b/libs/pixman-0.40.0/pixman/pixman-timer.c new file mode 100644 index 0000000..f5ae18e --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-timer.c @@ -0,0 +1,66 @@ +/* + * Copyright © 2007 Red Hat, Inc. 
+ * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Red Hat not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  Red Hat makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdlib.h> +#include <stdio.h> +#include "pixman-private.h" + +#ifdef PIXMAN_TIMERS + +static pixman_timer_t *timers; + +static void +dump_timers (void) +{ +    pixman_timer_t *timer; + +    for (timer = timers; timer != NULL; timer = timer->next) +    { +	printf ("%s:   total: %llu     n: %llu      avg: %f\n", +	        timer->name, +	        timer->total, +	        timer->n_times, +	        timer->total / (double)timer->n_times); +    } +} + +void +pixman_timer_register (pixman_timer_t *timer) +{ +    static int initialized; + +    int atexit (void (*function)(void)); + +    if (!initialized) +    { +	atexit (dump_timers); +	initialized = 1; +    } + +    timer->next = timers; +    timers = timer; +} + +#endif diff --git a/libs/pixman-0.40.0/pixman/pixman-trap.c b/libs/pixman-0.40.0/pixman/pixman-trap.c new file mode 100644 index 0000000..91766fd --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-trap.c @@ -0,0 +1,711 @@ +/* + * Copyright © 2002 Keith Packard, member of The XFree86 Project, Inc. + * Copyright © 2004 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include "pixman-private.h" + +/* + * Compute the smallest value greater than or equal to y which is on a + * grid row. + */ + +PIXMAN_EXPORT pixman_fixed_t +pixman_sample_ceil_y (pixman_fixed_t y, int n) +{ +    pixman_fixed_t f = pixman_fixed_frac (y); +    pixman_fixed_t i = pixman_fixed_floor (y); + +    f = DIV (f - Y_FRAC_FIRST (n) + (STEP_Y_SMALL (n) - pixman_fixed_e), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) + +	Y_FRAC_FIRST (n); +     +    if (f > Y_FRAC_LAST (n)) +    { +	if (pixman_fixed_to_int (i) == 0x7fff) +	{ +	    f = 0xffff; /* saturate */ +	} +	else +	{ +	    f = Y_FRAC_FIRST (n); +	    i += pixman_fixed_1; +	} +    } +    return (i | f); +} + +/* + * Compute the largest value strictly less than y which is on a + * grid row. + */ +PIXMAN_EXPORT pixman_fixed_t +pixman_sample_floor_y (pixman_fixed_t y, +                       int            n) +{ +    pixman_fixed_t f = pixman_fixed_frac (y); +    pixman_fixed_t i = pixman_fixed_floor (y); + +    f = DIV (f - pixman_fixed_e - Y_FRAC_FIRST (n), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) + +	Y_FRAC_FIRST (n); + +    if (f < Y_FRAC_FIRST (n)) +    { +	if (pixman_fixed_to_int (i) == 0x8000) +	{ +	    f = 0; /* saturate */ +	} +	else +	{ +	    f = Y_FRAC_LAST (n); +	    i -= pixman_fixed_1; +	} +    } +    return (i | f); +} + +/* + * Step an edge by any amount (including negative values) + */ +PIXMAN_EXPORT void +pixman_edge_step (pixman_edge_t *e, +                  int            n) +{ +    pixman_fixed_48_16_t ne; + +    e->x += n * e->stepx; + +    ne = e->e + n * (pixman_fixed_48_16_t) e->dx; + +    if (n >= 0) +    { +	if (ne > 0) +	{ +	    int nx = (ne + e->dy - 1) / e->dy; +	    e->e = ne - nx * (pixman_fixed_48_16_t) e->dy; +	    e->x += nx * e->signdx; +	} +    } +    else +    { +	if (ne <= -e->dy) +	{ +	    int nx = (-ne) / e->dy; +	    e->e = ne + nx * (pixman_fixed_48_16_t) e->dy; +	    e->x -= nx * e->signdx; +	} +    } +} + +/* + * A private routine to initialize the multi-step + * elements of an edge structure + */ +static void +_pixman_edge_multi_init (pixman_edge_t * e, +                         int             n, +                         pixman_fixed_t *stepx_p, +                         pixman_fixed_t *dx_p) +{ +    pixman_fixed_t stepx; +    pixman_fixed_48_16_t ne; + +    ne = n * (pixman_fixed_48_16_t) e->dx; +    stepx = n * e->stepx; + +    if (ne > 0) +    { +	int nx = ne / e->dy; +	ne -= nx * (pixman_fixed_48_16_t)e->dy; +	stepx += nx * e->signdx; +    } + +    *dx_p = ne; +    *stepx_p = stepx; +} + +/* + * Initialize one edge structure given the line endpoints and a + * starting y value + */ +PIXMAN_EXPORT void +pixman_edge_init (pixman_edge_t *e, +                  int            n, +                  pixman_fixed_t y_start, +                  pixman_fixed_t x_top, +                  pixman_fixed_t y_top, +                  pixman_fixed_t x_bot, +                  pixman_fixed_t y_bot) +{ +    pixman_fixed_t dx, dy; + +    e->x = x_top; +    e->e = 0; +    dx = x_bot - x_top; +    dy = y_bot - y_top; +    e->dy = dy; +    e->dx = 0; + +    if (dy) +    { +	if (dx >= 0) +	{ +	    e->signdx = 1; +	    e->stepx = dx / dy; +	    e->dx = dx % dy; +	    e->e = -dy; +	} +	else +	{ +	    e->signdx = -1; +	    e->stepx = -(-dx / dy); +	    e->dx = -dx % dy; +	    e->e = 0; +	} + +	_pixman_edge_multi_init (e, STEP_Y_SMALL (n), +				 &e->stepx_small, &e->dx_small); + +	_pixman_edge_multi_init (e, STEP_Y_BIG (n), +				
 &e->stepx_big, &e->dx_big); +    } +    pixman_edge_step (e, y_start - y_top); +} + +/* + * Initialize one edge structure given a line, starting y value + * and a pixel offset for the line + */ +PIXMAN_EXPORT void +pixman_line_fixed_edge_init (pixman_edge_t *            e, +                             int                        n, +                             pixman_fixed_t             y, +                             const pixman_line_fixed_t *line, +                             int                        x_off, +                             int                        y_off) +{ +    pixman_fixed_t x_off_fixed = pixman_int_to_fixed (x_off); +    pixman_fixed_t y_off_fixed = pixman_int_to_fixed (y_off); +    const pixman_point_fixed_t *top, *bot; + +    if (line->p1.y <= line->p2.y) +    { +	top = &line->p1; +	bot = &line->p2; +    } +    else +    { +	top = &line->p2; +	bot = &line->p1; +    } +     +    pixman_edge_init (e, n, y, +                      top->x + x_off_fixed, +                      top->y + y_off_fixed, +                      bot->x + x_off_fixed, +                      bot->y + y_off_fixed); +} + +PIXMAN_EXPORT void +pixman_add_traps (pixman_image_t *     image, +                  int16_t              x_off, +                  int16_t              y_off, +                  int                  ntrap, +                  const pixman_trap_t *traps) +{ +    int bpp; +    int height; + +    pixman_fixed_t x_off_fixed; +    pixman_fixed_t y_off_fixed; +    pixman_edge_t l, r; +    pixman_fixed_t t, b; + +    _pixman_image_validate (image); +     +    height = image->bits.height; +    bpp = PIXMAN_FORMAT_BPP (image->bits.format); + +    x_off_fixed = pixman_int_to_fixed (x_off); +    y_off_fixed = pixman_int_to_fixed (y_off); + +    while (ntrap--) +    { +	t = traps->top.y + y_off_fixed; +	if (t < 0) +	    t = 0; +	t = pixman_sample_ceil_y (t, bpp); + +	b = traps->bot.y + y_off_fixed; +	if (pixman_fixed_to_int (b) >= height) +	    b = pixman_int_to_fixed (height) - 1; +	b = pixman_sample_floor_y (b, bpp); + +	if (b >= t) +	{ +	    /* initialize edge walkers */ +	    pixman_edge_init (&l, bpp, t, +	                      traps->top.l + x_off_fixed, +	                      traps->top.y + y_off_fixed, +	                      traps->bot.l + x_off_fixed, +	                      traps->bot.y + y_off_fixed); + +	    pixman_edge_init (&r, bpp, t, +	                      traps->top.r + x_off_fixed, +	                      traps->top.y + y_off_fixed, +	                      traps->bot.r + x_off_fixed, +	                      traps->bot.y + y_off_fixed); + +	    pixman_rasterize_edges (image, &l, &r, t, b); +	} + +	traps++; +    } +} + +#if 0 +static void +dump_image (pixman_image_t *image, +            const char *    title) +{ +    int i, j; + +    if (!image->type == BITS) +	printf ("%s is not a regular image\n", title); + +    if (!image->bits.format == PIXMAN_a8) +	printf ("%s is not an alpha mask\n", title); + +    printf ("\n\n\n%s: \n", title); + +    for (i = 0; i < image->bits.height; ++i) +    { +	uint8_t *line = +	    (uint8_t *)&(image->bits.bits[i * image->bits.rowstride]); + +	for (j = 0; j < image->bits.width; ++j) +	    printf ("%c", line[j] ? 
'#' : ' '); + +	printf ("\n"); +    } +} +#endif + +PIXMAN_EXPORT void +pixman_add_trapezoids (pixman_image_t *          image, +                       int16_t                   x_off, +                       int                       y_off, +                       int                       ntraps, +                       const pixman_trapezoid_t *traps) +{ +    int i; + +#if 0 +    dump_image (image, "before"); +#endif + +    for (i = 0; i < ntraps; ++i) +    { +	const pixman_trapezoid_t *trap = &(traps[i]); + +	if (!pixman_trapezoid_valid (trap)) +	    continue; + +	pixman_rasterize_trapezoid (image, trap, x_off, y_off); +    } + +#if 0 +    dump_image (image, "after"); +#endif +} + +PIXMAN_EXPORT void +pixman_rasterize_trapezoid (pixman_image_t *          image, +                            const pixman_trapezoid_t *trap, +                            int                       x_off, +                            int                       y_off) +{ +    int bpp; +    int height; + +    pixman_fixed_t y_off_fixed; +    pixman_edge_t l, r; +    pixman_fixed_t t, b; + +    return_if_fail (image->type == BITS); + +    _pixman_image_validate (image); +     +    if (!pixman_trapezoid_valid (trap)) +	return; + +    height = image->bits.height; +    bpp = PIXMAN_FORMAT_BPP (image->bits.format); + +    y_off_fixed = pixman_int_to_fixed (y_off); + +    t = trap->top + y_off_fixed; +    if (t < 0) +	t = 0; +    t = pixman_sample_ceil_y (t, bpp); + +    b = trap->bottom + y_off_fixed; +    if (pixman_fixed_to_int (b) >= height) +	b = pixman_int_to_fixed (height) - 1; +    b = pixman_sample_floor_y (b, bpp); +     +    if (b >= t) +    { +	/* initialize edge walkers */ +	pixman_line_fixed_edge_init (&l, bpp, t, &trap->left, x_off, y_off); +	pixman_line_fixed_edge_init (&r, bpp, t, &trap->right, x_off, y_off); + +	pixman_rasterize_edges (image, &l, &r, t, b); +    } +} + +static const pixman_bool_t zero_src_has_no_effect[PIXMAN_N_OPERATORS] = +{ +    FALSE,	/* Clear		0			0    */ +    FALSE,	/* Src			1			0    */ +    TRUE,	/* Dst			0			1    */ +    TRUE,	/* Over			1			1-Aa */ +    TRUE,	/* OverReverse		1-Ab			1    */ +    FALSE,	/* In			Ab			0    */ +    FALSE,	/* InReverse		0			Aa   */ +    FALSE,	/* Out			1-Ab			0    */ +    TRUE,	/* OutReverse		0			1-Aa */ +    TRUE,	/* Atop			Ab			1-Aa */ +    FALSE,	/* AtopReverse		1-Ab			Aa   */ +    TRUE,	/* Xor			1-Ab			1-Aa */ +    TRUE,	/* Add			1			1    */ +}; + +static pixman_bool_t +get_trap_extents (pixman_op_t op, pixman_image_t *dest, +		  const pixman_trapezoid_t *traps, int n_traps, +		  pixman_box32_t *box) +{ +    int i; + +    /* When the operator is such that a zero source has an +     * effect on the underlying image, we have to +     * composite across the entire destination +     */ +    if (!zero_src_has_no_effect [op]) +    { +	box->x1 = 0; +	box->y1 = 0; +	box->x2 = dest->bits.width; +	box->y2 = dest->bits.height; +	return TRUE; +    } +     +    box->x1 = INT32_MAX; +    box->y1 = INT32_MAX; +    box->x2 = INT32_MIN; +    box->y2 = INT32_MIN; +	 +    for (i = 0; i < n_traps; ++i) +    { +	const pixman_trapezoid_t *trap = &(traps[i]); +	int y1, y2; +	     +	if (!pixman_trapezoid_valid (trap)) +	    continue; +	     +	y1 = pixman_fixed_to_int (trap->top); +	if (y1 < box->y1) +	    box->y1 = y1; +	     +	y2 = pixman_fixed_to_int (pixman_fixed_ceil (trap->bottom)); +	if (y2 > box->y2) +	    box->y2 = y2; +	     +#define EXTEND_MIN(x)							\ +	if (pixman_fixed_to_int ((x)) < box->x1)			\ +	    box->x1 = pixman_fixed_to_int ((x)); +#define 
EXTEND_MAX(x)							\ +	if (pixman_fixed_to_int (pixman_fixed_ceil ((x))) > box->x2)	\ +	    box->x2 = pixman_fixed_to_int (pixman_fixed_ceil ((x))); +	     +#define EXTEND(x)							\ +	EXTEND_MIN(x);							\ +	EXTEND_MAX(x); +	     +	EXTEND(trap->left.p1.x); +	EXTEND(trap->left.p2.x); +	EXTEND(trap->right.p1.x); +	EXTEND(trap->right.p2.x); +    } +	 +    if (box->x1 >= box->x2 || box->y1 >= box->y2) +	return FALSE; + +    return TRUE; +} + +/* + * pixman_composite_trapezoids() + * + * All the trapezoids are conceptually rendered to an infinitely big image. + * The (0, 0) coordinates of this image are then aligned with the (x, y) + * coordinates of the source image, and then both images are aligned with + * the (x, y) coordinates of the destination. Then these three images are + * composited across the entire destination. + */ +PIXMAN_EXPORT void +pixman_composite_trapezoids (pixman_op_t		op, +			     pixman_image_t *		src, +			     pixman_image_t *		dst, +			     pixman_format_code_t	mask_format, +			     int			x_src, +			     int			y_src, +			     int			x_dst, +			     int			y_dst, +			     int			n_traps, +			     const pixman_trapezoid_t *	traps) +{ +    int i; + +    return_if_fail (PIXMAN_FORMAT_TYPE (mask_format) == PIXMAN_TYPE_A); +     +    if (n_traps <= 0) +	return; + +    _pixman_image_validate (src); +    _pixman_image_validate (dst); + +    if (op == PIXMAN_OP_ADD && +	(src->common.flags & FAST_PATH_IS_OPAQUE)		&& +	(mask_format == dst->common.extended_format_code)	&& +	!(dst->common.have_clip_region)) +    { +	for (i = 0; i < n_traps; ++i) +	{ +	    const pixman_trapezoid_t *trap = &(traps[i]); +	     +	    if (!pixman_trapezoid_valid (trap)) +		continue; +	     +	    pixman_rasterize_trapezoid (dst, trap, x_dst, y_dst); +	} +    } +    else +    { +	pixman_image_t *tmp; +	pixman_box32_t box; +	int i; + +	if (!get_trap_extents (op, dst, traps, n_traps, &box)) +	    return; +	 +	if (!(tmp = pixman_image_create_bits ( +		  mask_format, box.x2 - box.x1, box.y2 - box.y1, NULL, -1))) +	    return; +	 +	for (i = 0; i < n_traps; ++i) +	{ +	    const pixman_trapezoid_t *trap = &(traps[i]); +	     +	    if (!pixman_trapezoid_valid (trap)) +		continue; +	     +	    pixman_rasterize_trapezoid (tmp, trap, - box.x1, - box.y1); +	} +	 +	pixman_image_composite (op, src, tmp, dst, +				x_src + box.x1, y_src + box.y1, +				0, 0, +				x_dst + box.x1, y_dst + box.y1, +				box.x2 - box.x1, box.y2 - box.y1); +	 +	pixman_image_unref (tmp); +    } +} + +static int +greater_y (const pixman_point_fixed_t *a, const pixman_point_fixed_t *b) +{ +    if (a->y == b->y) +	return a->x > b->x; +    return a->y > b->y; +} + +/* + * Note that the definition of this function is a bit odd because + * of the X coordinate space (y increasing downwards). 
+ */ +static int +clockwise (const pixman_point_fixed_t *ref, +	   const pixman_point_fixed_t *a, +	   const pixman_point_fixed_t *b) +{ +    pixman_point_fixed_t	ad, bd; + +    ad.x = a->x - ref->x; +    ad.y = a->y - ref->y; +    bd.x = b->x - ref->x; +    bd.y = b->y - ref->y; + +    return ((pixman_fixed_32_32_t) bd.y * ad.x - +	    (pixman_fixed_32_32_t) ad.y * bd.x) < 0; +} + +static void +triangle_to_trapezoids (const pixman_triangle_t *tri, pixman_trapezoid_t *traps) +{ +    const pixman_point_fixed_t *top, *left, *right, *tmp; + +    top = &tri->p1; +    left = &tri->p2; +    right = &tri->p3; + +    if (greater_y (top, left)) +    { +	tmp = left; +	left = top; +	top = tmp; +    } + +    if (greater_y (top, right)) +    { +	tmp = right; +	right = top; +	top = tmp; +    } + +    if (clockwise (top, right, left)) +    { +	tmp = right; +	right = left; +	left = tmp; +    } +     +    /* +     * Two cases: +     * +     *		+		+ +     *	       / \             / \ +     *	      /   \           /	  \ +     *	     /     +         +	   \ +     *      /    --           --    \ +     *     /   --               --   \ +     *    / ---                   --- \ +     *	 +--                         --+ +     */ + +    traps->top = top->y; +    traps->left.p1 = *top; +    traps->left.p2 = *left; +    traps->right.p1 = *top; +    traps->right.p2 = *right; + +    if (right->y < left->y) +	traps->bottom = right->y; +    else +	traps->bottom = left->y; + +    traps++; + +    *traps = *(traps - 1); +     +    if (right->y < left->y) +    { +	traps->top = right->y; +	traps->bottom = left->y; +	traps->right.p1 = *right; +	traps->right.p2 = *left; +    } +    else +    { +	traps->top = left->y; +	traps->bottom = right->y; +	traps->left.p1 = *left; +	traps->left.p2 = *right; +    } +} + +static pixman_trapezoid_t * +convert_triangles (int n_tris, const pixman_triangle_t *tris) +{ +    pixman_trapezoid_t *traps; +    int i; + +    if (n_tris <= 0) +	return NULL; +     +    traps = pixman_malloc_ab (n_tris, 2 * sizeof (pixman_trapezoid_t)); +    if (!traps) +	return NULL; + +    for (i = 0; i < n_tris; ++i) +	triangle_to_trapezoids (&(tris[i]), traps + 2 * i); + +    return traps; +} + +PIXMAN_EXPORT void +pixman_composite_triangles (pixman_op_t			op, +			    pixman_image_t *		src, +			    pixman_image_t *		dst, +			    pixman_format_code_t	mask_format, +			    int				x_src, +			    int				y_src, +			    int				x_dst, +			    int				y_dst, +			    int				n_tris, +			    const pixman_triangle_t *	tris) +{ +    pixman_trapezoid_t *traps; + +    if ((traps = convert_triangles (n_tris, tris))) +    { +	pixman_composite_trapezoids (op, src, dst, mask_format, +				     x_src, y_src, x_dst, y_dst, +				     n_tris * 2, traps); +	 +	free (traps); +    } +} + +PIXMAN_EXPORT void +pixman_add_triangles (pixman_image_t          *image, +		      int32_t	               x_off, +		      int32_t	               y_off, +		      int	               n_tris, +		      const pixman_triangle_t *tris) +{ +    pixman_trapezoid_t *traps; + +    if ((traps = convert_triangles (n_tris, tris))) +    { +	pixman_add_trapezoids (image, x_off, y_off, +			       n_tris * 2, traps); + +	free (traps); +    } +} diff --git a/libs/pixman-0.40.0/pixman/pixman-utils.c b/libs/pixman-0.40.0/pixman/pixman-utils.c new file mode 100644 index 0000000..2c2dddd --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-utils.c @@ -0,0 +1,330 @@ +/* + * Copyright © 2000 SuSE, Inc. 
+ * Copyright © 1999 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  SuSE makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author:  Keith Packard, SuSE, Inc. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include <stdio.h> +#include <stdlib.h> + +#include "pixman-private.h" + +pixman_bool_t +_pixman_multiply_overflows_size (size_t a, size_t b) +{ +    return a >= SIZE_MAX / b; +} + +pixman_bool_t +_pixman_multiply_overflows_int (unsigned int a, unsigned int b) +{ +    return a >= INT32_MAX / b; +} + +pixman_bool_t +_pixman_addition_overflows_int (unsigned int a, unsigned int b) +{ +    return a > INT32_MAX - b; +} + +void * +pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c) +{ +    if (!b || a >= INT32_MAX / b || (a * b) > INT32_MAX - c) +	return NULL; + +    return malloc (a * b + c); +} + +void * +pixman_malloc_ab (unsigned int a, +                  unsigned int b) +{ +    if (a >= INT32_MAX / b) +	return NULL; + +    return malloc (a * b); +} + +void * +pixman_malloc_abc (unsigned int a, +                   unsigned int b, +                   unsigned int c) +{ +    if (a >= INT32_MAX / b) +	return NULL; +    else if (a * b >= INT32_MAX / c) +	return NULL; +    else +	return malloc (a * b * c); +} + +static force_inline uint16_t +float_to_unorm (float f, int n_bits) +{ +    uint32_t u; + +    if (f > 1.0) +	f = 1.0; +    if (f < 0.0) +	f = 0.0; + +    u = f * (1 << n_bits); +    u -= (u >> n_bits); + +    return u; +} + +static force_inline float +unorm_to_float (uint16_t u, int n_bits) +{ +    uint32_t m = ((1 << n_bits) - 1); + +    return (u & m) * (1.f / (float)m); +} + +/* + * This function expands images from a8r8g8b8 to argb_t.  To preserve + * precision, it needs to know from which source format the a8r8g8b8 pixels + * originally came. + * + * For example, if the source was PIXMAN_x1r5g5b5 and the red component + * contained bits 12345, then the 8-bit value is 12345123.  To correctly + * expand this to floating point, it should be 12345 / 31.0 and not + * 12345123 / 255.0. 
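+ *
+ * As a concrete instance of the schematic above, take the 5-bit red
+ * value 10101 (21): replicated to 8 bits it becomes 10101101 (173),
+ * and the correct float value is 21 / 31.0 (about 0.677) rather than
+ * 173 / 255.0 (about 0.678).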
+ */ +void +pixman_expand_to_float (argb_t               *dst, +			const uint32_t       *src, +			pixman_format_code_t  format, +			int                   width) +{ +    static const float multipliers[16] = { +	0.0f, +	1.0f / ((1 <<  1) - 1), +	1.0f / ((1 <<  2) - 1), +	1.0f / ((1 <<  3) - 1), +	1.0f / ((1 <<  4) - 1), +	1.0f / ((1 <<  5) - 1), +	1.0f / ((1 <<  6) - 1), +	1.0f / ((1 <<  7) - 1), +	1.0f / ((1 <<  8) - 1), +	1.0f / ((1 <<  9) - 1), +	1.0f / ((1 << 10) - 1), +	1.0f / ((1 << 11) - 1), +	1.0f / ((1 << 12) - 1), +	1.0f / ((1 << 13) - 1), +	1.0f / ((1 << 14) - 1), +	1.0f / ((1 << 15) - 1), +    }; +    int a_size, r_size, g_size, b_size; +    int a_shift, r_shift, g_shift, b_shift; +    float a_mul, r_mul, g_mul, b_mul; +    uint32_t a_mask, r_mask, g_mask, b_mask; +    int i; + +    if (!PIXMAN_FORMAT_VIS (format)) +	format = PIXMAN_a8r8g8b8; + +    /* +     * Determine the sizes of each component and the masks and shifts +     * required to extract them from the source pixel. +     */ +    a_size = PIXMAN_FORMAT_A (format); +    r_size = PIXMAN_FORMAT_R (format); +    g_size = PIXMAN_FORMAT_G (format); +    b_size = PIXMAN_FORMAT_B (format); + +    a_shift = 32 - a_size; +    r_shift = 24 - r_size; +    g_shift = 16 - g_size; +    b_shift =  8 - b_size; + +    a_mask = ((1 << a_size) - 1); +    r_mask = ((1 << r_size) - 1); +    g_mask = ((1 << g_size) - 1); +    b_mask = ((1 << b_size) - 1); + +    a_mul = multipliers[a_size]; +    r_mul = multipliers[r_size]; +    g_mul = multipliers[g_size]; +    b_mul = multipliers[b_size]; + +    /* Start at the end so that we can do the expansion in place +     * when src == dst +     */ +    for (i = width - 1; i >= 0; i--) +    { +	const uint32_t pixel = src[i]; + +	dst[i].a = a_mask? ((pixel >> a_shift) & a_mask) * a_mul : 1.0f; +	dst[i].r = ((pixel >> r_shift) & r_mask) * r_mul; +	dst[i].g = ((pixel >> g_shift) & g_mask) * g_mul; +	dst[i].b = ((pixel >> b_shift) & b_mask) * b_mul; +    } +} + +uint16_t +pixman_float_to_unorm (float f, int n_bits) +{ +    return float_to_unorm (f, n_bits); +} + +float +pixman_unorm_to_float (uint16_t u, int n_bits) +{ +    return unorm_to_float (u, n_bits); +} + +void +pixman_contract_from_float (uint32_t     *dst, +			    const argb_t *src, +			    int           width) +{ +    int i; + +    for (i = 0; i < width; ++i) +    { +	uint32_t a, r, g, b; + +	a = float_to_unorm (src[i].a, 8); +	r = float_to_unorm (src[i].r, 8); +	g = float_to_unorm (src[i].g, 8); +	b = float_to_unorm (src[i].b, 8); + +	dst[i] = (a << 24) | (r << 16) | (g << 8) | (b << 0); +    } +} + +uint32_t * +_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask) +{ +    return iter->buffer; +} + +void +_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info) +{ +    pixman_image_t *image = iter->image; +    uint8_t *b = (uint8_t *)image->bits.bits; +    int s = image->bits.rowstride * 4; + +    iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (info->format) / 8; +    iter->stride = s; +} + +#define N_TMP_BOXES (16) + +pixman_bool_t +pixman_region16_copy_from_region32 (pixman_region16_t *dst, +                                    pixman_region32_t *src) +{ +    int n_boxes, i; +    pixman_box32_t *boxes32; +    pixman_box16_t *boxes16; +    pixman_bool_t retval; + +    boxes32 = pixman_region32_rectangles (src, &n_boxes); + +    boxes16 = pixman_malloc_ab (n_boxes, sizeof (pixman_box16_t)); + +    if (!boxes16) +	return FALSE; + +    for (i = 0; i < n_boxes; ++i) +    { +	
boxes16[i].x1 = boxes32[i].x1; +	boxes16[i].y1 = boxes32[i].y1; +	boxes16[i].x2 = boxes32[i].x2; +	boxes16[i].y2 = boxes32[i].y2; +    } + +    pixman_region_fini (dst); +    retval = pixman_region_init_rects (dst, boxes16, n_boxes); +    free (boxes16); +    return retval; +} + +pixman_bool_t +pixman_region32_copy_from_region16 (pixman_region32_t *dst, +                                    pixman_region16_t *src) +{ +    int n_boxes, i; +    pixman_box16_t *boxes16; +    pixman_box32_t *boxes32; +    pixman_box32_t tmp_boxes[N_TMP_BOXES]; +    pixman_bool_t retval; + +    boxes16 = pixman_region_rectangles (src, &n_boxes); + +    if (n_boxes > N_TMP_BOXES) +	boxes32 = pixman_malloc_ab (n_boxes, sizeof (pixman_box32_t)); +    else +	boxes32 = tmp_boxes; + +    if (!boxes32) +	return FALSE; + +    for (i = 0; i < n_boxes; ++i) +    { +	boxes32[i].x1 = boxes16[i].x1; +	boxes32[i].y1 = boxes16[i].y1; +	boxes32[i].x2 = boxes16[i].x2; +	boxes32[i].y2 = boxes16[i].y2; +    } + +    pixman_region32_fini (dst); +    retval = pixman_region32_init_rects (dst, boxes32, n_boxes); + +    if (boxes32 != tmp_boxes) +	free (boxes32); + +    return retval; +} + +/* This function is exported for the sake of the test suite and not part + * of the ABI. + */ +PIXMAN_EXPORT pixman_implementation_t * +_pixman_internal_only_get_implementation (void) +{ +    return get_implementation (); +} + +void +_pixman_log_error (const char *function, const char *message) +{ +    static int n_messages = 0; + +    if (n_messages < 10) +    { +	fprintf (stderr, +		 "*** BUG ***\n" +		 "In %s: %s\n" +		 "Set a breakpoint on '_pixman_log_error' to debug\n\n", +                 function, message); + +	n_messages++; +    } +} diff --git a/libs/pixman-0.40.0/pixman/pixman-version.h b/libs/pixman-0.40.0/pixman/pixman-version.h new file mode 100644 index 0000000..8b0e774 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-version.h @@ -0,0 +1,54 @@ +/* + * Copyright © 2008 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Author: Carl D. 
Worth <cworth@cworth.org> + */ + +#ifndef PIXMAN_VERSION_H__ +#define PIXMAN_VERSION_H__ + +#ifndef PIXMAN_H__ +#  error pixman-version.h should only be included by pixman.h +#endif + +#define PIXMAN_VERSION_MAJOR 0 +#define PIXMAN_VERSION_MINOR 40 +#define PIXMAN_VERSION_MICRO 0 + +#define PIXMAN_VERSION_STRING "0.40.0" + +#define PIXMAN_VERSION_ENCODE(major, minor, micro) (	\ +	  ((major) * 10000)				\ +	+ ((minor) *   100)				\ +	+ ((micro) *     1)) + +#define PIXMAN_VERSION PIXMAN_VERSION_ENCODE(	\ +	PIXMAN_VERSION_MAJOR,			\ +	PIXMAN_VERSION_MINOR,			\ +	PIXMAN_VERSION_MICRO) + +#ifndef PIXMAN_API +# define PIXMAN_API +#endif + +#endif /* PIXMAN_VERSION_H__ */ diff --git a/libs/pixman-0.40.0/pixman/pixman-version.h.in b/libs/pixman-0.40.0/pixman/pixman-version.h.in new file mode 100644 index 0000000..64778a5 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-version.h.in @@ -0,0 +1,54 @@ +/* + * Copyright © 2008 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Author: Carl D. 
Worth <cworth@cworth.org> + */ + +#ifndef PIXMAN_VERSION_H__ +#define PIXMAN_VERSION_H__ + +#ifndef PIXMAN_H__ +#  error pixman-version.h should only be included by pixman.h +#endif + +#define PIXMAN_VERSION_MAJOR @PIXMAN_VERSION_MAJOR@ +#define PIXMAN_VERSION_MINOR @PIXMAN_VERSION_MINOR@ +#define PIXMAN_VERSION_MICRO @PIXMAN_VERSION_MICRO@ + +#define PIXMAN_VERSION_STRING "@PIXMAN_VERSION_MAJOR@.@PIXMAN_VERSION_MINOR@.@PIXMAN_VERSION_MICRO@" + +#define PIXMAN_VERSION_ENCODE(major, minor, micro) (	\ +	  ((major) * 10000)				\ +	+ ((minor) *   100)				\ +	+ ((micro) *     1)) + +#define PIXMAN_VERSION PIXMAN_VERSION_ENCODE(	\ +	PIXMAN_VERSION_MAJOR,			\ +	PIXMAN_VERSION_MINOR,			\ +	PIXMAN_VERSION_MICRO) + +#ifndef PIXMAN_API +# define PIXMAN_API +#endif + +#endif /* PIXMAN_VERSION_H__ */ diff --git a/libs/pixman-0.40.0/pixman/pixman-vmx.c b/libs/pixman-0.40.0/pixman/pixman-vmx.c new file mode 100644 index 0000000..52de37e --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-vmx.c @@ -0,0 +1,3159 @@ +/* + * Copyright © 2007 Luca Barbato + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Luca Barbato not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  Luca Barbato makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author:  Luca Barbato (lu_zero@gentoo.org) + * + * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "pixman-private.h" +#include "pixman-combine32.h" +#include "pixman-inlines.h" +#include <altivec.h> + +#define AVV(x...) 
{x} + +static vector unsigned int mask_ff000000; +static vector unsigned int mask_red; +static vector unsigned int mask_green; +static vector unsigned int mask_blue; +static vector unsigned int mask_565_fix_rb; +static vector unsigned int mask_565_fix_g; + +static force_inline vector unsigned int +splat_alpha (vector unsigned int pix) +{ +#ifdef WORDS_BIGENDIAN +    return vec_perm (pix, pix, +		     (vector unsigned char)AVV ( +			 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, +			 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C)); +#else +    return vec_perm (pix, pix, +		     (vector unsigned char)AVV ( +			 0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07, +			 0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F)); +#endif +} + +static force_inline vector unsigned int +splat_pixel (vector unsigned int pix) +{ +    return vec_perm (pix, pix, +		     (vector unsigned char)AVV ( +			 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, +			 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03)); +} + +static force_inline vector unsigned int +pix_multiply (vector unsigned int p, vector unsigned int a) +{ +    vector unsigned short hi, lo, mod; + +    /* unpack to short */ +    hi = (vector unsigned short) +#ifdef WORDS_BIGENDIAN +	vec_mergeh ((vector unsigned char)AVV (0), +		    (vector unsigned char)p); +#else +	vec_mergeh ((vector unsigned char) p, +		    (vector unsigned char) AVV (0)); +#endif + +    mod = (vector unsigned short) +#ifdef WORDS_BIGENDIAN +	vec_mergeh ((vector unsigned char)AVV (0), +		    (vector unsigned char)a); +#else +	vec_mergeh ((vector unsigned char) a, +		    (vector unsigned char) AVV (0)); +#endif + +    hi = vec_mladd (hi, mod, (vector unsigned short) +                    AVV (0x0080, 0x0080, 0x0080, 0x0080, +                         0x0080, 0x0080, 0x0080, 0x0080)); + +    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8))); + +    hi = vec_sr (hi, vec_splat_u16 (8)); + +    /* unpack to short */ +    lo = (vector unsigned short) +#ifdef WORDS_BIGENDIAN +	vec_mergel ((vector unsigned char)AVV (0), +		    (vector unsigned char)p); +#else +	vec_mergel ((vector unsigned char) p, +		    (vector unsigned char) AVV (0)); +#endif + +    mod = (vector unsigned short) +#ifdef WORDS_BIGENDIAN +	vec_mergel ((vector unsigned char)AVV (0), +		    (vector unsigned char)a); +#else +	vec_mergel ((vector unsigned char) a, +		    (vector unsigned char) AVV (0)); +#endif + +    lo = vec_mladd (lo, mod, (vector unsigned short) +                    AVV (0x0080, 0x0080, 0x0080, 0x0080, +                         0x0080, 0x0080, 0x0080, 0x0080)); + +    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8))); + +    lo = vec_sr (lo, vec_splat_u16 (8)); + +    return (vector unsigned int)vec_packsu (hi, lo); +} + +static force_inline vector unsigned int +pix_add (vector unsigned int a, vector unsigned int b) +{ +    return (vector unsigned int)vec_adds ((vector unsigned char)a, +                                          (vector unsigned char)b); +} + +static force_inline vector unsigned int +pix_add_mul (vector unsigned int x, +             vector unsigned int a, +             vector unsigned int y, +             vector unsigned int b) +{ +    vector unsigned int t1, t2; + +    t1 = pix_multiply (x, a); +    t2 = pix_multiply (y, b); + +    return pix_add (t1, t2); +} + +static force_inline vector unsigned int +negate (vector unsigned int src) +{ +    return vec_nor (src, src); +} + +/* dest*~srca + src */ +static force_inline vector unsigned int +over (vector unsigned int src, +      vector unsigned int 
srca, +      vector unsigned int dest) +{ +    vector unsigned char tmp = (vector unsigned char) +	pix_multiply (dest, negate (srca)); + +    tmp = vec_adds ((vector unsigned char)src, tmp); +    return (vector unsigned int)tmp; +} + +/* in == pix_multiply */ +#define in_over(src, srca, mask, dest)					\ +    over (pix_multiply (src, mask),					\ +          pix_multiply (srca, mask), dest) + +#ifdef WORDS_BIGENDIAN + +#define COMPUTE_SHIFT_MASK(source)					\ +    source ## _mask = vec_lvsl (0, source); + +#define COMPUTE_SHIFT_MASKS(dest, source)				\ +    source ## _mask = vec_lvsl (0, source); + +#define COMPUTE_SHIFT_MASKC(dest, source, mask)				\ +    mask ## _mask = vec_lvsl (0, mask);					\ +    source ## _mask = vec_lvsl (0, source); + +#define LOAD_VECTOR(source)				  \ +do							  \ +{							  \ +    vector unsigned char tmp1, tmp2;			  \ +    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \ +    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \ +    v ## source = (typeof(v ## source)) 		  \ +	vec_perm (tmp1, tmp2, source ## _mask);		  \ +} while (0) + +#define LOAD_VECTORS(dest, source)			  \ +do							  \ +{							  \ +    LOAD_VECTOR(source);				  \ +    v ## dest = (typeof(v ## dest))vec_ld (0, dest);	  \ +} while (0) + +#define LOAD_VECTORSC(dest, source, mask)		  \ +do							  \ +{							  \ +    LOAD_VECTORS(dest, source); 			  \ +    LOAD_VECTOR(mask);					  \ +} while (0) + +#define DECLARE_SRC_MASK_VAR vector unsigned char src_mask +#define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask + +#else + +/* Now the COMPUTE_SHIFT_{MASK, MASKS, MASKC} below are just no-op. + * They are defined that way because little endian altivec can do unaligned + * reads natively and have no need for constructing the permutation pattern + * variables. 
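+ *
+ * In rough terms: on big endian, LOAD_VECTOR (source) above does two
+ * aligned vec_ld loads and merges them with vec_perm using the mask
+ * built by COMPUTE_SHIFT_MASK, whereas here it is a single unaligned
+ * vec_xl (0, source) load and the shift-mask machinery is unused.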
+ */ +#define COMPUTE_SHIFT_MASK(source) + +#define COMPUTE_SHIFT_MASKS(dest, source) + +#define COMPUTE_SHIFT_MASKC(dest, source, mask) + +# define LOAD_VECTOR(source)				\ +    v ## source = (typeof(v ## source))vec_xl(0, source); + +# define LOAD_VECTORS(dest, source)			\ +    LOAD_VECTOR(source);				\ +    LOAD_VECTOR(dest);					\ + +# define LOAD_VECTORSC(dest, source, mask)		\ +    LOAD_VECTORS(dest, source); 			\ +    LOAD_VECTOR(mask);					\ + +#define DECLARE_SRC_MASK_VAR +#define DECLARE_MASK_MASK_VAR + +#endif /* WORDS_BIGENDIAN */ + +#define LOAD_VECTORSM(dest, source, mask)				\ +    LOAD_VECTORSC (dest, source, mask); 				\ +    v ## source = pix_multiply (v ## source,				\ +                                splat_alpha (v ## mask)); + +#define STORE_VECTOR(dest)						\ +    vec_st ((vector unsigned int) v ## dest, 0, dest); + +/* load 4 pixels from a 16-byte boundary aligned address */ +static force_inline vector unsigned int +load_128_aligned (const uint32_t* src) +{ +    return *((vector unsigned int *) src); +} + +/* load 4 pixels from a unaligned address */ +static force_inline vector unsigned int +load_128_unaligned (const uint32_t* src) +{ +    vector unsigned int vsrc; +    DECLARE_SRC_MASK_VAR; + +    COMPUTE_SHIFT_MASK (src); +    LOAD_VECTOR (src); + +    return vsrc; +} + +/* save 4 pixels on a 16-byte boundary aligned address */ +static force_inline void +save_128_aligned (uint32_t* data, +		  vector unsigned int vdata) +{ +    STORE_VECTOR(data) +} + +static force_inline vector unsigned int +create_mask_1x32_128 (const uint32_t *src) +{ +    vector unsigned int vsrc; +    DECLARE_SRC_MASK_VAR; + +    COMPUTE_SHIFT_MASK (src); +    LOAD_VECTOR (src); +    return vec_splat(vsrc, 0); +} + +static force_inline vector unsigned int +create_mask_32_128 (uint32_t mask) +{ +    return create_mask_1x32_128(&mask); +} + +static force_inline vector unsigned int +unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2) +{ +    vector unsigned char lo; + +    /* unpack to short */ +    lo = (vector unsigned char) +#ifdef WORDS_BIGENDIAN +	vec_mergel ((vector unsigned char) data2, +		    (vector unsigned char) data1); +#else +	vec_mergel ((vector unsigned char) data1, +		    (vector unsigned char) data2); +#endif + +    return (vector unsigned int) lo; +} + +static force_inline vector unsigned int +unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2) +{ +    vector unsigned char hi; + +    /* unpack to short */ +    hi = (vector unsigned char) +#ifdef WORDS_BIGENDIAN +	vec_mergeh ((vector unsigned char) data2, +		    (vector unsigned char) data1); +#else +	vec_mergeh ((vector unsigned char) data1, +		    (vector unsigned char) data2); +#endif + +    return (vector unsigned int) hi; +} + +static force_inline vector unsigned int +unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2) +{ +    vector unsigned short lo; + +    /* unpack to char */ +    lo = (vector unsigned short) +#ifdef WORDS_BIGENDIAN +	vec_mergel ((vector unsigned short) data2, +		    (vector unsigned short) data1); +#else +	vec_mergel ((vector unsigned short) data1, +		    (vector unsigned short) data2); +#endif + +    return (vector unsigned int) lo; +} + +static force_inline vector unsigned int +unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2) +{ +    vector unsigned short hi; + +    /* unpack to char */ +    hi = (vector unsigned short) +#ifdef WORDS_BIGENDIAN +	vec_mergeh ((vector unsigned short) data2, +		    (vector unsigned 
short) data1); +#else +	vec_mergeh ((vector unsigned short) data1, +		    (vector unsigned short) data2); +#endif + +    return (vector unsigned int) hi; +} + +static force_inline void +unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2, +		    vector unsigned int* data_lo, vector unsigned int* data_hi) +{ +    *data_lo = unpacklo_128_16x8(data1, data2); +    *data_hi = unpackhi_128_16x8(data1, data2); +} + +static force_inline void +unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2, +		    vector unsigned int* data_lo, vector unsigned int* data_hi) +{ +    *data_lo = unpacklo_128_8x16(data1, data2); +    *data_hi = unpackhi_128_8x16(data1, data2); +} + +static force_inline vector unsigned int +unpack_565_to_8888 (vector unsigned int lo) +{ +    vector unsigned int r, g, b, rb, t; + +    r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red); +    g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green); +    b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue); + +    rb = vec_or (r, b); +    t  = vec_and (rb, mask_565_fix_rb); +    t  = vec_sr (t, create_mask_32_128(5)); +    rb = vec_or (rb, t); + +    t  = vec_and (g, mask_565_fix_g); +    t  = vec_sr (t, create_mask_32_128(6)); +    g  = vec_or (g, t); + +    return vec_or (rb, g); +} + +static force_inline int +is_opaque (vector unsigned int x) +{ +    uint32_t cmp_result; +    vector bool int ffs = vec_cmpeq(x, x); + +    cmp_result = vec_all_eq(x, ffs); + +    return (cmp_result & 0x8888) == 0x8888; +} + +static force_inline int +is_zero (vector unsigned int x) +{ +    uint32_t cmp_result; + +    cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0)); + +    return cmp_result == 0xffff; +} + +static force_inline int +is_transparent (vector unsigned int x) +{ +    uint32_t cmp_result; + +    cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0)); +    return (cmp_result & 0x8888) == 0x8888; +} + +static force_inline uint32_t +core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst) +{ +    uint32_t a; + +    a = ALPHA_8(src); + +    if (a == 0xff) +    { +	return src; +    } +    else if (src) +    { +	UN8x4_MUL_UN8_ADD_UN8x4(dst, (~a & MASK), src); +    } + +    return dst; +} + +static force_inline uint32_t +combine1 (const uint32_t *ps, const uint32_t *pm) +{ +    uint32_t s = *ps; + +    if (pm) +	UN8x4_MUL_UN8(s, ALPHA_8(*pm)); + +    return s; +} + +static force_inline vector unsigned int +combine4 (const uint32_t* ps, const uint32_t* pm) +{ +    vector unsigned int src, msk; + +    if (pm) +    { +	msk = load_128_unaligned(pm); + +	if (is_transparent(msk)) +	    return (vector unsigned int) AVV(0); +    } + +    src = load_128_unaligned(ps); + +    if (pm) +	src = pix_multiply(src, msk); + +    return src; +} + +static void +vmx_combine_over_u_no_mask (uint32_t *      dest, +                            const uint32_t *src, +                            int             width) +{ +    int i; +    vector unsigned int vdest, vsrc; +    DECLARE_SRC_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKS (dest, src); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { + +	LOAD_VECTORS (dest, src); + +	vdest = over (vsrc, splat_alpha (vsrc), vdest); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    
{ +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); + +	dest[i] = d; +    } +} + +static void +vmx_combine_over_u_mask (uint32_t *      dest, +                         const uint32_t *src, +                         const uint32_t *mask, +                         int             width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t m = ALPHA_8 (*mask++); +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t ia; + +	UN8x4_MUL_UN8 (s, m); + +	ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSM (dest, src, mask); + +	vdest = over (vsrc, splat_alpha (vsrc), vdest); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t m = ALPHA_8 (mask[i]); +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t ia; + +	UN8x4_MUL_UN8 (s, m); + +	ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); +	dest[i] = d; +    } +} + +static void +vmx_combine_over_u (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    if (mask) +	vmx_combine_over_u_mask (dest, src, mask, width); +    else +	vmx_combine_over_u_no_mask (dest, src, width); +} + +static void +vmx_combine_over_reverse_u_no_mask (uint32_t *      dest, +                                    const uint32_t *src, +                                    int             width) +{ +    int i; +    vector unsigned int vdest, vsrc; +    DECLARE_SRC_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKS (dest, src); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { + +	LOAD_VECTORS (dest, src); + +	vdest = over (vdest, splat_alpha (vdest), vsrc); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t ia = ALPHA_8 (~dest[i]); + +	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); +	dest[i] = s; +    } +} + +static void +vmx_combine_over_reverse_u_mask (uint32_t *      dest, +                                 const uint32_t *src, +                                 const uint32_t *mask, +                                 int             width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t m = ALPHA_8 (*mask++); +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8 (s, m); + +	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { + +	LOAD_VECTORSM (dest, src, mask); + +	vdest = over (vdest, splat_alpha (vdest), vsrc); + +	STORE_VECTOR 
(dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t m = ALPHA_8 (mask[i]); +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t ia = ALPHA_8 (~dest[i]); + +	UN8x4_MUL_UN8 (s, m); + +	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); +	dest[i] = s; +    } +} + +static void +vmx_combine_over_reverse_u (pixman_implementation_t *imp, +                            pixman_op_t              op, +                            uint32_t *               dest, +                            const uint32_t *         src, +                            const uint32_t *         mask, +                            int                      width) +{ +    if (mask) +	vmx_combine_over_reverse_u_mask (dest, src, mask, width); +    else +	vmx_combine_over_reverse_u_no_mask (dest, src, width); +} + +static void +vmx_combine_in_u_no_mask (uint32_t *      dest, +                          const uint32_t *src, +                          int             width) +{ +    int i; +    vector unsigned int vdest, vsrc; +    DECLARE_SRC_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t s = *src++; +	uint32_t a = ALPHA_8 (*dest); + +	UN8x4_MUL_UN8 (s, a); +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKS (dest, src); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORS (dest, src); + +	vdest = pix_multiply (vsrc, splat_alpha (vdest)); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t s = src[i]; +	uint32_t a = ALPHA_8 (dest[i]); + +	UN8x4_MUL_UN8 (s, a); +	dest[i] = s; +    } +} + +static void +vmx_combine_in_u_mask (uint32_t *      dest, +                       const uint32_t *src, +                       const uint32_t *mask, +                       int             width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t m = ALPHA_8 (*mask++); +	uint32_t s = *src++; +	uint32_t a = ALPHA_8 (*dest); + +	UN8x4_MUL_UN8 (s, m); +	UN8x4_MUL_UN8 (s, a); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSM (dest, src, mask); + +	vdest = pix_multiply (vsrc, splat_alpha (vdest)); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t m = ALPHA_8 (mask[i]); +	uint32_t s = src[i]; +	uint32_t a = ALPHA_8 (dest[i]); + +	UN8x4_MUL_UN8 (s, m); +	UN8x4_MUL_UN8 (s, a); + +	dest[i] = s; +    } +} + +static void +vmx_combine_in_u (pixman_implementation_t *imp, +                  pixman_op_t              op, +                  uint32_t *               dest, +                  const uint32_t *         src, +                  const uint32_t *         mask, +                  int                      width) +{ +    if (mask) +	vmx_combine_in_u_mask (dest, src, mask, width); +    else +	vmx_combine_in_u_no_mask (dest, src, width); +} + +static void +vmx_combine_in_reverse_u_no_mask (uint32_t *      dest, +                                  const uint32_t *src, +                                  int             width) +{ +    int i; +    vector unsigned int vdest, vsrc; +    DECLARE_SRC_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t d = *dest; +	uint32_t a = ALPHA_8 (*src++); + +	UN8x4_MUL_UN8 
(d, a); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKS (dest, src); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORS (dest, src); + +	vdest = pix_multiply (vdest, splat_alpha (vsrc)); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t d = dest[i]; +	uint32_t a = ALPHA_8 (src[i]); + +	UN8x4_MUL_UN8 (d, a); + +	dest[i] = d; +    } +} + +static void +vmx_combine_in_reverse_u_mask (uint32_t *      dest, +                               const uint32_t *src, +                               const uint32_t *mask, +                               int             width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t m = ALPHA_8 (*mask++); +	uint32_t d = *dest; +	uint32_t a = *src++; + +	UN8x4_MUL_UN8 (a, m); +	a = ALPHA_8 (a); +	UN8x4_MUL_UN8 (d, a); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSM (dest, src, mask); + +	vdest = pix_multiply (vdest, splat_alpha (vsrc)); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t m = ALPHA_8 (mask[i]); +	uint32_t d = dest[i]; +	uint32_t a = src[i]; + +	UN8x4_MUL_UN8 (a, m); +	a = ALPHA_8 (a); +	UN8x4_MUL_UN8 (d, a); + +	dest[i] = d; +    } +} + +static void +vmx_combine_in_reverse_u (pixman_implementation_t *imp, +                          pixman_op_t              op, +                          uint32_t *               dest, +                          const uint32_t *         src, +                          const uint32_t *         mask, +                          int                      width) +{ +    if (mask) +	vmx_combine_in_reverse_u_mask (dest, src, mask, width); +    else +	vmx_combine_in_reverse_u_no_mask (dest, src, width); +} + +static void +vmx_combine_out_u_no_mask (uint32_t *      dest, +                           const uint32_t *src, +                           int             width) +{ +    int i; +    vector unsigned int vdest, vsrc; +    DECLARE_SRC_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t s = *src++; +	uint32_t a = ALPHA_8 (~(*dest)); + +	UN8x4_MUL_UN8 (s, a); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKS (dest, src); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORS (dest, src); + +	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t s = src[i]; +	uint32_t a = ALPHA_8 (~dest[i]); + +	UN8x4_MUL_UN8 (s, a); + +	dest[i] = s; +    } +} + +static void +vmx_combine_out_u_mask (uint32_t *      dest, +                        const uint32_t *src, +                        const uint32_t *mask, +                        int             width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t m = ALPHA_8 (*mask++); +	uint32_t s = *src++; +	uint32_t a = ALPHA_8 (~(*dest)); + +	UN8x4_MUL_UN8 (s, m); +	UN8x4_MUL_UN8 (s, a); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf 
("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSM (dest, src, mask); + +	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t m = ALPHA_8 (mask[i]); +	uint32_t s = src[i]; +	uint32_t a = ALPHA_8 (~dest[i]); + +	UN8x4_MUL_UN8 (s, m); +	UN8x4_MUL_UN8 (s, a); + +	dest[i] = s; +    } +} + +static void +vmx_combine_out_u (pixman_implementation_t *imp, +                   pixman_op_t              op, +                   uint32_t *               dest, +                   const uint32_t *         src, +                   const uint32_t *         mask, +                   int                      width) +{ +    if (mask) +	vmx_combine_out_u_mask (dest, src, mask, width); +    else +	vmx_combine_out_u_no_mask (dest, src, width); +} + +static void +vmx_combine_out_reverse_u_no_mask (uint32_t *      dest, +                                   const uint32_t *src, +                                   int             width) +{ +    int i; +    vector unsigned int vdest, vsrc; +    DECLARE_SRC_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t d = *dest; +	uint32_t a = ALPHA_8 (~(*src++)); + +	UN8x4_MUL_UN8 (d, a); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKS (dest, src); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { + +	LOAD_VECTORS (dest, src); + +	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t d = dest[i]; +	uint32_t a = ALPHA_8 (~src[i]); + +	UN8x4_MUL_UN8 (d, a); + +	dest[i] = d; +    } +} + +static void +vmx_combine_out_reverse_u_mask (uint32_t *      dest, +                                const uint32_t *src, +                                const uint32_t *mask, +                                int             width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t m = ALPHA_8 (*mask++); +	uint32_t d = *dest; +	uint32_t a = *src++; + +	UN8x4_MUL_UN8 (a, m); +	a = ALPHA_8 (~a); +	UN8x4_MUL_UN8 (d, a); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSM (dest, src, mask); + +	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t m = ALPHA_8 (mask[i]); +	uint32_t d = dest[i]; +	uint32_t a = src[i]; + +	UN8x4_MUL_UN8 (a, m); +	a = ALPHA_8 (~a); +	UN8x4_MUL_UN8 (d, a); + +	dest[i] = d; +    } +} + +static void +vmx_combine_out_reverse_u (pixman_implementation_t *imp, +                           pixman_op_t              op, +                           uint32_t *               dest, +                           const uint32_t *         src, +                           const uint32_t *         mask, +                           int                      width) +{ +    if (mask) +	vmx_combine_out_reverse_u_mask (dest, src, mask, width); +    else +	vmx_combine_out_reverse_u_no_mask (dest, src, width); +} + +static void +vmx_combine_atop_u_no_mask (uint32_t *      dest, +                            const uint32_t *src, +            
                int             width) +{ +    int i; +    vector unsigned int vdest, vsrc; +    DECLARE_SRC_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t dest_a = ALPHA_8 (d); +	uint32_t src_ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKS (dest, src); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORS (dest, src); + +	vdest = pix_add_mul (vsrc, splat_alpha (vdest), +			     vdest, splat_alpha (negate (vsrc))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t dest_a = ALPHA_8 (d); +	uint32_t src_ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); + +	dest[i] = s; +    } +} + +static void +vmx_combine_atop_u_mask (uint32_t *      dest, +                         const uint32_t *src, +                         const uint32_t *mask, +                         int             width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t m = ALPHA_8 (*mask++); +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t dest_a = ALPHA_8 (d); +	uint32_t src_ia; + +	UN8x4_MUL_UN8 (s, m); + +	src_ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSM (dest, src, mask); + +	vdest = pix_add_mul (vsrc, splat_alpha (vdest), +			     vdest, splat_alpha (negate (vsrc))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t m = ALPHA_8 (mask[i]); +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t dest_a = ALPHA_8 (d); +	uint32_t src_ia; + +	UN8x4_MUL_UN8 (s, m); + +	src_ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); + +	dest[i] = s; +    } +} + +static void +vmx_combine_atop_u (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    if (mask) +	vmx_combine_atop_u_mask (dest, src, mask, width); +    else +	vmx_combine_atop_u_no_mask (dest, src, width); +} + +static void +vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest, +                                    const uint32_t *src, +                                    int             width) +{ +    int i; +    vector unsigned int vdest, vsrc; +    DECLARE_SRC_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t src_a = ALPHA_8 (s); +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKS (dest, src); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORS (dest, src); + +	vdest = pix_add_mul (vdest, splat_alpha (vsrc), +			     vsrc, splat_alpha (negate (vdest))); + +	STORE_VECTOR (dest); + +	src += 4; +	
dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t src_a = ALPHA_8 (s); +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); + +	dest[i] = s; +    } +} + +static void +vmx_combine_atop_reverse_u_mask (uint32_t *      dest, +                                 const uint32_t *src, +                                 const uint32_t *mask, +                                 int             width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t m = ALPHA_8 (*mask++); +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t src_a; +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8 (s, m); + +	src_a = ALPHA_8 (s); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSM (dest, src, mask); + +	vdest = pix_add_mul (vdest, splat_alpha (vsrc), +			     vsrc, splat_alpha (negate (vdest))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t m = ALPHA_8 (mask[i]); +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t src_a; +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8 (s, m); + +	src_a = ALPHA_8 (s); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); + +	dest[i] = s; +    } +} + +static void +vmx_combine_atop_reverse_u (pixman_implementation_t *imp, +                            pixman_op_t              op, +                            uint32_t *               dest, +                            const uint32_t *         src, +                            const uint32_t *         mask, +                            int                      width) +{ +    if (mask) +	vmx_combine_atop_reverse_u_mask (dest, src, mask, width); +    else +	vmx_combine_atop_reverse_u_no_mask (dest, src, width); +} + +static void +vmx_combine_xor_u_no_mask (uint32_t *      dest, +                           const uint32_t *src, +                           int             width) +{ +    int i; +    vector unsigned int vdest, vsrc; +    DECLARE_SRC_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t src_ia = ALPHA_8 (~s); +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKS (dest, src); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORS (dest, src); + +	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), +			     vdest, splat_alpha (negate (vsrc))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t src_ia = ALPHA_8 (~s); +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); + +	dest[i] = s; +    } +} + +static void +vmx_combine_xor_u_mask (uint32_t *      dest, +                        const uint32_t *src, +                        const uint32_t *mask, +                        int             width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while 
(width && ((uintptr_t)dest & 15)) +    { +	uint32_t m = ALPHA_8 (*mask++); +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t src_ia; +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8 (s, m); + +	src_ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSM (dest, src, mask); + +	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), +			     vdest, splat_alpha (negate (vsrc))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t m = ALPHA_8 (mask[i]); +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t src_ia; +	uint32_t dest_ia = ALPHA_8 (~d); + +	UN8x4_MUL_UN8 (s, m); + +	src_ia = ALPHA_8 (~s); + +	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); + +	dest[i] = s; +    } +} + +static void +vmx_combine_xor_u (pixman_implementation_t *imp, +                   pixman_op_t              op, +                   uint32_t *               dest, +                   const uint32_t *         src, +                   const uint32_t *         mask, +                   int                      width) +{ +    if (mask) +	vmx_combine_xor_u_mask (dest, src, mask, width); +    else +	vmx_combine_xor_u_no_mask (dest, src, width); +} + +static void +vmx_combine_add_u_no_mask (uint32_t *      dest, +                           const uint32_t *src, +                           int             width) +{ +    int i; +    vector unsigned int vdest, vsrc; +    DECLARE_SRC_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t s = *src++; +	uint32_t d = *dest; + +	UN8x4_ADD_UN8x4 (d, s); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKS (dest, src); +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORS (dest, src); + +	vdest = pix_add (vsrc, vdest); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t s = src[i]; +	uint32_t d = dest[i]; + +	UN8x4_ADD_UN8x4 (d, s); + +	dest[i] = d; +    } +} + +static void +vmx_combine_add_u_mask (uint32_t *      dest, +                        const uint32_t *src, +                        const uint32_t *mask, +                        int             width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t m = ALPHA_8 (*mask++); +	uint32_t s = *src++; +	uint32_t d = *dest; + +	UN8x4_MUL_UN8 (s, m); +	UN8x4_ADD_UN8x4 (d, s); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSM (dest, src, mask); + +	vdest = pix_add (vsrc, vdest); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t m = ALPHA_8 (mask[i]); +	uint32_t s = src[i]; +	uint32_t d = dest[i]; + +	UN8x4_MUL_UN8 (s, m); +	UN8x4_ADD_UN8x4 (d, s); + +	dest[i] = d; +    } +} + +static void +vmx_combine_add_u (pixman_implementation_t *imp, +                   pixman_op_t              op, +                   uint32_t *               dest, +                   const uint32_t *         src, +                   const uint32_t *         mask, +         
          int                      width) +{ +    if (mask) +	vmx_combine_add_u_mask (dest, src, mask, width); +    else +	vmx_combine_add_u_no_mask (dest, src, width); +} + +static void +vmx_combine_src_ca (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t s = *src++; + +	UN8x4_MUL_UN8x4 (s, a); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSC (dest, src, mask); + +	vdest = pix_multiply (vsrc, vmask); + +	STORE_VECTOR (dest); + +	mask += 4; +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t s = src[i]; + +	UN8x4_MUL_UN8x4 (s, a); + +	dest[i] = s; +    } +} + +static void +vmx_combine_over_ca (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               dest, +                     const uint32_t *         src, +                     const uint32_t *         mask, +                     int                      width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t sa = ALPHA_8 (s); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSC (dest, src, mask); + +	vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest); + +	STORE_VECTOR (dest); + +	mask += 4; +	src += 4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t sa = ALPHA_8 (s); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s); + +	dest[i] = d; +    } +} + +static void +vmx_combine_over_reverse_ca (pixman_implementation_t *imp, +                             pixman_op_t              op, +                             uint32_t *               dest, +                             const uint32_t *         src, +                             const uint32_t *         mask, +                             int                      width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t ida = ALPHA_8 (~d); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSC (dest, src, mask); + +	vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask)); + +	STORE_VECTOR (dest); + +	mask += 4; +	src += 
4; +	dest += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t ida = ALPHA_8 (~d); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d); + +	dest[i] = s; +    } +} + +static void +vmx_combine_in_ca (pixman_implementation_t *imp, +                   pixman_op_t              op, +                   uint32_t *               dest, +                   const uint32_t *         src, +                   const uint32_t *         mask, +                   int                      width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t s = *src++; +	uint32_t da = ALPHA_8 (*dest); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (s, da); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSC (dest, src, mask); + +	vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest)); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t s = src[i]; +	uint32_t da = ALPHA_8 (dest[i]); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (s, da); + +	dest[i] = s; +    } +} + +static void +vmx_combine_in_reverse_ca (pixman_implementation_t *imp, +                           pixman_op_t              op, +                           uint32_t *               dest, +                           const uint32_t *         src, +                           const uint32_t *         mask, +                           int                      width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t d = *dest; +	uint32_t sa = ALPHA_8 (*src++); + +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4 (d, a); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { + +	LOAD_VECTORSC (dest, src, mask); + +	vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t d = dest[i]; +	uint32_t sa = ALPHA_8 (src[i]); + +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4 (d, a); + +	dest[i] = d; +    } +} + +static void +vmx_combine_out_ca (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t da = ALPHA_8 (~d); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (s, da); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSC (dest, src, mask); + +	vdest = 
pix_multiply ( +	    pix_multiply (vsrc, vmask), splat_alpha (negate (vdest))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t da = ALPHA_8 (~d); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (s, da); + +	dest[i] = s; +    } +} + +static void +vmx_combine_out_reverse_ca (pixman_implementation_t *imp, +                            pixman_op_t              op, +                            uint32_t *               dest, +                            const uint32_t *         src, +                            const uint32_t *         mask, +                            int                      width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t sa = ALPHA_8 (s); + +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4 (d, ~a); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSC (dest, src, mask); + +	vdest = pix_multiply ( +	    vdest, negate (pix_multiply (vmask, splat_alpha (vsrc)))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t sa = ALPHA_8 (s); + +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4 (d, ~a); + +	dest[i] = d; +    } +} + +static void +vmx_combine_atop_ca (pixman_implementation_t *imp, +                     pixman_op_t              op, +                     uint32_t *               dest, +                     const uint32_t *         src, +                     const uint32_t *         mask, +                     int                      width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask, vsrca; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t sa = ALPHA_8 (s); +	uint32_t da = ALPHA_8 (d); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSC (dest, src, mask); + +	vsrca = splat_alpha (vsrc); + +	vsrc = pix_multiply (vsrc, vmask); +	vmask = pix_multiply (vmask, vsrca); + +	vdest = pix_add_mul (vsrc, splat_alpha (vdest), +			     negate (vmask), vdest); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t sa = ALPHA_8 (s); +	uint32_t da = ALPHA_8 (d); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da); + +	dest[i] = d; +    } +} + +static void +vmx_combine_atop_reverse_ca (pixman_implementation_t *imp, +                             pixman_op_t              op, +                             uint32_t *               dest, +                             const uint32_t *         src, +                             const uint32_t *         mask, +                             int              
        width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t sa = ALPHA_8 (s); +	uint32_t da = ALPHA_8 (~d); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSC (dest, src, mask); + +	vdest = pix_add_mul (vdest, +			     pix_multiply (vmask, splat_alpha (vsrc)), +			     pix_multiply (vsrc, vmask), +			     negate (splat_alpha (vdest))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t sa = ALPHA_8 (s); +	uint32_t da = ALPHA_8 (~d); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da); + +	dest[i] = d; +    } +} + +static void +vmx_combine_xor_ca (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t s = *src++; +	uint32_t d = *dest; +	uint32_t sa = ALPHA_8 (s); +	uint32_t da = ALPHA_8 (~d); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da); + +	*dest++ = d; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSC (dest, src, mask); + +	vdest = pix_add_mul (vdest, +			     negate (pix_multiply (vmask, splat_alpha (vsrc))), +			     pix_multiply (vsrc, vmask), +			     negate (splat_alpha (vdest))); + +	STORE_VECTOR (dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t s = src[i]; +	uint32_t d = dest[i]; +	uint32_t sa = ALPHA_8 (s); +	uint32_t da = ALPHA_8 (~d); + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_MUL_UN8 (a, sa); +	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da); + +	dest[i] = d; +    } +} + +static void +vmx_combine_add_ca (pixman_implementation_t *imp, +                    pixman_op_t              op, +                    uint32_t *               dest, +                    const uint32_t *         src, +                    const uint32_t *         mask, +                    int                      width) +{ +    int i; +    vector unsigned int vdest, vsrc, vmask; +    DECLARE_SRC_MASK_VAR; +    DECLARE_MASK_MASK_VAR; + +    while (width && ((uintptr_t)dest & 15)) +    { +	uint32_t a = *mask++; +	uint32_t s = *src++; +	uint32_t d = *dest; + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_ADD_UN8x4 (s, d); + +	*dest++ = s; +	width--; +    } + +    COMPUTE_SHIFT_MASKC (dest, src, mask); + +    /* printf ("%s\n",__PRETTY_FUNCTION__); */ +    for (i = width / 4; i > 0; i--) +    { +	LOAD_VECTORSC (dest, src, mask); + +	vdest = pix_add (pix_multiply (vsrc, vmask), vdest); + +	STORE_VECTOR 
(dest); + +	src += 4; +	dest += 4; +	mask += 4; +    } + +    for (i = width % 4; --i >= 0;) +    { +	uint32_t a = mask[i]; +	uint32_t s = src[i]; +	uint32_t d = dest[i]; + +	UN8x4_MUL_UN8x4 (s, a); +	UN8x4_ADD_UN8x4 (s, d); + +	dest[i] = s; +    } +} + +static void +vmx_composite_over_n_8_8888 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, srca; +    uint32_t *dst_line, *dst; +    uint8_t *mask_line; +    int dst_stride, mask_stride; +    int32_t w; +    uint32_t m, d, s, ia; + +    vector unsigned int vsrc, valpha, vmask, vdst; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    srca = ALPHA_8(src); +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + +    vsrc = (vector unsigned int) {src, src, src, src}; +    valpha = splat_alpha(vsrc); + +    while (height--) +    { +	const uint8_t *pm = mask_line; +	dst = dst_line; +	dst_line += dst_stride; +	mask_line += mask_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    s = src; +	    m = *pm++; + +	    if (m) +	    { +		d = *dst; +		UN8x4_MUL_UN8 (s, m); +		ia = ALPHA_8 (~s); +		UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); +		*dst = d; +	    } + +	    w--; +	    dst++; +	} + +	while (w >= 4) +	{ +	    m = *((uint32_t*)pm); + +	    if (srca == 0xff && m == 0xffffffff) +	    { +		save_128_aligned(dst, vsrc); +	    } +	    else if (m) +	    { +		vmask = splat_pixel((vector unsigned int) {m, m, m, m}); + +		/* dst is 16-byte aligned */ +		vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst)); + +		save_128_aligned(dst, vdst); +	    } + +	    w -= 4; +	    dst += 4; +	    pm += 4; +	} + +	while (w) +	{ +	    s = src; +	    m = *pm++; + +	    if (m) +	    { +		d = *dst; +		UN8x4_MUL_UN8 (s, m); +		ia = ALPHA_8 (~s); +		UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); +		*dst = d; +	    } + +	    w--; +	    dst++; +	} +    } + +} + +static pixman_bool_t +vmx_fill (pixman_implementation_t *imp, +           uint32_t *               bits, +           int                      stride, +           int                      bpp, +           int                      x, +           int                      y, +           int                      width, +           int                      height, +           uint32_t		    filler) +{ +    uint32_t byte_width; +    uint8_t *byte_line; + +    vector unsigned int vfiller; + +    if (bpp == 8) +    { +	uint8_t b; +	uint16_t w; + +	stride = stride * (int) sizeof (uint32_t) / 1; +	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); +	byte_width = width; +	stride *= 1; + +	b = filler & 0xff; +	w = (b << 8) | b; +	filler = (w << 16) | w; +    } +    else if (bpp == 16) +    { +	stride = stride * (int) sizeof (uint32_t) / 2; +	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); +	byte_width = 2 * width; +	stride *= 2; + +        filler = (filler & 0xffff) * 0x00010001; +    } +    else if (bpp == 32) +    { +	stride = stride * (int) sizeof (uint32_t) / 4; +	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); +	byte_width = 4 * width; +	stride *= 4; +    } +    else +    { +	return FALSE; +    } + +    vfiller = create_mask_1x32_128(&filler); + +    while (height--) +    { +	int w; +	uint8_t *d = byte_line; +	byte_line += stride; +	w = byte_width; + +	if (w >= 1 && ((uintptr_t)d & 1)) +	
{ +	    *(uint8_t *)d = filler; +	    w -= 1; +	    d += 1; +	} + +	while (w >= 2 && ((uintptr_t)d & 3)) +	{ +	    *(uint16_t *)d = filler; +	    w -= 2; +	    d += 2; +	} + +	while (w >= 4 && ((uintptr_t)d & 15)) +	{ +	    *(uint32_t *)d = filler; + +	    w -= 4; +	    d += 4; +	} + +	while (w >= 128) +	{ +	    vec_st(vfiller, 0, (uint32_t *) d); +	    vec_st(vfiller, 0, (uint32_t *) d + 4); +	    vec_st(vfiller, 0, (uint32_t *) d + 8); +	    vec_st(vfiller, 0, (uint32_t *) d + 12); +	    vec_st(vfiller, 0, (uint32_t *) d + 16); +	    vec_st(vfiller, 0, (uint32_t *) d + 20); +	    vec_st(vfiller, 0, (uint32_t *) d + 24); +	    vec_st(vfiller, 0, (uint32_t *) d + 28); + +	    d += 128; +	    w -= 128; +	} + +	if (w >= 64) +	{ +	    vec_st(vfiller, 0, (uint32_t *) d); +	    vec_st(vfiller, 0, (uint32_t *) d + 4); +	    vec_st(vfiller, 0, (uint32_t *) d + 8); +	    vec_st(vfiller, 0, (uint32_t *) d + 12); + +	    d += 64; +	    w -= 64; +	} + +	if (w >= 32) +	{ +	    vec_st(vfiller, 0, (uint32_t *) d); +	    vec_st(vfiller, 0, (uint32_t *) d + 4); + +	    d += 32; +	    w -= 32; +	} + +	if (w >= 16) +	{ +	    vec_st(vfiller, 0, (uint32_t *) d); + +	    d += 16; +	    w -= 16; +	} + +	while (w >= 4) +	{ +	    *(uint32_t *)d = filler; + +	    w -= 4; +	    d += 4; +	} + +	if (w >= 2) +	{ +	    *(uint16_t *)d = filler; +	    w -= 2; +	    d += 2; +	} + +	if (w >= 1) +	{ +	    *(uint8_t *)d = filler; +	    w -= 1; +	    d += 1; +	} +    } + +    return TRUE; +} + +static void +vmx_composite_src_x888_8888 (pixman_implementation_t *imp, +			      pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    int32_t w; +    int dst_stride, src_stride; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; +	w = width; + +	while (w && (uintptr_t)dst & 15) +	{ +	    *dst++ = *src++ | 0xff000000; +	    w--; +	} + +	while (w >= 16) +	{ +	    vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4; + +	    vmx_src1 = load_128_unaligned (src); +	    vmx_src2 = load_128_unaligned (src + 4); +	    vmx_src3 = load_128_unaligned (src + 8); +	    vmx_src4 = load_128_unaligned (src + 12); + +	    save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000)); +	    save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000)); +	    save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000)); +	    save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000)); + +	    dst += 16; +	    src += 16; +	    w -= 16; +	} + +	while (w) +	{ +	    *dst++ = *src++ | 0xff000000; +	    w--; +	} +    } +} + +static void +vmx_composite_over_n_8888 (pixman_implementation_t *imp, +                           pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t *dst_line, *dst; +    uint32_t src, ia; +    int      i, w, dst_stride; +    vector unsigned int vdst, vsrc, via; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + +    vsrc = (vector unsigned int){src, src, src, src}; +    via = negate (splat_alpha (vsrc)); +    ia = ALPHA_8 (~src); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	w 
= width; + +	while (w && ((uintptr_t)dst & 15)) +	{ +	    uint32_t d = *dst; +	    UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src); +	    *dst++ = d; +	    w--; +	} + +	for (i = w / 4; i > 0; i--) +	{ +	    vdst = pix_multiply (load_128_aligned (dst), via); +	    save_128_aligned (dst, pix_add (vsrc, vdst)); +	    dst += 4; +	} + +	for (i = w % 4; --i >= 0;) +	{ +	    uint32_t d = dst[i]; +	    UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src); +	    dst[i] = d; +	} +    } +} + +static void +vmx_composite_over_8888_8888 (pixman_implementation_t *imp, +                               pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    int dst_stride, src_stride; +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; + +    PIXMAN_IMAGE_GET_LINE ( +    dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +    src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +    dst = dst_line; +    src = src_line; + +    while (height--) +    { +        vmx_combine_over_u (imp, op, dst, src, NULL, width); + +        dst += dst_stride; +        src += src_stride; +    } +} + +static void +vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, +                                    pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t src, ia; +    uint32_t    *dst_line, d; +    uint32_t    *mask_line, m; +    uint32_t pack_cmp; +    int dst_stride, mask_stride; + +    vector unsigned int vsrc, valpha, vmask, vdest; + +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + +    if (src == 0) +	return; + +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + +    vsrc = (vector unsigned int) {src, src, src, src}; +    valpha = splat_alpha(vsrc); +    ia = ALPHA_8 (src); + +    while (height--) +    { +	int w = width; +	const uint32_t *pm = (uint32_t *)mask_line; +	uint32_t *pd = (uint32_t *)dst_line; +	uint32_t s; + +	dst_line += dst_stride; +	mask_line += mask_stride; + +	while (w && (uintptr_t)pd & 15) +	{ +	    s = src; +	    m = *pm++; + +	    if (m) +	    { +		d = *pd; +		UN8x4_MUL_UN8x4 (s, m); +		UN8x4_MUL_UN8 (m, ia); +		m = ~m; +		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s); +		*pd = d; +	    } + +	    pd++; +	    w--; +	} + +	while (w >= 4) +	{ +	    /* pm is NOT necessarily 16-byte aligned */ +	    vmask = load_128_unaligned (pm); + +	    pack_cmp = vec_all_eq(vmask, (vector unsigned int) AVV(0)); + +	    /* if all bits in mask are zero, pack_cmp is not 0 */ +	    if (pack_cmp == 0) +	    { +		/* pd is 16-byte aligned */ +		vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd)); + +		save_128_aligned(pd, vdest); +	    } + +	    pd += 4; +	    pm += 4; +	    w -= 4; +	} + +	while (w) +	{ +	    s = src; +	    m = *pm++; + +	    if (m) +	    { +		d = *pd; +		UN8x4_MUL_UN8x4 (s, m); +		UN8x4_MUL_UN8 (m, ia); +		m = ~m; +		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s); +		*pd = d; +	    } + +	    pd++; +	    w--; +	} +    } +} + +static void +vmx_composite_add_8_8 (pixman_implementation_t *imp, +            pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint8_t     *dst_line, *dst; +    uint8_t     *src_line, *src; +    int dst_stride, src_stride; +    int32_t w; +    uint16_t t; + +    PIXMAN_IMAGE_GET_LINE ( +    src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +    dest_image, dest_x, 
dest_y, uint8_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	src = src_line; + +	dst_line += dst_stride; +	src_line += src_stride; +	w = width; + +	/* Small head */ +	while (w && (uintptr_t)dst & 3) +	{ +	    t = (*dst) + (*src++); +	    *dst++ = t | (0 - (t >> 8)); +	    w--; +	} + +	vmx_combine_add_u (imp, op, +		    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); + +	/* Small tail */ +	dst += w & 0xfffc; +	src += w & 0xfffc; + +	w &= 3; + +	while (w) +	{ +	    t = (*dst) + (*src++); +	    *dst++ = t | (0 - (t >> 8)); +	    w--; +	} +    } +} + +static void +vmx_composite_add_8888_8888 (pixman_implementation_t *imp, +                              pixman_composite_info_t *info) +{ +    PIXMAN_COMPOSITE_ARGS (info); +    uint32_t    *dst_line, *dst; +    uint32_t    *src_line, *src; +    int dst_stride, src_stride; + +    PIXMAN_IMAGE_GET_LINE ( +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); +    PIXMAN_IMAGE_GET_LINE ( +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + +    while (height--) +    { +	dst = dst_line; +	dst_line += dst_stride; +	src = src_line; +	src_line += src_stride; + +	vmx_combine_add_u (imp, op, dst, src, NULL, width); +    } +} + +static force_inline void +scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t*       pd, +                                            const uint32_t* ps, +                                            int32_t         w, +                                            pixman_fixed_t  vx, +                                            pixman_fixed_t  unit_x, +                                            pixman_fixed_t  src_width_fixed, +                                            pixman_bool_t   fully_transparent_src) +{ +    uint32_t s, d; +    const uint32_t* pm = NULL; + +    vector unsigned int vsrc, vdst; + +    if (fully_transparent_src) +	return; + +    /* Align dst on a 16-byte boundary */ +    while (w && ((uintptr_t)pd & 15)) +    { +	d = *pd; +	s = combine1 (ps + pixman_fixed_to_int (vx), pm); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	*pd++ = core_combine_over_u_pixel_vmx (s, d); +	if (pm) +	    pm++; +	w--; +    } + +    while (w >= 4) +    { +	vector unsigned int tmp; +	uint32_t tmp1, tmp2, tmp3, tmp4; + +	tmp1 = *(ps + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; +	tmp2 = *(ps + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; +	tmp3 = *(ps + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; +	tmp4 = *(ps + pixman_fixed_to_int (vx)); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	tmp[0] = tmp1; +	tmp[1] = tmp2; +	tmp[2] = tmp3; +	tmp[3] = tmp4; + +	vsrc = combine4 ((const uint32_t *) &tmp, pm); + +	if (is_opaque (vsrc)) +	{ +	    save_128_aligned (pd, vsrc); +	} +	else if (!is_zero (vsrc)) +	{ +	    vdst = over(vsrc, splat_alpha(vsrc), load_128_aligned (pd)); + +	    save_128_aligned (pd, vdst); +	} + +	w -= 4; +	pd += 4; +	if (pm) +	    pm += 4; +    } + +    while (w) +    { +	d = *pd; +	s = combine1 (ps + pixman_fixed_to_int (vx), pm); +	vx += unit_x; +	while (vx >= 0) +	    vx -= src_width_fixed; + +	*pd++ = core_combine_over_u_pixel_vmx (s, d); +	if (pm) +	    pm++; + +	w--; +    } +} + +FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER, +		       scaled_nearest_scanline_vmx_8888_8888_OVER, +		       uint32_t, uint32_t, COVER) +FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER, +		       
scaled_nearest_scanline_vmx_8888_8888_OVER, +		       uint32_t, uint32_t, NONE) +FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER, +		       scaled_nearest_scanline_vmx_8888_8888_OVER, +		       uint32_t, uint32_t, PAD) +FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER, +		       scaled_nearest_scanline_vmx_8888_8888_OVER, +		       uint32_t, uint32_t, NORMAL) + +static const pixman_fast_path_t vmx_fast_paths[] = +{ +    PIXMAN_STD_FAST_PATH (OVER, solid,    null, a8r8g8b8, vmx_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid,    null, x8r8g8b8, vmx_composite_over_n_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca), +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca), + +    /* PIXMAN_OP_ADD */ +    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8), +    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888), +    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888), + +    /* PIXMAN_OP_SRC */ +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888), +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888), + +    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888), +    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888), + +    {   PIXMAN_OP_NONE	}, +}; + +static uint32_t * +vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) +{ +    int w = iter->width; +    vector unsigned int ff000000 = mask_ff000000; +    uint32_t *dst = iter->buffer; +    uint32_t *src = (uint32_t *)iter->bits; + +    iter->bits += iter->stride; + +    while (w && ((uintptr_t)dst) & 0x0f) +    { +	*dst++ = (*src++) | 0xff000000; +	w--; +    } + +    while (w >= 4) +    { +	save_128_aligned(dst, vec_or(load_128_unaligned(src), ff000000)); + +	dst += 4; +	src += 4; +	w -= 4; +    } + +    while (w) +    { +	*dst++ = (*src++) | 0xff000000; +	w--; +    } + +    return iter->buffer; +} + +static uint32_t * +vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) +{ +    int w = iter->width; +    uint32_t *dst = iter->buffer; +    uint8_t *src = iter->bits; +    vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6; + +    iter->bits += iter->stride; + +    while (w && (((uintptr_t)dst) & 15)) +    { +        *dst++ = *(src++) << 24; +        w--; +    } + +    while (w >= 16) +    { +	vmx0 = 
load_128_unaligned((uint32_t *) src); + +	unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2); +	unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4); +	unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6); + +	save_128_aligned(dst, vmx6); +	save_128_aligned((dst +  4), vmx5); +	save_128_aligned((dst +  8), vmx4); +	save_128_aligned((dst + 12), vmx3); + +	dst += 16; +	src += 16; +	w -= 16; +    } + +    while (w) +    { +	*dst++ = *(src++) << 24; +	w--; +    } + +    return iter->buffer; +} + +#define IMAGE_FLAGS							\ +    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\ +     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) + +static const pixman_iter_info_t vmx_iters[] = +{ +    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, +      _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL +    }, +    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW, +      _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL +    }, +    { PIXMAN_null }, +}; + +pixman_implementation_t * +_pixman_implementation_create_vmx (pixman_implementation_t *fallback) +{ +    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths); + +    /* VMX constants */ +    mask_ff000000 = create_mask_32_128 (0xff000000); +    mask_red   = create_mask_32_128 (0x00f80000); +    mask_green = create_mask_32_128 (0x0000fc00); +    mask_blue  = create_mask_32_128 (0x000000f8); +    mask_565_fix_rb = create_mask_32_128 (0x00e000e0); +    mask_565_fix_g = create_mask_32_128  (0x0000c000); + +    /* Set up function pointers */ + +    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u; +    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u; +    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u; +    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u; +    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u; +    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u; +    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u; +    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u; +    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u; + +    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u; + +    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca; +    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca; +    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca; +    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca; +    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca; +    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca; +    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca; +    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca; + +    imp->fill = vmx_fill; + +    imp->iter_info = vmx_iters; + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman-x86.c b/libs/pixman-0.40.0/pixman/pixman-x86.c new file mode 100644 index 0000000..0130b7b --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman-x86.c @@ -0,0 +1,249 @@ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. 
+ * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  SuSE makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "pixman-private.h" + +#if defined(USE_X86_MMX) || defined (USE_SSE2) || defined (USE_SSSE3) + +/* The CPU detection code needs to be in a file not compiled with + * "-mmmx -msse", as gcc would generate CMOV instructions otherwise + * that would lead to SIGILL instructions on old CPUs that don't have + * it. + */ + +typedef enum +{ +    X86_MMX			= (1 << 0), +    X86_MMX_EXTENSIONS		= (1 << 1), +    X86_SSE			= (1 << 2) | X86_MMX_EXTENSIONS, +    X86_SSE2			= (1 << 3), +    X86_CMOV			= (1 << 4), +    X86_SSSE3			= (1 << 5) +} cpu_features_t; + +#ifdef HAVE_GETISAX + +#include <sys/auxv.h> + +static cpu_features_t +detect_cpu_features (void) +{ +    cpu_features_t features = 0; +    unsigned int result = 0; + +    if (getisax (&result, 1)) +    { +	if (result & AV_386_CMOV) +	    features |= X86_CMOV; +	if (result & AV_386_MMX) +	    features |= X86_MMX; +	if (result & AV_386_AMD_MMX) +	    features |= X86_MMX_EXTENSIONS; +	if (result & AV_386_SSE) +	    features |= X86_SSE; +	if (result & AV_386_SSE2) +	    features |= X86_SSE2; +	if (result & AV_386_SSSE3) +	    features |= X86_SSSE3; +    } + +    return features; +} + +#else + +#define _PIXMAN_X86_64							\ +    (defined(__amd64__) || defined(__x86_64__) || defined(_M_AMD64)) + +static pixman_bool_t +have_cpuid (void) +{ +#if _PIXMAN_X86_64 || defined (_MSC_VER) + +    return TRUE; + +#elif defined (__GNUC__) +    uint32_t result; + +    __asm__ volatile ( +        "pushf"				"\n\t" +        "pop %%eax"			"\n\t" +        "mov %%eax, %%ecx"		"\n\t" +        "xor $0x00200000, %%eax"	"\n\t" +        "push %%eax"			"\n\t" +        "popf"				"\n\t" +        "pushf"				"\n\t" +        "pop %%eax"			"\n\t" +        "xor %%ecx, %%eax"		"\n\t" +	"mov %%eax, %0"			"\n\t" +	: "=r" (result) +	: +	: "%eax", "%ecx"); + +    return !!result; + +#else +#error "Unknown compiler" +#endif +} + +static void +pixman_cpuid (uint32_t feature, +	      uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) +{ +#if defined (__GNUC__) + +#if _PIXMAN_X86_64 +    __asm__ volatile ( +        "cpuid"				"\n\t" +	: "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) +	: "a" (feature)); +#else +    /* On x86-32 we need to be careful about the handling of %ebx +     * and %esp. 
We can't declare either one as clobbered +     * since they are special registers (%ebx is the "PIC +     * register" holding an offset to global data, %esp the +     * stack pointer), so we need to make sure that %ebx is +     * preserved, and that %esp has its original value when +     * accessing the output operands. +     */ +    __asm__ volatile ( +	"xchg %%ebx, %1"		"\n\t" +	"cpuid"				"\n\t" +	"xchg %%ebx, %1"		"\n\t" +	: "=a" (*a), "=r" (*b), "=c" (*c), "=d" (*d) +	: "a" (feature)); +#endif + +#elif defined (_MSC_VER) +    int info[4]; + +    __cpuid (info, feature); + +    *a = info[0]; +    *b = info[1]; +    *c = info[2]; +    *d = info[3]; +#else +#error Unknown compiler +#endif +} + +static cpu_features_t +detect_cpu_features (void) +{ +    uint32_t a, b, c, d; +    cpu_features_t features = 0; + +    if (!have_cpuid()) +	return features; + +    /* Get feature bits */ +    pixman_cpuid (0x01, &a, &b, &c, &d); +    if (d & (1 << 15)) +	features |= X86_CMOV; +    if (d & (1 << 23)) +	features |= X86_MMX; +    if (d & (1 << 25)) +	features |= X86_SSE; +    if (d & (1 << 26)) +	features |= X86_SSE2; +    if (c & (1 << 9)) +	features |= X86_SSSE3; + +    /* Check for AMD specific features */ +    if ((features & X86_MMX) && !(features & X86_SSE)) +    { +	char vendor[13]; + +	/* Get vendor string */ +	memset (vendor, 0, sizeof vendor); + +	pixman_cpuid (0x00, &a, &b, &c, &d); +	memcpy (vendor + 0, &b, 4); +	memcpy (vendor + 4, &d, 4); +	memcpy (vendor + 8, &c, 4); + +	if (strcmp (vendor, "AuthenticAMD") == 0 || +	    strcmp (vendor, "HygonGenuine") == 0 || +	    strcmp (vendor, "Geode by NSC") == 0) +	{ +	    pixman_cpuid (0x80000000, &a, &b, &c, &d); +	    if (a >= 0x80000001) +	    { +		pixman_cpuid (0x80000001, &a, &b, &c, &d); + +		if (d & (1 << 22)) +		    features |= X86_MMX_EXTENSIONS; +	    } +	} +    } + +    return features; +} + +#endif + +static pixman_bool_t +have_feature (cpu_features_t feature) +{ +    static pixman_bool_t initialized; +    static cpu_features_t features; + +    if (!initialized) +    { +	features = detect_cpu_features(); +	initialized = TRUE; +    } + +    return (features & feature) == feature; +} + +#endif + +pixman_implementation_t * +_pixman_x86_get_implementations (pixman_implementation_t *imp) +{ +#define MMX_BITS  (X86_MMX | X86_MMX_EXTENSIONS) +#define SSE2_BITS (X86_MMX | X86_MMX_EXTENSIONS | X86_SSE | X86_SSE2) +#define SSSE3_BITS (X86_SSE | X86_SSE2 | X86_SSSE3) + +#ifdef USE_X86_MMX +    if (!_pixman_disabled ("mmx") && have_feature (MMX_BITS)) +	imp = _pixman_implementation_create_mmx (imp); +#endif + +#ifdef USE_SSE2 +    if (!_pixman_disabled ("sse2") && have_feature (SSE2_BITS)) +	imp = _pixman_implementation_create_sse2 (imp); +#endif + +#ifdef USE_SSSE3 +    if (!_pixman_disabled ("ssse3") && have_feature (SSSE3_BITS)) +	imp = _pixman_implementation_create_ssse3 (imp); +#endif + +    return imp; +} diff --git a/libs/pixman-0.40.0/pixman/pixman.c b/libs/pixman-0.40.0/pixman/pixman.c new file mode 100644 index 0000000..c09b528 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman.c @@ -0,0 +1,1133 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. 
+ * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission.  SuSE makes no representations about the + * suitability of this software for any purpose.  It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author:  Keith Packard, SuSE, Inc. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "pixman-private.h" + +#include <stdlib.h> + +pixman_implementation_t *global_implementation; + +#ifdef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR +static void __attribute__((constructor)) +pixman_constructor (void) +{ +    global_implementation = _pixman_choose_implementation (); +} +#endif + +typedef struct operator_info_t operator_info_t; + +struct operator_info_t +{ +    uint8_t	opaque_info[4]; +}; + +#define PACK(neither, src, dest, both)			\ +    {{	    (uint8_t)PIXMAN_OP_ ## neither,		\ +	    (uint8_t)PIXMAN_OP_ ## src,			\ +	    (uint8_t)PIXMAN_OP_ ## dest,		\ +	    (uint8_t)PIXMAN_OP_ ## both		}} + +static const operator_info_t operator_table[] = +{ +    /*    Neither Opaque         Src Opaque             Dst Opaque             Both Opaque */ +    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR), +    PACK (SRC,                   SRC,                   SRC,                   SRC), +    PACK (DST,                   DST,                   DST,                   DST), +    PACK (OVER,                  SRC,                   OVER,                  SRC), +    PACK (OVER_REVERSE,          OVER_REVERSE,          DST,                   DST), +    PACK (IN,                    IN,                    SRC,                   SRC), +    PACK (IN_REVERSE,            DST,                   IN_REVERSE,            DST), +    PACK (OUT,                   OUT,                   CLEAR,                 CLEAR), +    PACK (OUT_REVERSE,           CLEAR,                 OUT_REVERSE,           CLEAR), +    PACK (ATOP,                  IN,                    OVER,                  SRC), +    PACK (ATOP_REVERSE,          OVER_REVERSE,          IN_REVERSE,            DST), +    PACK (XOR,                   OUT,                   OUT_REVERSE,           CLEAR), +    PACK (ADD,                   ADD,                   ADD,                   ADD), +    PACK (SATURATE,              OVER_REVERSE,          DST,                   DST), + +    {{ 0 /* 0x0e */ }}, +    {{ 0 /* 0x0f */ }}, + +    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR), +    PACK (SRC,                   SRC,                   SRC,                   SRC), +    PACK (DST,                   DST,                   DST,                   DST), +    PACK (DISJOINT_OVER,         DISJOINT_OVER,         
DISJOINT_OVER,         DISJOINT_OVER), +    PACK (DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE), +    PACK (DISJOINT_IN,           DISJOINT_IN,           DISJOINT_IN,           DISJOINT_IN), +    PACK (DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE), +    PACK (DISJOINT_OUT,          DISJOINT_OUT,          DISJOINT_OUT,          DISJOINT_OUT), +    PACK (DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE), +    PACK (DISJOINT_ATOP,         DISJOINT_ATOP,         DISJOINT_ATOP,         DISJOINT_ATOP), +    PACK (DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE), +    PACK (DISJOINT_XOR,          DISJOINT_XOR,          DISJOINT_XOR,          DISJOINT_XOR), + +    {{ 0 /* 0x1c */ }}, +    {{ 0 /* 0x1d */ }}, +    {{ 0 /* 0x1e */ }}, +    {{ 0 /* 0x1f */ }}, + +    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR), +    PACK (SRC,                   SRC,                   SRC,                   SRC), +    PACK (DST,                   DST,                   DST,                   DST), +    PACK (CONJOINT_OVER,         CONJOINT_OVER,         CONJOINT_OVER,         CONJOINT_OVER), +    PACK (CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE), +    PACK (CONJOINT_IN,           CONJOINT_IN,           CONJOINT_IN,           CONJOINT_IN), +    PACK (CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE), +    PACK (CONJOINT_OUT,          CONJOINT_OUT,          CONJOINT_OUT,          CONJOINT_OUT), +    PACK (CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE), +    PACK (CONJOINT_ATOP,         CONJOINT_ATOP,         CONJOINT_ATOP,         CONJOINT_ATOP), +    PACK (CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE), +    PACK (CONJOINT_XOR,          CONJOINT_XOR,          CONJOINT_XOR,          CONJOINT_XOR), + +    {{ 0 /* 0x2c */ }}, +    {{ 0 /* 0x2d */ }}, +    {{ 0 /* 0x2e */ }}, +    {{ 0 /* 0x2f */ }}, + +    PACK (MULTIPLY,              MULTIPLY,              MULTIPLY,              MULTIPLY), +    PACK (SCREEN,                SCREEN,                SCREEN,                SCREEN), +    PACK (OVERLAY,               OVERLAY,               OVERLAY,               OVERLAY), +    PACK (DARKEN,                DARKEN,                DARKEN,                DARKEN), +    PACK (LIGHTEN,               LIGHTEN,               LIGHTEN,               LIGHTEN), +    PACK (COLOR_DODGE,           COLOR_DODGE,           COLOR_DODGE,           COLOR_DODGE), +    PACK (COLOR_BURN,            COLOR_BURN,            COLOR_BURN,            COLOR_BURN), +    PACK (HARD_LIGHT,            HARD_LIGHT,            HARD_LIGHT,            HARD_LIGHT), +    PACK (SOFT_LIGHT,            SOFT_LIGHT,            SOFT_LIGHT,            SOFT_LIGHT), +    PACK (DIFFERENCE,            DIFFERENCE,            DIFFERENCE,            DIFFERENCE), +    PACK (EXCLUSION,             EXCLUSION,             EXCLUSION,             EXCLUSION), +    PACK (HSL_HUE,               HSL_HUE,               HSL_HUE,               HSL_HUE), +    PACK (HSL_SATURATION,        HSL_SATURATION,        HSL_SATURATION,        HSL_SATURATION), +    PACK (HSL_COLOR,             HSL_COLOR,             HSL_COLOR,             HSL_COLOR), +    PACK (HSL_LUMINOSITY,        HSL_LUMINOSITY,        HSL_LUMINOSITY,        
HSL_LUMINOSITY), +}; + +/* + * Optimize the current operator based on opacity of source or destination + * The output operator should be mathematically equivalent to the source. + */ +static pixman_op_t +optimize_operator (pixman_op_t     op, +		   uint32_t        src_flags, +		   uint32_t        mask_flags, +		   uint32_t        dst_flags) +{ +    pixman_bool_t is_source_opaque, is_dest_opaque; + +#define OPAQUE_SHIFT 13 +     +    COMPILE_TIME_ASSERT (FAST_PATH_IS_OPAQUE == (1 << OPAQUE_SHIFT)); +     +    is_dest_opaque = (dst_flags & FAST_PATH_IS_OPAQUE); +    is_source_opaque = ((src_flags & mask_flags) & FAST_PATH_IS_OPAQUE); + +    is_dest_opaque >>= OPAQUE_SHIFT - 1; +    is_source_opaque >>= OPAQUE_SHIFT; + +    return operator_table[op].opaque_info[is_dest_opaque | is_source_opaque]; +} + +/* + * Computing composite region + */ +static inline pixman_bool_t +clip_general_image (pixman_region32_t * region, +                    pixman_region32_t * clip, +                    int                 dx, +                    int                 dy) +{ +    if (pixman_region32_n_rects (region) == 1 && +        pixman_region32_n_rects (clip) == 1) +    { +	pixman_box32_t *  rbox = pixman_region32_rectangles (region, NULL); +	pixman_box32_t *  cbox = pixman_region32_rectangles (clip, NULL); +	int v; + +	if (rbox->x1 < (v = cbox->x1 + dx)) +	    rbox->x1 = v; +	if (rbox->x2 > (v = cbox->x2 + dx)) +	    rbox->x2 = v; +	if (rbox->y1 < (v = cbox->y1 + dy)) +	    rbox->y1 = v; +	if (rbox->y2 > (v = cbox->y2 + dy)) +	    rbox->y2 = v; +	if (rbox->x1 >= rbox->x2 || rbox->y1 >= rbox->y2) +	{ +	    pixman_region32_init (region); +	    return FALSE; +	} +    } +    else if (!pixman_region32_not_empty (clip)) +    { +	return FALSE; +    } +    else +    { +	if (dx || dy) +	    pixman_region32_translate (region, -dx, -dy); + +	if (!pixman_region32_intersect (region, region, clip)) +	    return FALSE; + +	if (dx || dy) +	    pixman_region32_translate (region, dx, dy); +    } + +    return pixman_region32_not_empty (region); +} + +static inline pixman_bool_t +clip_source_image (pixman_region32_t * region, +                   pixman_image_t *    image, +                   int                 dx, +                   int                 dy) +{ +    /* Source clips are ignored, unless they are explicitly turned on +     * and the clip in question was set by an X client. (Because if +     * the clip was not set by a client, then it is a hierarchy +     * clip and those should always be ignored for sources). +     */ +    if (!image->common.clip_sources || !image->common.client_clip) +	return TRUE; + +    return clip_general_image (region, +                               &image->common.clip_region, +                               dx, dy); +} + +/* + * returns FALSE if the final region is empty.  Indistinguishable from + * an allocation failure, but rendering ignores those anyways. 
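+ *
+ * The region starts out as the destination rectangle clamped to the
+ * destination bounds; it is then successively intersected with the
+ * destination clip, the destination alpha map, and the source and mask
+ * clips (each translated into destination coordinates).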
+ */ +pixman_bool_t +_pixman_compute_composite_region32 (pixman_region32_t * region, +				    pixman_image_t *    src_image, +				    pixman_image_t *    mask_image, +				    pixman_image_t *    dest_image, +				    int32_t             src_x, +				    int32_t             src_y, +				    int32_t             mask_x, +				    int32_t             mask_y, +				    int32_t             dest_x, +				    int32_t             dest_y, +				    int32_t             width, +				    int32_t             height) +{ +    region->extents.x1 = dest_x; +    region->extents.x2 = dest_x + width; +    region->extents.y1 = dest_y; +    region->extents.y2 = dest_y + height; + +    region->extents.x1 = MAX (region->extents.x1, 0); +    region->extents.y1 = MAX (region->extents.y1, 0); +    region->extents.x2 = MIN (region->extents.x2, dest_image->bits.width); +    region->extents.y2 = MIN (region->extents.y2, dest_image->bits.height); + +    region->data = 0; + +    /* Check for empty operation */ +    if (region->extents.x1 >= region->extents.x2 || +        region->extents.y1 >= region->extents.y2) +    { +	region->extents.x1 = 0; +	region->extents.x2 = 0; +	region->extents.y1 = 0; +	region->extents.y2 = 0; +	return FALSE; +    } + +    if (dest_image->common.have_clip_region) +    { +	if (!clip_general_image (region, &dest_image->common.clip_region, 0, 0)) +	    return FALSE; +    } + +    if (dest_image->common.alpha_map) +    { +	if (!pixman_region32_intersect_rect (region, region, +					     dest_image->common.alpha_origin_x, +					     dest_image->common.alpha_origin_y, +					     dest_image->common.alpha_map->width, +					     dest_image->common.alpha_map->height)) +	{ +	    return FALSE; +	} +	if (!pixman_region32_not_empty (region)) +	    return FALSE; +	if (dest_image->common.alpha_map->common.have_clip_region) +	{ +	    if (!clip_general_image (region, &dest_image->common.alpha_map->common.clip_region, +				     -dest_image->common.alpha_origin_x, +				     -dest_image->common.alpha_origin_y)) +	    { +		return FALSE; +	    } +	} +    } + +    /* clip against src */ +    if (src_image->common.have_clip_region) +    { +	if (!clip_source_image (region, src_image, dest_x - src_x, dest_y - src_y)) +	    return FALSE; +    } +    if (src_image->common.alpha_map && src_image->common.alpha_map->common.have_clip_region) +    { +	if (!clip_source_image (region, (pixman_image_t *)src_image->common.alpha_map, +	                        dest_x - (src_x - src_image->common.alpha_origin_x), +	                        dest_y - (src_y - src_image->common.alpha_origin_y))) +	{ +	    return FALSE; +	} +    } +    /* clip against mask */ +    if (mask_image && mask_image->common.have_clip_region) +    { +	if (!clip_source_image (region, mask_image, dest_x - mask_x, dest_y - mask_y)) +	    return FALSE; + +	if (mask_image->common.alpha_map && mask_image->common.alpha_map->common.have_clip_region) +	{ +	    if (!clip_source_image (region, (pixman_image_t *)mask_image->common.alpha_map, +	                            dest_x - (mask_x - mask_image->common.alpha_origin_x), +	                            dest_y - (mask_y - mask_image->common.alpha_origin_y))) +	    { +		return FALSE; +	    } +	} +    } + +    return TRUE; +} + +typedef struct box_48_16 box_48_16_t; + +struct box_48_16 +{ +    pixman_fixed_48_16_t        x1; +    pixman_fixed_48_16_t        y1; +    pixman_fixed_48_16_t        x2; +    pixman_fixed_48_16_t        y2; +}; + +static pixman_bool_t +compute_transformed_extents (pixman_transform_t   *transform, +			     
const pixman_box32_t *extents, +			     box_48_16_t          *transformed) +{ +    pixman_fixed_48_16_t tx1, ty1, tx2, ty2; +    pixman_fixed_t x1, y1, x2, y2; +    int i; + +    x1 = pixman_int_to_fixed (extents->x1) + pixman_fixed_1 / 2; +    y1 = pixman_int_to_fixed (extents->y1) + pixman_fixed_1 / 2; +    x2 = pixman_int_to_fixed (extents->x2) - pixman_fixed_1 / 2; +    y2 = pixman_int_to_fixed (extents->y2) - pixman_fixed_1 / 2; + +    if (!transform) +    { +	transformed->x1 = x1; +	transformed->y1 = y1; +	transformed->x2 = x2; +	transformed->y2 = y2; + +	return TRUE; +    } + +    tx1 = ty1 = INT64_MAX; +    tx2 = ty2 = INT64_MIN; + +    for (i = 0; i < 4; ++i) +    { +	pixman_fixed_48_16_t tx, ty; +	pixman_vector_t v; + +	v.vector[0] = (i & 0x01)? x1 : x2; +	v.vector[1] = (i & 0x02)? y1 : y2; +	v.vector[2] = pixman_fixed_1; + +	if (!pixman_transform_point (transform, &v)) +	    return FALSE; + +	tx = (pixman_fixed_48_16_t)v.vector[0]; +	ty = (pixman_fixed_48_16_t)v.vector[1]; + +	if (tx < tx1) +	    tx1 = tx; +	if (ty < ty1) +	    ty1 = ty; +	if (tx > tx2) +	    tx2 = tx; +	if (ty > ty2) +	    ty2 = ty; +    } + +    transformed->x1 = tx1; +    transformed->y1 = ty1; +    transformed->x2 = tx2; +    transformed->y2 = ty2; + +    return TRUE; +} + +#define IS_16BIT(x) (((x) >= INT16_MIN) && ((x) <= INT16_MAX)) +#define ABS(f)      (((f) < 0)?  (-(f)) : (f)) +#define IS_16_16(f) (((f) >= pixman_min_fixed_48_16 && ((f) <= pixman_max_fixed_48_16))) + +static pixman_bool_t +analyze_extent (pixman_image_t       *image, +		const pixman_box32_t *extents, +		uint32_t             *flags) +{ +    pixman_transform_t *transform; +    pixman_fixed_t x_off, y_off; +    pixman_fixed_t width, height; +    pixman_fixed_t *params; +    box_48_16_t transformed; +    pixman_box32_t exp_extents; + +    if (!image) +	return TRUE; + +    /* Some compositing functions walk one step +     * outside the destination rectangle, so we +     * check here that the expanded-by-one source +     * extents in destination space fits in 16 bits +     */ +    if (!IS_16BIT (extents->x1 - 1)		|| +	!IS_16BIT (extents->y1 - 1)		|| +	!IS_16BIT (extents->x2 + 1)		|| +	!IS_16BIT (extents->y2 + 1)) +    { +	return FALSE; +    } + +    transform = image->common.transform; +    if (image->common.type == BITS) +    { +	/* During repeat mode calculations we might convert the +	 * width/height of an image to fixed 16.16, so we need +	 * them to be smaller than 16 bits. 
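+	 * (This is why the check below rejects bits images whose width or
+	 * height is 0x7fff or larger before any further analysis.)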
+	 */ +	if (image->bits.width >= 0x7fff	|| image->bits.height >= 0x7fff) +	    return FALSE; + +	if ((image->common.flags & FAST_PATH_ID_TRANSFORM) == FAST_PATH_ID_TRANSFORM && +	    extents->x1 >= 0 && +	    extents->y1 >= 0 && +	    extents->x2 <= image->bits.width && +	    extents->y2 <= image->bits.height) +	{ +	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; +	    return TRUE; +	} + +	switch (image->common.filter) +	{ +	case PIXMAN_FILTER_CONVOLUTION: +	    params = image->common.filter_params; +	    x_off = - pixman_fixed_e - ((params[0] - pixman_fixed_1) >> 1); +	    y_off = - pixman_fixed_e - ((params[1] - pixman_fixed_1) >> 1); +	    width = params[0]; +	    height = params[1]; +	    break; + +	case PIXMAN_FILTER_SEPARABLE_CONVOLUTION: +	    params = image->common.filter_params; +	    x_off = - pixman_fixed_e - ((params[0] - pixman_fixed_1) >> 1); +	    y_off = - pixman_fixed_e - ((params[1] - pixman_fixed_1) >> 1); +	    width = params[0]; +	    height = params[1]; +	    break; +	     +	case PIXMAN_FILTER_GOOD: +	case PIXMAN_FILTER_BEST: +	case PIXMAN_FILTER_BILINEAR: +	    x_off = - pixman_fixed_1 / 2; +	    y_off = - pixman_fixed_1 / 2; +	    width = pixman_fixed_1; +	    height = pixman_fixed_1; +	    break; + +	case PIXMAN_FILTER_FAST: +	case PIXMAN_FILTER_NEAREST: +	    x_off = - pixman_fixed_e; +	    y_off = - pixman_fixed_e; +	    width = 0; +	    height = 0; +	    break; + +	default: +	    return FALSE; +	} +    } +    else +    { +	x_off = 0; +	y_off = 0; +	width = 0; +	height = 0; +    } + +    if (!compute_transformed_extents (transform, extents, &transformed)) +	return FALSE; + +    if (image->common.type == BITS) +    { +	if (pixman_fixed_to_int (transformed.x1 - pixman_fixed_e) >= 0                && +	    pixman_fixed_to_int (transformed.y1 - pixman_fixed_e) >= 0                && +	    pixman_fixed_to_int (transformed.x2 - pixman_fixed_e) < image->bits.width && +	    pixman_fixed_to_int (transformed.y2 - pixman_fixed_e) < image->bits.height) +	{ +	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; +	} + +	if (pixman_fixed_to_int (transformed.x1 - pixman_fixed_1 / 2) >= 0		  && +	    pixman_fixed_to_int (transformed.y1 - pixman_fixed_1 / 2) >= 0		  && +	    pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2) < image->bits.width && +	    pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2) < image->bits.height) +	{ +	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR; +	} +    } + +    /* Check we don't overflow when the destination extents are expanded by one. +     * This ensures that compositing functions can simply walk the source space +     * using 16.16 variables without worrying about overflow. +     */ +    exp_extents = *extents; +    exp_extents.x1 -= 1; +    exp_extents.y1 -= 1; +    exp_extents.x2 += 1; +    exp_extents.y2 += 1; + +    if (!compute_transformed_extents (transform, &exp_extents, &transformed)) +	return FALSE; +     +    if (!IS_16_16 (transformed.x1 + x_off - 8 * pixman_fixed_e)	|| +	!IS_16_16 (transformed.y1 + y_off - 8 * pixman_fixed_e)	|| +	!IS_16_16 (transformed.x2 + x_off + 8 * pixman_fixed_e + width)	|| +	!IS_16_16 (transformed.y2 + y_off + 8 * pixman_fixed_e + height)) +    { +	return FALSE; +    } + +    return TRUE; +} + +/* + * Work around GCC bug causing crashes in Mozilla with SSE2 + * + * When using -msse, gcc generates movdqa instructions assuming that + * the stack is 16 byte aligned. 
Unfortunately some applications, such + * as Mozilla and Mono, end up aligning the stack to 4 bytes, which + * causes the movdqa instructions to fail. + * + * The __force_align_arg_pointer__ makes gcc generate a prologue that + * realigns the stack pointer to 16 bytes. + * + * On x86-64 this is not necessary because the standard ABI already + * calls for a 16 byte aligned stack. + * + * See https://bugs.freedesktop.org/show_bug.cgi?id=15693 + */ +#if defined (USE_SSE2) && defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +PIXMAN_EXPORT void +pixman_image_composite32 (pixman_op_t      op, +                          pixman_image_t * src, +                          pixman_image_t * mask, +                          pixman_image_t * dest, +                          int32_t          src_x, +                          int32_t          src_y, +                          int32_t          mask_x, +                          int32_t          mask_y, +                          int32_t          dest_x, +                          int32_t          dest_y, +                          int32_t          width, +                          int32_t          height) +{ +    pixman_format_code_t src_format, mask_format, dest_format; +    pixman_region32_t region; +    pixman_box32_t extents; +    pixman_implementation_t *imp; +    pixman_composite_func_t func; +    pixman_composite_info_t info; +    const pixman_box32_t *pbox; +    int n; + +    _pixman_image_validate (src); +    if (mask) +	_pixman_image_validate (mask); +    _pixman_image_validate (dest); + +    src_format = src->common.extended_format_code; +    info.src_flags = src->common.flags; + +    if (mask && !(mask->common.flags & FAST_PATH_IS_OPAQUE)) +    { +	mask_format = mask->common.extended_format_code; +	info.mask_flags = mask->common.flags; +    } +    else +    { +	mask_format = PIXMAN_null; +	info.mask_flags = FAST_PATH_IS_OPAQUE | FAST_PATH_NO_ALPHA_MAP; +    } + +    dest_format = dest->common.extended_format_code; +    info.dest_flags = dest->common.flags; + +    /* Check for pixbufs */ +    if ((mask_format == PIXMAN_a8r8g8b8 || mask_format == PIXMAN_a8b8g8r8) && +	(src->type == BITS && src->bits.bits == mask->bits.bits)	   && +	(src->common.repeat == mask->common.repeat)			   && +	(info.src_flags & info.mask_flags & FAST_PATH_ID_TRANSFORM)	   && +	(src_x == mask_x && src_y == mask_y)) +    { +	if (src_format == PIXMAN_x8b8g8r8) +	    src_format = mask_format = PIXMAN_pixbuf; +	else if (src_format == PIXMAN_x8r8g8b8) +	    src_format = mask_format = PIXMAN_rpixbuf; +    } + +    pixman_region32_init (®ion); + +    if (!_pixman_compute_composite_region32 ( +	    ®ion, src, mask, dest, +	    src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height)) +    { +	goto out; +    } + +    extents = *pixman_region32_extents (®ion); + +    extents.x1 -= dest_x - src_x; +    extents.y1 -= dest_y - src_y; +    extents.x2 -= dest_x - src_x; +    extents.y2 -= dest_y - src_y; + +    if (!analyze_extent (src, &extents, &info.src_flags)) +	goto out; + +    extents.x1 -= src_x - mask_x; +    extents.y1 -= src_y - mask_y; +    extents.x2 -= src_x - mask_x; +    extents.y2 -= src_y - mask_y; + +    if (!analyze_extent (mask, &extents, &info.mask_flags)) +	goto out; + +    /* If the clip is within the source samples, and the samples are +     * opaque, then the source is effectively opaque. 
+     */ +#define NEAREST_OPAQUE	(FAST_PATH_SAMPLES_OPAQUE |			\ +			 FAST_PATH_NEAREST_FILTER |			\ +			 FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) +#define BILINEAR_OPAQUE	(FAST_PATH_SAMPLES_OPAQUE |			\ +			 FAST_PATH_BILINEAR_FILTER |			\ +			 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR) + +    if ((info.src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE || +	(info.src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE) +    { +	info.src_flags |= FAST_PATH_IS_OPAQUE; +    } + +    if ((info.mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE || +	(info.mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE) +    { +	info.mask_flags |= FAST_PATH_IS_OPAQUE; +    } + +    /* +     * Check if we can replace our operator by a simpler one +     * if the src or dest are opaque. The output operator should be +     * mathematically equivalent to the source. +     */ +    info.op = optimize_operator (op, info.src_flags, info.mask_flags, info.dest_flags); + +    _pixman_implementation_lookup_composite ( +	get_implementation (), info.op, +	src_format, info.src_flags, +	mask_format, info.mask_flags, +	dest_format, info.dest_flags, +	&imp, &func); + +    info.src_image = src; +    info.mask_image = mask; +    info.dest_image = dest; + +    pbox = pixman_region32_rectangles (®ion, &n); + +    while (n--) +    { +	info.src_x = pbox->x1 + src_x - dest_x; +	info.src_y = pbox->y1 + src_y - dest_y; +	info.mask_x = pbox->x1 + mask_x - dest_x; +	info.mask_y = pbox->y1 + mask_y - dest_y; +	info.dest_x = pbox->x1; +	info.dest_y = pbox->y1; +	info.width = pbox->x2 - pbox->x1; +	info.height = pbox->y2 - pbox->y1; + +	func (imp, &info); + +	pbox++; +    } + +out: +    pixman_region32_fini (®ion); +} + +PIXMAN_EXPORT void +pixman_image_composite (pixman_op_t      op, +                        pixman_image_t * src, +                        pixman_image_t * mask, +                        pixman_image_t * dest, +                        int16_t          src_x, +                        int16_t          src_y, +                        int16_t          mask_x, +                        int16_t          mask_y, +                        int16_t          dest_x, +                        int16_t          dest_y, +                        uint16_t         width, +                        uint16_t         height) +{ +    pixman_image_composite32 (op, src, mask, dest, src_x, src_y,  +                              mask_x, mask_y, dest_x, dest_y, width, height); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_blt (uint32_t *src_bits, +            uint32_t *dst_bits, +            int       src_stride, +            int       dst_stride, +            int       src_bpp, +            int       dst_bpp, +            int       src_x, +            int       src_y, +            int       dest_x, +            int       dest_y, +            int       width, +            int       height) +{ +    return _pixman_implementation_blt (get_implementation(), +				       src_bits, dst_bits, src_stride, dst_stride, +                                       src_bpp, dst_bpp, +                                       src_x, src_y, +                                       dest_x, dest_y, +                                       width, height); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_fill (uint32_t *bits, +             int       stride, +             int       bpp, +             int       x, +             int       y, +             int       width, +             int       height, +             uint32_t  filler) +{ +    return _pixman_implementation_fill ( +	get_implementation(), bits, stride, bpp, 
x, y, width, height, filler); +} + +static uint32_t +color_to_uint32 (const pixman_color_t *color) +{ +    return +        (color->alpha >> 8 << 24) | +        (color->red >> 8 << 16) | +        (color->green & 0xff00) | +        (color->blue >> 8); +} + +static pixman_bool_t +color_to_pixel (const pixman_color_t *color, +                uint32_t *            pixel, +                pixman_format_code_t  format) +{ +    uint32_t c = color_to_uint32 (color); + +    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_RGBA_FLOAT) +    { +	return FALSE; +    } + +    if (!(format == PIXMAN_a8r8g8b8     || +          format == PIXMAN_x8r8g8b8     || +          format == PIXMAN_a8b8g8r8     || +          format == PIXMAN_x8b8g8r8     || +          format == PIXMAN_b8g8r8a8     || +          format == PIXMAN_b8g8r8x8     || +          format == PIXMAN_r8g8b8a8     || +          format == PIXMAN_r8g8b8x8     || +          format == PIXMAN_r5g6b5       || +          format == PIXMAN_b5g6r5       || +          format == PIXMAN_a8           || +          format == PIXMAN_a1)) +    { +	return FALSE; +    } + +    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_ABGR) +    { +	c = ((c & 0xff000000) >>  0) | +	    ((c & 0x00ff0000) >> 16) | +	    ((c & 0x0000ff00) >>  0) | +	    ((c & 0x000000ff) << 16); +    } +    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_BGRA) +    { +	c = ((c & 0xff000000) >> 24) | +	    ((c & 0x00ff0000) >>  8) | +	    ((c & 0x0000ff00) <<  8) | +	    ((c & 0x000000ff) << 24); +    } +    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_RGBA) +	c = ((c & 0xff000000) >> 24) | (c << 8); + +    if (format == PIXMAN_a1) +	c = c >> 31; +    else if (format == PIXMAN_a8) +	c = c >> 24; +    else if (format == PIXMAN_r5g6b5 || +             format == PIXMAN_b5g6r5) +	c = convert_8888_to_0565 (c); + +#if 0 +    printf ("color: %x %x %x %x\n", color->alpha, color->red, color->green, color->blue); +    printf ("pixel: %x\n", c); +#endif + +    *pixel = c; +    return TRUE; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_image_fill_rectangles (pixman_op_t                 op, +                              pixman_image_t *            dest, +			      const pixman_color_t *      color, +                              int                         n_rects, +                              const pixman_rectangle16_t *rects) +{ +    pixman_box32_t stack_boxes[6]; +    pixman_box32_t *boxes; +    pixman_bool_t result; +    int i; + +    if (n_rects > 6) +    { +        boxes = pixman_malloc_ab (sizeof (pixman_box32_t), n_rects); +        if (boxes == NULL) +            return FALSE; +    } +    else +    { +        boxes = stack_boxes; +    } + +    for (i = 0; i < n_rects; ++i) +    { +        boxes[i].x1 = rects[i].x; +        boxes[i].y1 = rects[i].y; +        boxes[i].x2 = boxes[i].x1 + rects[i].width; +        boxes[i].y2 = boxes[i].y1 + rects[i].height; +    } + +    result = pixman_image_fill_boxes (op, dest, color, n_rects, boxes); + +    if (boxes != stack_boxes) +        free (boxes); +     +    return result; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_image_fill_boxes (pixman_op_t           op, +                         pixman_image_t *      dest, +                         const pixman_color_t *color, +                         int                   n_boxes, +                         const pixman_box32_t *boxes) +{ +    pixman_image_t *solid; +    pixman_color_t c; +    int i; + +    _pixman_image_validate (dest); +     +    if (color->alpha == 0xffff) +    { +        if (op == PIXMAN_OP_OVER) +           
 op = PIXMAN_OP_SRC; +    } + +    if (op == PIXMAN_OP_CLEAR) +    { +        c.red = 0; +        c.green = 0; +        c.blue = 0; +        c.alpha = 0; + +        color = &c; + +        op = PIXMAN_OP_SRC; +    } + +    if (op == PIXMAN_OP_SRC) +    { +        uint32_t pixel; + +        if (color_to_pixel (color, &pixel, dest->bits.format)) +        { +            pixman_region32_t fill_region; +            int n_rects, j; +            pixman_box32_t *rects; + +            if (!pixman_region32_init_rects (&fill_region, boxes, n_boxes)) +                return FALSE; + +            if (dest->common.have_clip_region) +            { +                if (!pixman_region32_intersect (&fill_region, +                                                &fill_region, +                                                &dest->common.clip_region)) +                    return FALSE; +            } + +            rects = pixman_region32_rectangles (&fill_region, &n_rects); +            for (j = 0; j < n_rects; ++j) +            { +                const pixman_box32_t *rect = &(rects[j]); +                pixman_fill (dest->bits.bits, dest->bits.rowstride, PIXMAN_FORMAT_BPP (dest->bits.format), +                             rect->x1, rect->y1, rect->x2 - rect->x1, rect->y2 - rect->y1, +                             pixel); +            } + +            pixman_region32_fini (&fill_region); +            return TRUE; +        } +    } + +    solid = pixman_image_create_solid_fill (color); +    if (!solid) +        return FALSE; + +    for (i = 0; i < n_boxes; ++i) +    { +        const pixman_box32_t *box = &(boxes[i]); + +        pixman_image_composite32 (op, solid, NULL, dest, +                                  0, 0, 0, 0, +                                  box->x1, box->y1, +                                  box->x2 - box->x1, box->y2 - box->y1); +    } + +    pixman_image_unref (solid); + +    return TRUE; +} + +/** + * pixman_version: + * + * Returns the version of the pixman library encoded in a single + * integer as per %PIXMAN_VERSION_ENCODE. The encoding ensures that + * later versions compare greater than earlier versions. + * + * A run-time comparison to check that pixman's version is greater than + * or equal to version X.Y.Z could be performed as follows: + * + * <informalexample><programlisting> + * if (pixman_version() >= PIXMAN_VERSION_ENCODE(X,Y,Z)) {...} + * </programlisting></informalexample> + * + * See also pixman_version_string() as well as the compile-time + * equivalents %PIXMAN_VERSION and %PIXMAN_VERSION_STRING. + * + * Return value: the encoded version. + **/ +PIXMAN_EXPORT int +pixman_version (void) +{ +    return PIXMAN_VERSION; +} + +/** + * pixman_version_string: + * + * Returns the version of the pixman library as a human-readable string + * of the form "X.Y.Z". + * + * See also pixman_version() as well as the compile-time equivalents + * %PIXMAN_VERSION_STRING and %PIXMAN_VERSION. + * + * Return value: a string containing the version. + **/ +PIXMAN_EXPORT const char* +pixman_version_string (void) +{ +    return PIXMAN_VERSION_STRING; +} + +/** + * pixman_format_supported_source: + * @format: A pixman_format_code_t format + * + * Return value: whether the provided format code is a supported + * format for a pixman surface used as a source in + * rendering. + * + * Currently, all pixman_format_code_t values are supported. 
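+ *
+ * A caller might use this as a guard when picking a source format; a minimal
+ * sketch in the style of the pixman_version() example (the format here is
+ * chosen purely for illustration):
+ *
+ * <informalexample><programlisting>
+ * if (pixman_format_supported_source (PIXMAN_yv12)) {...}
+ * </programlisting></informalexample>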
+ **/ +PIXMAN_EXPORT pixman_bool_t +pixman_format_supported_source (pixman_format_code_t format) +{ +    switch (format) +    { +    /* 32 bpp formats */ +    case PIXMAN_a2b10g10r10: +    case PIXMAN_x2b10g10r10: +    case PIXMAN_a2r10g10b10: +    case PIXMAN_x2r10g10b10: +    case PIXMAN_a8r8g8b8: +    case PIXMAN_a8r8g8b8_sRGB: +    case PIXMAN_x8r8g8b8: +    case PIXMAN_a8b8g8r8: +    case PIXMAN_x8b8g8r8: +    case PIXMAN_b8g8r8a8: +    case PIXMAN_b8g8r8x8: +    case PIXMAN_r8g8b8a8: +    case PIXMAN_r8g8b8x8: +    case PIXMAN_r8g8b8: +    case PIXMAN_b8g8r8: +    case PIXMAN_r5g6b5: +    case PIXMAN_b5g6r5: +    case PIXMAN_x14r6g6b6: +    /* 16 bpp formats */ +    case PIXMAN_a1r5g5b5: +    case PIXMAN_x1r5g5b5: +    case PIXMAN_a1b5g5r5: +    case PIXMAN_x1b5g5r5: +    case PIXMAN_a4r4g4b4: +    case PIXMAN_x4r4g4b4: +    case PIXMAN_a4b4g4r4: +    case PIXMAN_x4b4g4r4: +    /* 8bpp formats */ +    case PIXMAN_a8: +    case PIXMAN_r3g3b2: +    case PIXMAN_b2g3r3: +    case PIXMAN_a2r2g2b2: +    case PIXMAN_a2b2g2r2: +    case PIXMAN_c8: +    case PIXMAN_g8: +    case PIXMAN_x4a4: +    /* Collides with PIXMAN_c8 +       case PIXMAN_x4c4: +     */ +    /* Collides with PIXMAN_g8 +       case PIXMAN_x4g4: +     */ +    /* 4bpp formats */ +    case PIXMAN_a4: +    case PIXMAN_r1g2b1: +    case PIXMAN_b1g2r1: +    case PIXMAN_a1r1g1b1: +    case PIXMAN_a1b1g1r1: +    case PIXMAN_c4: +    case PIXMAN_g4: +    /* 1bpp formats */ +    case PIXMAN_a1: +    case PIXMAN_g1: +    /* YUV formats */ +    case PIXMAN_yuy2: +    case PIXMAN_yv12: +	return TRUE; + +    default: +	return FALSE; +    } +} + +/** + * pixman_format_supported_destination: + * @format: A pixman_format_code_t format + * + * Return value: whether the provided format code is a supported + * format for a pixman surface used as a destination in + * rendering. + * + * Currently, all pixman_format_code_t values are supported + * except for the YUV formats. 
+ **/ +PIXMAN_EXPORT pixman_bool_t +pixman_format_supported_destination (pixman_format_code_t format) +{ +    /* YUV formats cannot be written to at the moment */ +    if (format == PIXMAN_yuy2 || format == PIXMAN_yv12) +	return FALSE; + +    return pixman_format_supported_source (format); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_compute_composite_region (pixman_region16_t * region, +                                 pixman_image_t *    src_image, +                                 pixman_image_t *    mask_image, +                                 pixman_image_t *    dest_image, +                                 int16_t             src_x, +                                 int16_t             src_y, +                                 int16_t             mask_x, +                                 int16_t             mask_y, +                                 int16_t             dest_x, +                                 int16_t             dest_y, +                                 uint16_t            width, +                                 uint16_t            height) +{ +    pixman_region32_t r32; +    pixman_bool_t retval; + +    pixman_region32_init (&r32); + +    retval = _pixman_compute_composite_region32 ( +	&r32, src_image, mask_image, dest_image, +	src_x, src_y, mask_x, mask_y, dest_x, dest_y, +	width, height); + +    if (retval) +    { +	if (!pixman_region16_copy_from_region32 (region, &r32)) +	    retval = FALSE; +    } + +    pixman_region32_fini (&r32); +    return retval; +} diff --git a/libs/pixman-0.40.0/pixman/pixman.h b/libs/pixman-0.40.0/pixman/pixman.h new file mode 100644 index 0000000..08303b5 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/pixman.h @@ -0,0 +1,1419 @@ +/*********************************************************** + +Copyright 1987, 1998  The Open Group + +Permission to use, copy, modify, distribute, and sell this software and its +documentation for any purpose is hereby granted without fee, provided that +the above copyright notice appear in all copies and that both that +copyright notice and this permission notice appear in supporting +documentation. + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE +OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +Except as contained in this notice, the name of The Open Group shall not be +used in advertising or otherwise to promote the sale, use or other dealings +in this Software without prior written authorization from The Open Group. + +Copyright 1987 by Digital Equipment Corporation, Maynard, Massachusetts. + +                        All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of Digital not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ +/* + * Copyright © 1998, 2004 Keith Packard + * Copyright   2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission.  Keith Packard makes no + * representations about the suitability of this software for any purpose.  It + * is provided "as is" without express or implied warranty. + * + * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef PIXMAN_H__ +#define PIXMAN_H__ + +#include <pixman-version.h> + +#ifdef  __cplusplus +#define PIXMAN_BEGIN_DECLS extern "C" { +#define PIXMAN_END_DECLS } +#else +#define PIXMAN_BEGIN_DECLS +#define PIXMAN_END_DECLS +#endif + +PIXMAN_BEGIN_DECLS + +/* + * Standard integers + */ + +#if !defined (PIXMAN_DONT_DEFINE_STDINT) + +#if defined (_SVR4) || defined (SVR4) || defined (__OpenBSD__) || defined (_sgi) || defined (__sun) || defined (sun) || defined (__digital__) || defined (__HP_cc) +#  include <inttypes.h> +/* VS 2010 (_MSC_VER 1600) has stdint.h */ +#elif defined (_MSC_VER) && _MSC_VER < 1600 +typedef __int8 int8_t; +typedef unsigned __int8 uint8_t; +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#elif defined (_AIX) +#  include <sys/inttypes.h> +#else +#  include <stdint.h> +#endif + +#endif + +/* + * Boolean + */ +typedef int pixman_bool_t; + +/* + * Fixpoint numbers + */ +typedef int64_t			pixman_fixed_32_32_t; +typedef pixman_fixed_32_32_t	pixman_fixed_48_16_t; +typedef uint32_t		pixman_fixed_1_31_t; +typedef uint32_t		pixman_fixed_1_16_t; +typedef int32_t			pixman_fixed_16_16_t; +typedef pixman_fixed_16_16_t	pixman_fixed_t; + +#define pixman_fixed_e			((pixman_fixed_t) 1) +#define pixman_fixed_1			(pixman_int_to_fixed(1)) +#define pixman_fixed_1_minus_e		(pixman_fixed_1 - pixman_fixed_e) +#define pixman_fixed_minus_1		(pixman_int_to_fixed(-1)) +#define pixman_fixed_to_int(f)		((int) ((f) >> 16)) +#define pixman_int_to_fixed(i)		((pixman_fixed_t) ((uint32_t) (i) << 16)) +#define pixman_fixed_to_double(f)	(double) ((f) / (double) pixman_fixed_1) +#define pixman_double_to_fixed(d)	((pixman_fixed_t) ((d) * 65536.0)) +#define 
pixman_fixed_frac(f)		((f) & pixman_fixed_1_minus_e) +#define pixman_fixed_floor(f)		((f) & ~pixman_fixed_1_minus_e) +#define pixman_fixed_ceil(f)		pixman_fixed_floor ((f) + pixman_fixed_1_minus_e) +#define pixman_fixed_fraction(f)	((f) & pixman_fixed_1_minus_e) +#define pixman_fixed_mod_2(f)		((f) & (pixman_fixed1 | pixman_fixed_1_minus_e)) +#define pixman_max_fixed_48_16		((pixman_fixed_48_16_t) 0x7fffffff) +#define pixman_min_fixed_48_16		(-((pixman_fixed_48_16_t) 1 << 31)) + +/* + * Misc structs + */ +typedef struct pixman_color pixman_color_t; +typedef struct pixman_point_fixed pixman_point_fixed_t; +typedef struct pixman_line_fixed pixman_line_fixed_t; +typedef struct pixman_vector pixman_vector_t; +typedef struct pixman_transform pixman_transform_t; + +struct pixman_color +{ +    uint16_t	red; +    uint16_t    green; +    uint16_t    blue; +    uint16_t    alpha; +}; + +struct pixman_point_fixed +{ +    pixman_fixed_t	x; +    pixman_fixed_t	y; +}; + +struct pixman_line_fixed +{ +    pixman_point_fixed_t	p1, p2; +}; + +/* + * Fixed point matrices + */ + +struct pixman_vector +{ +    pixman_fixed_t	vector[3]; +}; + +struct pixman_transform +{ +    pixman_fixed_t	matrix[3][3]; +}; + +/* forward declaration (sorry) */ +struct pixman_box16; +typedef  union pixman_image		pixman_image_t; + +PIXMAN_API +void          pixman_transform_init_identity    (struct pixman_transform       *matrix); + +PIXMAN_API +pixman_bool_t pixman_transform_point_3d         (const struct pixman_transform *transform, +						 struct pixman_vector          *vector); + +PIXMAN_API +pixman_bool_t pixman_transform_point            (const struct pixman_transform *transform, +						 struct pixman_vector          *vector); + +PIXMAN_API +pixman_bool_t pixman_transform_multiply         (struct pixman_transform       *dst, +						 const struct pixman_transform *l, +						 const struct pixman_transform *r); + +PIXMAN_API +void          pixman_transform_init_scale       (struct pixman_transform       *t, +						 pixman_fixed_t                 sx, +						 pixman_fixed_t                 sy); + +PIXMAN_API +pixman_bool_t pixman_transform_scale            (struct pixman_transform       *forward, +						 struct pixman_transform       *reverse, +						 pixman_fixed_t                 sx, +						 pixman_fixed_t                 sy); + +PIXMAN_API +void          pixman_transform_init_rotate      (struct pixman_transform       *t, +						 pixman_fixed_t                 cos, +						 pixman_fixed_t                 sin); + +PIXMAN_API +pixman_bool_t pixman_transform_rotate           (struct pixman_transform       *forward, +						 struct pixman_transform       *reverse, +						 pixman_fixed_t                 c, +						 pixman_fixed_t                 s); + +PIXMAN_API +void          pixman_transform_init_translate   (struct pixman_transform       *t, +						 pixman_fixed_t                 tx, +						 pixman_fixed_t                 ty); + +PIXMAN_API +pixman_bool_t pixman_transform_translate        (struct pixman_transform       *forward, +						 struct pixman_transform       *reverse, +						 pixman_fixed_t                 tx, +						 pixman_fixed_t                 ty); + +PIXMAN_API +pixman_bool_t pixman_transform_bounds           (const struct pixman_transform *matrix, +						 struct pixman_box16           *b); + +PIXMAN_API +pixman_bool_t pixman_transform_invert           (struct pixman_transform       *dst, +						 const struct pixman_transform *src); + +PIXMAN_API +pixman_bool_t pixman_transform_is_identity      (const struct 
pixman_transform *t); + +PIXMAN_API +pixman_bool_t pixman_transform_is_scale         (const struct pixman_transform *t); + +PIXMAN_API +pixman_bool_t pixman_transform_is_int_translate (const struct pixman_transform *t); + +PIXMAN_API +pixman_bool_t pixman_transform_is_inverse       (const struct pixman_transform *a, +						 const struct pixman_transform *b); + +/* + * Floating point matrices + */ +typedef struct pixman_f_transform pixman_f_transform_t; +typedef struct pixman_f_vector pixman_f_vector_t; + +struct pixman_f_vector +{ +    double  v[3]; +}; + +struct pixman_f_transform +{ +    double  m[3][3]; +}; + + +PIXMAN_API +pixman_bool_t pixman_transform_from_pixman_f_transform (struct pixman_transform         *t, +							const struct pixman_f_transform *ft); + +PIXMAN_API +void          pixman_f_transform_from_pixman_transform (struct pixman_f_transform       *ft, +							const struct pixman_transform   *t); + +PIXMAN_API +pixman_bool_t pixman_f_transform_invert                (struct pixman_f_transform       *dst, +							const struct pixman_f_transform *src); + +PIXMAN_API +pixman_bool_t pixman_f_transform_point                 (const struct pixman_f_transform *t, +							struct pixman_f_vector          *v); + +PIXMAN_API +void          pixman_f_transform_point_3d              (const struct pixman_f_transform *t, +							struct pixman_f_vector          *v); + +PIXMAN_API +void          pixman_f_transform_multiply              (struct pixman_f_transform       *dst, +							const struct pixman_f_transform *l, +							const struct pixman_f_transform *r); + +PIXMAN_API +void          pixman_f_transform_init_scale            (struct pixman_f_transform       *t, +							double                           sx, +							double                           sy); + +PIXMAN_API +pixman_bool_t pixman_f_transform_scale                 (struct pixman_f_transform       *forward, +							struct pixman_f_transform       *reverse, +							double                           sx, +							double                           sy); + +PIXMAN_API +void          pixman_f_transform_init_rotate           (struct pixman_f_transform       *t, +							double                           cos, +							double                           sin); + +PIXMAN_API +pixman_bool_t pixman_f_transform_rotate                (struct pixman_f_transform       *forward, +							struct pixman_f_transform       *reverse, +							double                           c, +							double                           s); + +PIXMAN_API +void          pixman_f_transform_init_translate        (struct pixman_f_transform       *t, +							double                           tx, +							double                           ty); + +PIXMAN_API +pixman_bool_t pixman_f_transform_translate             (struct pixman_f_transform       *forward, +							struct pixman_f_transform       *reverse, +							double                           tx, +							double                           ty); + +PIXMAN_API +pixman_bool_t pixman_f_transform_bounds                (const struct pixman_f_transform *t, +							struct pixman_box16             *b); + +PIXMAN_API +void          pixman_f_transform_init_identity         (struct pixman_f_transform       *t); + +typedef enum +{ +    PIXMAN_REPEAT_NONE, +    PIXMAN_REPEAT_NORMAL, +    PIXMAN_REPEAT_PAD, +    PIXMAN_REPEAT_REFLECT +} pixman_repeat_t; + +typedef enum +{ +    PIXMAN_DITHER_NONE, +    PIXMAN_DITHER_FAST, +    PIXMAN_DITHER_GOOD, +    PIXMAN_DITHER_BEST, +    PIXMAN_DITHER_ORDERED_BAYER_8, +    PIXMAN_DITHER_ORDERED_BLUE_NOISE_64, +} 
pixman_dither_t; + +typedef enum +{ +    PIXMAN_FILTER_FAST, +    PIXMAN_FILTER_GOOD, +    PIXMAN_FILTER_BEST, +    PIXMAN_FILTER_NEAREST, +    PIXMAN_FILTER_BILINEAR, +    PIXMAN_FILTER_CONVOLUTION, + +    /* The SEPARABLE_CONVOLUTION filter takes the following parameters: +     * +     *         width:           integer given as 16.16 fixpoint number +     *         height:          integer given as 16.16 fixpoint number +     *         x_phase_bits:	integer given as 16.16 fixpoint +     *         y_phase_bits:	integer given as 16.16 fixpoint +     *         xtables:         (1 << x_phase_bits) tables of size width +     *         ytables:         (1 << y_phase_bits) tables of size height +     * +     * When sampling at (x, y), the location is first rounded to one of +     * n_x_phases * n_y_phases subpixel positions. These subpixel positions +     * determine an xtable and a ytable to use. +     * +     * Conceptually a width x height matrix is then formed in which each entry +     * is the product of the corresponding entries in the x and y tables. +     * This matrix is then aligned with the image pixels such that its center +     * is as close as possible to the subpixel location chosen earlier. Then +     * the image is convolved with the matrix and the resulting pixel returned. +     */ +    PIXMAN_FILTER_SEPARABLE_CONVOLUTION +} pixman_filter_t; + +typedef enum +{ +    PIXMAN_OP_CLEAR			= 0x00, +    PIXMAN_OP_SRC			= 0x01, +    PIXMAN_OP_DST			= 0x02, +    PIXMAN_OP_OVER			= 0x03, +    PIXMAN_OP_OVER_REVERSE		= 0x04, +    PIXMAN_OP_IN			= 0x05, +    PIXMAN_OP_IN_REVERSE		= 0x06, +    PIXMAN_OP_OUT			= 0x07, +    PIXMAN_OP_OUT_REVERSE		= 0x08, +    PIXMAN_OP_ATOP			= 0x09, +    PIXMAN_OP_ATOP_REVERSE		= 0x0a, +    PIXMAN_OP_XOR			= 0x0b, +    PIXMAN_OP_ADD			= 0x0c, +    PIXMAN_OP_SATURATE			= 0x0d, + +    PIXMAN_OP_DISJOINT_CLEAR		= 0x10, +    PIXMAN_OP_DISJOINT_SRC		= 0x11, +    PIXMAN_OP_DISJOINT_DST		= 0x12, +    PIXMAN_OP_DISJOINT_OVER		= 0x13, +    PIXMAN_OP_DISJOINT_OVER_REVERSE	= 0x14, +    PIXMAN_OP_DISJOINT_IN		= 0x15, +    PIXMAN_OP_DISJOINT_IN_REVERSE	= 0x16, +    PIXMAN_OP_DISJOINT_OUT		= 0x17, +    PIXMAN_OP_DISJOINT_OUT_REVERSE	= 0x18, +    PIXMAN_OP_DISJOINT_ATOP		= 0x19, +    PIXMAN_OP_DISJOINT_ATOP_REVERSE	= 0x1a, +    PIXMAN_OP_DISJOINT_XOR		= 0x1b, + +    PIXMAN_OP_CONJOINT_CLEAR		= 0x20, +    PIXMAN_OP_CONJOINT_SRC		= 0x21, +    PIXMAN_OP_CONJOINT_DST		= 0x22, +    PIXMAN_OP_CONJOINT_OVER		= 0x23, +    PIXMAN_OP_CONJOINT_OVER_REVERSE	= 0x24, +    PIXMAN_OP_CONJOINT_IN		= 0x25, +    PIXMAN_OP_CONJOINT_IN_REVERSE	= 0x26, +    PIXMAN_OP_CONJOINT_OUT		= 0x27, +    PIXMAN_OP_CONJOINT_OUT_REVERSE	= 0x28, +    PIXMAN_OP_CONJOINT_ATOP		= 0x29, +    PIXMAN_OP_CONJOINT_ATOP_REVERSE	= 0x2a, +    PIXMAN_OP_CONJOINT_XOR		= 0x2b, + +    PIXMAN_OP_MULTIPLY                  = 0x30, +    PIXMAN_OP_SCREEN                    = 0x31, +    PIXMAN_OP_OVERLAY                   = 0x32, +    PIXMAN_OP_DARKEN                    = 0x33, +    PIXMAN_OP_LIGHTEN                   = 0x34, +    PIXMAN_OP_COLOR_DODGE               = 0x35, +    PIXMAN_OP_COLOR_BURN                = 0x36, +    PIXMAN_OP_HARD_LIGHT                = 0x37, +    PIXMAN_OP_SOFT_LIGHT                = 0x38, +    PIXMAN_OP_DIFFERENCE                = 0x39, +    PIXMAN_OP_EXCLUSION                 = 0x3a, +    PIXMAN_OP_HSL_HUE			= 0x3b, +    PIXMAN_OP_HSL_SATURATION		= 0x3c, +    PIXMAN_OP_HSL_COLOR			= 0x3d, +    PIXMAN_OP_HSL_LUMINOSITY		= 0x3e + +#ifdef PIXMAN_USE_INTERNAL_API +    , +    PIXMAN_N_OPERATORS, +    
PIXMAN_OP_NONE = PIXMAN_N_OPERATORS +#endif +} pixman_op_t; + +/* + * Regions + */ +typedef struct pixman_region16_data	pixman_region16_data_t; +typedef struct pixman_box16		pixman_box16_t; +typedef struct pixman_rectangle16	pixman_rectangle16_t; +typedef struct pixman_region16		pixman_region16_t; + +struct pixman_region16_data { +    long		size; +    long		numRects; +/*  pixman_box16_t	rects[size];   in memory but not explicitly declared */ +}; + +struct pixman_rectangle16 +{ +    int16_t	x, y; +    uint16_t	width, height; +}; + +struct pixman_box16 +{ +    int16_t x1, y1, x2, y2; +}; + +struct pixman_region16 +{ +    pixman_box16_t          extents; +    pixman_region16_data_t *data; +}; + +typedef enum +{ +    PIXMAN_REGION_OUT, +    PIXMAN_REGION_IN, +    PIXMAN_REGION_PART +} pixman_region_overlap_t; + +/* This function exists only to make it possible to preserve + * the X ABI - it should go away at first opportunity. + */ +PIXMAN_API +void pixman_region_set_static_pointers (pixman_box16_t         *empty_box, +					pixman_region16_data_t *empty_data, +					pixman_region16_data_t *broken_data); + +/* creation/destruction */ +PIXMAN_API +void                    pixman_region_init               (pixman_region16_t *region); + +PIXMAN_API +void                    pixman_region_init_rect          (pixman_region16_t *region, +							  int                x, +							  int                y, +							  unsigned int       width, +							  unsigned int       height); + +PIXMAN_API +pixman_bool_t           pixman_region_init_rects         (pixman_region16_t *region, +							  const pixman_box16_t *boxes, +							  int                count); + +PIXMAN_API +void                    pixman_region_init_with_extents  (pixman_region16_t *region, +							  pixman_box16_t    *extents); + +PIXMAN_API +void                    pixman_region_init_from_image    (pixman_region16_t *region, +							  pixman_image_t    *image); + +PIXMAN_API +void                    pixman_region_fini               (pixman_region16_t *region); + + +/* manipulation */ +PIXMAN_API +void                    pixman_region_translate          (pixman_region16_t *region, +							  int                x, +							  int                y); + +PIXMAN_API +pixman_bool_t           pixman_region_copy               (pixman_region16_t *dest, +							  pixman_region16_t *source); + +PIXMAN_API +pixman_bool_t           pixman_region_intersect          (pixman_region16_t *new_reg, +							  pixman_region16_t *reg1, +							  pixman_region16_t *reg2); + +PIXMAN_API +pixman_bool_t           pixman_region_union              (pixman_region16_t *new_reg, +							  pixman_region16_t *reg1, +							  pixman_region16_t *reg2); + +PIXMAN_API +pixman_bool_t           pixman_region_union_rect         (pixman_region16_t *dest, +							  pixman_region16_t *source, +							  int                x, +							  int                y, +							  unsigned int       width, +							  unsigned int       height); + +PIXMAN_API +pixman_bool_t		pixman_region_intersect_rect     (pixman_region16_t *dest, +							  pixman_region16_t *source, +							  int                x, +							  int                y, +							  unsigned int       width, +							  unsigned int       height); + +PIXMAN_API +pixman_bool_t           pixman_region_subtract           (pixman_region16_t *reg_d, +							  pixman_region16_t *reg_m, +							  pixman_region16_t *reg_s); + +PIXMAN_API +pixman_bool_t           pixman_region_inverse            (pixman_region16_t *new_reg, +							  pixman_region16_t *reg1, +						
	  pixman_box16_t    *inv_rect); + +PIXMAN_API +pixman_bool_t           pixman_region_contains_point     (pixman_region16_t *region, +							  int                x, +							  int                y, +							  pixman_box16_t    *box); + +PIXMAN_API +pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *region, +							  pixman_box16_t    *prect); + +PIXMAN_API +pixman_bool_t           pixman_region_not_empty          (pixman_region16_t *region); + +PIXMAN_API +pixman_box16_t *        pixman_region_extents            (pixman_region16_t *region); + +PIXMAN_API +int                     pixman_region_n_rects            (pixman_region16_t *region); + +PIXMAN_API +pixman_box16_t *        pixman_region_rectangles         (pixman_region16_t *region, +							  int               *n_rects); + +PIXMAN_API +pixman_bool_t           pixman_region_equal              (pixman_region16_t *region1, +							  pixman_region16_t *region2); + +PIXMAN_API +pixman_bool_t           pixman_region_selfcheck          (pixman_region16_t *region); + +PIXMAN_API +void                    pixman_region_reset              (pixman_region16_t *region, +							  pixman_box16_t    *box); + +PIXMAN_API +void			pixman_region_clear		 (pixman_region16_t *region); +/* + * 32 bit regions + */ +typedef struct pixman_region32_data	pixman_region32_data_t; +typedef struct pixman_box32		pixman_box32_t; +typedef struct pixman_rectangle32	pixman_rectangle32_t; +typedef struct pixman_region32		pixman_region32_t; + +struct pixman_region32_data { +    long		size; +    long		numRects; +/*  pixman_box32_t	rects[size];   in memory but not explicitly declared */ +}; + +struct pixman_rectangle32 +{ +    int32_t x, y; +    uint32_t width, height; +}; + +struct pixman_box32 +{ +    int32_t x1, y1, x2, y2; +}; + +struct pixman_region32 +{ +    pixman_box32_t          extents; +    pixman_region32_data_t  *data; +}; + +/* creation/destruction */ +PIXMAN_API +void                    pixman_region32_init               (pixman_region32_t *region); + +PIXMAN_API +void                    pixman_region32_init_rect          (pixman_region32_t *region, +							    int                x, +							    int                y, +							    unsigned int       width, +							    unsigned int       height); + +PIXMAN_API +pixman_bool_t           pixman_region32_init_rects         (pixman_region32_t *region, +							    const pixman_box32_t *boxes, +							    int                count); + +PIXMAN_API +void                    pixman_region32_init_with_extents  (pixman_region32_t *region, +							    pixman_box32_t    *extents); + +PIXMAN_API +void                    pixman_region32_init_from_image    (pixman_region32_t *region, +							    pixman_image_t    *image); + +PIXMAN_API +void                    pixman_region32_fini               (pixman_region32_t *region); + + +/* manipulation */ +PIXMAN_API +void                    pixman_region32_translate          (pixman_region32_t *region, +							    int                x, +							    int                y); + +PIXMAN_API +pixman_bool_t           pixman_region32_copy               (pixman_region32_t *dest, +							    pixman_region32_t *source); + +PIXMAN_API +pixman_bool_t           pixman_region32_intersect          (pixman_region32_t *new_reg, +							    pixman_region32_t *reg1, +							    pixman_region32_t *reg2); + +PIXMAN_API +pixman_bool_t           pixman_region32_union              (pixman_region32_t *new_reg, +							    pixman_region32_t *reg1, +							    pixman_region32_t *reg2); + 
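+/* A minimal usage sketch for the 32-bit region API (coordinates are
+ * illustrative, error handling is omitted, and use_box is a placeholder
+ * for caller code): intersect two rectangles and walk the resulting boxes.
+ *
+ *     pixman_region32_t a, b, out;
+ *     pixman_box32_t *boxes;
+ *     int i, n;
+ *
+ *     pixman_region32_init_rect (&a, 0, 0, 100, 100);
+ *     pixman_region32_init_rect (&b, 50, 50, 100, 100);
+ *     pixman_region32_init (&out);
+ *     pixman_region32_intersect (&out, &a, &b);
+ *
+ *     boxes = pixman_region32_rectangles (&out, &n);
+ *     for (i = 0; i < n; i++)
+ *         use_box (boxes[i].x1, boxes[i].y1, boxes[i].x2, boxes[i].y2);
+ *
+ *     pixman_region32_fini (&a);
+ *     pixman_region32_fini (&b);
+ *     pixman_region32_fini (&out);
+ */
+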
+PIXMAN_API +pixman_bool_t		pixman_region32_intersect_rect     (pixman_region32_t *dest, +							    pixman_region32_t *source, +							    int                x, +							    int                y, +							    unsigned int       width, +							    unsigned int       height); + +PIXMAN_API +pixman_bool_t           pixman_region32_union_rect         (pixman_region32_t *dest, +							    pixman_region32_t *source, +							    int                x, +							    int                y, +							    unsigned int       width, +							    unsigned int       height); + +PIXMAN_API +pixman_bool_t           pixman_region32_subtract           (pixman_region32_t *reg_d, +							    pixman_region32_t *reg_m, +							    pixman_region32_t *reg_s); + +PIXMAN_API +pixman_bool_t           pixman_region32_inverse            (pixman_region32_t *new_reg, +							    pixman_region32_t *reg1, +							    pixman_box32_t    *inv_rect); + +PIXMAN_API +pixman_bool_t           pixman_region32_contains_point     (pixman_region32_t *region, +							    int                x, +							    int                y, +							    pixman_box32_t    *box); + +PIXMAN_API +pixman_region_overlap_t pixman_region32_contains_rectangle (pixman_region32_t *region, +							    pixman_box32_t    *prect); + +PIXMAN_API +pixman_bool_t           pixman_region32_not_empty          (pixman_region32_t *region); + +PIXMAN_API +pixman_box32_t *        pixman_region32_extents            (pixman_region32_t *region); + +PIXMAN_API +int                     pixman_region32_n_rects            (pixman_region32_t *region); + +PIXMAN_API +pixman_box32_t *        pixman_region32_rectangles         (pixman_region32_t *region, +							    int               *n_rects); + +PIXMAN_API +pixman_bool_t           pixman_region32_equal              (pixman_region32_t *region1, +							    pixman_region32_t *region2); + +PIXMAN_API +pixman_bool_t           pixman_region32_selfcheck          (pixman_region32_t *region); + +PIXMAN_API +void                    pixman_region32_reset              (pixman_region32_t *region, +							    pixman_box32_t    *box); + +PIXMAN_API +void			pixman_region32_clear		   (pixman_region32_t *region); + + +/* Copy / Fill / Misc */ +PIXMAN_API +pixman_bool_t pixman_blt                (uint32_t           *src_bits, +					 uint32_t           *dst_bits, +					 int                 src_stride, +					 int                 dst_stride, +					 int                 src_bpp, +					 int                 dst_bpp, +					 int                 src_x, +					 int                 src_y, +					 int                 dest_x, +					 int                 dest_y, +					 int                 width, +					 int                 height); + +PIXMAN_API +pixman_bool_t pixman_fill               (uint32_t           *bits, +					 int                 stride, +					 int                 bpp, +					 int                 x, +					 int                 y, +					 int                 width, +					 int                 height, +					 uint32_t            _xor); + + +PIXMAN_API +int           pixman_version            (void); + +PIXMAN_API +const char*   pixman_version_string     (void); + +/* + * Images + */ +typedef struct pixman_indexed		pixman_indexed_t; +typedef struct pixman_gradient_stop	pixman_gradient_stop_t; + +typedef uint32_t (* pixman_read_memory_func_t) (const void *src, int size); +typedef void     (* pixman_write_memory_func_t) (void *dst, uint32_t value, int size); + +typedef void     (* pixman_image_destroy_func_t) (pixman_image_t *image, void *data); + +struct pixman_gradient_stop { + 
   pixman_fixed_t x; +    pixman_color_t color; +}; + +#define PIXMAN_MAX_INDEXED  256 /* XXX depth must be <= 8 */ + +#if PIXMAN_MAX_INDEXED <= 256 +typedef uint8_t pixman_index_type; +#endif + +struct pixman_indexed +{ +    pixman_bool_t       color; +    uint32_t		rgba[PIXMAN_MAX_INDEXED]; +    pixman_index_type	ent[32768]; +}; + +/* + * While the protocol is generous in format support, the + * sample implementation allows only packed RGB and GBR + * representations for data to simplify software rendering, + */ +#define PIXMAN_FORMAT(bpp,type,a,r,g,b)	(((bpp) << 24) |  \ +					 ((type) << 16) | \ +					 ((a) << 12) |	  \ +					 ((r) << 8) |	  \ +					 ((g) << 4) |	  \ +					 ((b))) + +#define PIXMAN_FORMAT_BYTE(bpp,type,a,r,g,b) \ +	(((bpp >> 3) << 24) | \ +	(3 << 22) | ((type) << 16) | \ +	((a >> 3) << 12) | \ +	((r >> 3) << 8) | \ +	((g >> 3) << 4) | \ +	((b >> 3))) + +#define PIXMAN_FORMAT_RESHIFT(val, ofs, num) \ +	(((val >> (ofs)) & ((1 << (num)) - 1)) << ((val >> 22) & 3)) + +#define PIXMAN_FORMAT_BPP(f)	PIXMAN_FORMAT_RESHIFT(f, 24, 8) +#define PIXMAN_FORMAT_SHIFT(f)	((uint32_t)((f >> 22) & 3)) +#define PIXMAN_FORMAT_TYPE(f)	(((f) >> 16) & 0x3f) +#define PIXMAN_FORMAT_A(f)	PIXMAN_FORMAT_RESHIFT(f, 12, 4) +#define PIXMAN_FORMAT_R(f)	PIXMAN_FORMAT_RESHIFT(f, 8, 4) +#define PIXMAN_FORMAT_G(f)	PIXMAN_FORMAT_RESHIFT(f, 4, 4) +#define PIXMAN_FORMAT_B(f)	PIXMAN_FORMAT_RESHIFT(f, 0, 4) +#define PIXMAN_FORMAT_RGB(f)	(((f)      ) & 0xfff) +#define PIXMAN_FORMAT_VIS(f)	(((f)      ) & 0xffff) +#define PIXMAN_FORMAT_DEPTH(f)	(PIXMAN_FORMAT_A(f) +	\ +				 PIXMAN_FORMAT_R(f) +	\ +				 PIXMAN_FORMAT_G(f) +	\ +				 PIXMAN_FORMAT_B(f)) + +#define PIXMAN_TYPE_OTHER	0 +#define PIXMAN_TYPE_A		1 +#define PIXMAN_TYPE_ARGB	2 +#define PIXMAN_TYPE_ABGR	3 +#define PIXMAN_TYPE_COLOR	4 +#define PIXMAN_TYPE_GRAY	5 +#define PIXMAN_TYPE_YUY2	6 +#define PIXMAN_TYPE_YV12	7 +#define PIXMAN_TYPE_BGRA	8 +#define PIXMAN_TYPE_RGBA	9 +#define PIXMAN_TYPE_ARGB_SRGB	10 +#define PIXMAN_TYPE_RGBA_FLOAT	11 + +#define PIXMAN_FORMAT_COLOR(f)				\ +	(PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ARGB ||	\ +	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ABGR ||	\ +	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA ||	\ +	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA ||	\ +	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA_FLOAT) + +typedef enum { +/* 128bpp formats */ +    PIXMAN_rgba_float =	PIXMAN_FORMAT_BYTE(128,PIXMAN_TYPE_RGBA_FLOAT,32,32,32,32), +/* 96bpp formats */ +    PIXMAN_rgb_float =	PIXMAN_FORMAT_BYTE(96,PIXMAN_TYPE_RGBA_FLOAT,0,32,32,32), + +/* 32bpp formats */ +    PIXMAN_a8r8g8b8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8), +    PIXMAN_x8r8g8b8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8), +    PIXMAN_a8b8g8r8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8), +    PIXMAN_x8b8g8r8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8), +    PIXMAN_b8g8r8a8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,8,8,8,8), +    PIXMAN_b8g8r8x8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,0,8,8,8), +    PIXMAN_r8g8b8a8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,8,8,8,8), +    PIXMAN_r8g8b8x8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,0,8,8,8), +    PIXMAN_x14r6g6b6 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,6,6,6), +    PIXMAN_x2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,10,10,10), +    PIXMAN_a2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,2,10,10,10), +    PIXMAN_x2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,10,10,10), +    PIXMAN_a2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,2,10,10,10), + +/* sRGB formats */ +    PIXMAN_a8r8g8b8_sRGB = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB_SRGB,8,8,8,8), + 
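+/* As a worked example of the encoding above (for illustration only):
+ * PIXMAN_a8r8g8b8 is PIXMAN_FORMAT (32, PIXMAN_TYPE_ARGB, 8, 8, 8, 8),
+ * i.e. (32 << 24) | (2 << 16) | (8 << 12) | (8 << 8) | (8 << 4) | 8
+ * = 0x20028888, from which PIXMAN_FORMAT_BPP() recovers 32 and
+ * PIXMAN_FORMAT_A/R/G/B() each recover 8, so PIXMAN_FORMAT_DEPTH() is 32.
+ */
+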
+/* 24bpp formats */ +    PIXMAN_r8g8b8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8), +    PIXMAN_b8g8r8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8), + +/* 16bpp formats */ +    PIXMAN_r5g6b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5), +    PIXMAN_b5g6r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5), + +    PIXMAN_a1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5), +    PIXMAN_x1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5), +    PIXMAN_a1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5), +    PIXMAN_x1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,5,5), +    PIXMAN_a4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,4,4,4,4), +    PIXMAN_x4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4), +    PIXMAN_a4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4), +    PIXMAN_x4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4), + +/* 8bpp formats */ +    PIXMAN_a8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0), +    PIXMAN_r3g3b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2), +    PIXMAN_b2g3r3 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2), +    PIXMAN_a2r2g2b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2), +    PIXMAN_a2b2g2r2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2), + +    PIXMAN_c8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0), +    PIXMAN_g8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0), + +    PIXMAN_x4a4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0), + +    PIXMAN_x4c4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0), +    PIXMAN_x4g4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0), + +/* 4bpp formats */ +    PIXMAN_a4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0), +    PIXMAN_r1g2b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1), +    PIXMAN_b1g2r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1), +    PIXMAN_a1r1g1b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1), +    PIXMAN_a1b1g1r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1), + +    PIXMAN_c4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0), +    PIXMAN_g4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0), + +/* 1bpp formats */ +    PIXMAN_a1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0), + +    PIXMAN_g1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0), + +/* YUV formats */ +    PIXMAN_yuy2 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_YUY2,0,0,0,0), +    PIXMAN_yv12 =	 PIXMAN_FORMAT(12,PIXMAN_TYPE_YV12,0,0,0,0) +} pixman_format_code_t; + +/* Querying supported format values. 
*/ +PIXMAN_API +pixman_bool_t pixman_format_supported_destination (pixman_format_code_t format); + +PIXMAN_API +pixman_bool_t pixman_format_supported_source      (pixman_format_code_t format); + +/* Constructors */ +PIXMAN_API +pixman_image_t *pixman_image_create_solid_fill       (const pixman_color_t         *color); + +PIXMAN_API +pixman_image_t *pixman_image_create_linear_gradient  (const pixman_point_fixed_t   *p1, +						      const pixman_point_fixed_t   *p2, +						      const pixman_gradient_stop_t *stops, +						      int                           n_stops); + +PIXMAN_API +pixman_image_t *pixman_image_create_radial_gradient  (const pixman_point_fixed_t   *inner, +						      const pixman_point_fixed_t   *outer, +						      pixman_fixed_t                inner_radius, +						      pixman_fixed_t                outer_radius, +						      const pixman_gradient_stop_t *stops, +						      int                           n_stops); + +PIXMAN_API +pixman_image_t *pixman_image_create_conical_gradient (const pixman_point_fixed_t   *center, +						      pixman_fixed_t                angle, +						      const pixman_gradient_stop_t *stops, +						      int                           n_stops); + +PIXMAN_API +pixman_image_t *pixman_image_create_bits             (pixman_format_code_t          format, +						      int                           width, +						      int                           height, +						      uint32_t                     *bits, +						      int                           rowstride_bytes); + +PIXMAN_API +pixman_image_t *pixman_image_create_bits_no_clear    (pixman_format_code_t format, +						      int                  width, +						      int                  height, +						      uint32_t *           bits, +						      int                  rowstride_bytes); + +/* Destructor */ +PIXMAN_API +pixman_image_t *pixman_image_ref                     (pixman_image_t               *image); + +PIXMAN_API +pixman_bool_t   pixman_image_unref                   (pixman_image_t               *image); + + +PIXMAN_API +void		pixman_image_set_destroy_function    (pixman_image_t		   *image, +						      pixman_image_destroy_func_t   function, +						      void			   *data); + +PIXMAN_API +void *		pixman_image_get_destroy_data        (pixman_image_t		   *image); + +/* Set properties */ +PIXMAN_API +pixman_bool_t   pixman_image_set_clip_region         (pixman_image_t               *image, +						      pixman_region16_t            *region); + +PIXMAN_API +pixman_bool_t   pixman_image_set_clip_region32       (pixman_image_t               *image, +						      pixman_region32_t            *region); + +PIXMAN_API +void		pixman_image_set_has_client_clip     (pixman_image_t               *image, +						      pixman_bool_t		    clien_clip); + +PIXMAN_API +pixman_bool_t   pixman_image_set_transform           (pixman_image_t               *image, +						      const pixman_transform_t     *transform); + +PIXMAN_API +void            pixman_image_set_repeat              (pixman_image_t               *image, +						      pixman_repeat_t               repeat); + +PIXMAN_API +void            pixman_image_set_dither              (pixman_image_t               *image, +						      pixman_dither_t               dither); + +PIXMAN_API +void            pixman_image_set_dither_offset       (pixman_image_t               *image, +						      int                           offset_x, +						      int                           offset_y); + +PIXMAN_API +pixman_bool_t   pixman_image_set_filter              (pixman_image_t             
  *image, +						      pixman_filter_t               filter, +						      const pixman_fixed_t         *filter_params, +						      int                           n_filter_params); + +PIXMAN_API +void		pixman_image_set_source_clipping     (pixman_image_t		   *image, +						      pixman_bool_t                 source_clipping); + +PIXMAN_API +void            pixman_image_set_alpha_map           (pixman_image_t               *image, +						      pixman_image_t               *alpha_map, +						      int16_t                       x, +						      int16_t                       y); + +PIXMAN_API +void            pixman_image_set_component_alpha     (pixman_image_t               *image, +						      pixman_bool_t                 component_alpha); + +PIXMAN_API +pixman_bool_t   pixman_image_get_component_alpha     (pixman_image_t               *image); + +PIXMAN_API +void		pixman_image_set_accessors	     (pixman_image_t		   *image, +						      pixman_read_memory_func_t	    read_func, +						      pixman_write_memory_func_t    write_func); + +PIXMAN_API +void		pixman_image_set_indexed	     (pixman_image_t		   *image, +						      const pixman_indexed_t	   *indexed); + +PIXMAN_API +uint32_t       *pixman_image_get_data                (pixman_image_t               *image); + +PIXMAN_API +int		pixman_image_get_width               (pixman_image_t               *image); + +PIXMAN_API +int             pixman_image_get_height              (pixman_image_t               *image); + +PIXMAN_API +int		pixman_image_get_stride              (pixman_image_t               *image); /* in bytes */ + +PIXMAN_API +int		pixman_image_get_depth               (pixman_image_t		   *image); + +PIXMAN_API +pixman_format_code_t pixman_image_get_format	     (pixman_image_t		   *image); + +typedef enum +{ +    PIXMAN_KERNEL_IMPULSE, +    PIXMAN_KERNEL_BOX, +    PIXMAN_KERNEL_LINEAR, +    PIXMAN_KERNEL_CUBIC, +    PIXMAN_KERNEL_GAUSSIAN, +    PIXMAN_KERNEL_LANCZOS2, +    PIXMAN_KERNEL_LANCZOS3, +    PIXMAN_KERNEL_LANCZOS3_STRETCHED       /* Jim Blinn's 'nice' filter */ +} pixman_kernel_t; + +/* Create the parameter list for a SEPARABLE_CONVOLUTION filter + * with the given kernels and scale parameters. 
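+ *
+ * A possible usage sketch (src is an existing pixman_image_t *, the scale
+ * and subsample values are purely illustrative, and error checking is
+ * kept to a minimum):
+ *
+ *     int n_params;
+ *     pixman_fixed_t *params =
+ *         pixman_filter_create_separable_convolution (
+ *             &n_params,
+ *             pixman_double_to_fixed (2.0), pixman_double_to_fixed (2.0),
+ *             PIXMAN_KERNEL_BOX, PIXMAN_KERNEL_BOX,
+ *             PIXMAN_KERNEL_LANCZOS2, PIXMAN_KERNEL_LANCZOS2,
+ *             4, 4);
+ *
+ *     if (params)
+ *     {
+ *         pixman_image_set_filter (src, PIXMAN_FILTER_SEPARABLE_CONVOLUTION,
+ *                                  params, n_params);
+ *         free (params);
+ *     }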
+ */ +PIXMAN_API +pixman_fixed_t * +pixman_filter_create_separable_convolution (int             *n_values, +					    pixman_fixed_t   scale_x, +					    pixman_fixed_t   scale_y, +					    pixman_kernel_t  reconstruct_x, +					    pixman_kernel_t  reconstruct_y, +					    pixman_kernel_t  sample_x, +					    pixman_kernel_t  sample_y, +					    int              subsample_bits_x, +					    int              subsample_bits_y); + + +PIXMAN_API +pixman_bool_t	pixman_image_fill_rectangles	     (pixman_op_t		    op, +						      pixman_image_t		   *image, +						      const pixman_color_t	   *color, +						      int			    n_rects, +						      const pixman_rectangle16_t   *rects); + +PIXMAN_API +pixman_bool_t   pixman_image_fill_boxes              (pixman_op_t                   op, +                                                      pixman_image_t               *dest, +                                                      const pixman_color_t         *color, +                                                      int                           n_boxes, +                                                      const pixman_box32_t         *boxes); + +/* Composite */ +PIXMAN_API +pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region, +					       pixman_image_t    *src_image, +					       pixman_image_t    *mask_image, +					       pixman_image_t    *dest_image, +					       int16_t            src_x, +					       int16_t            src_y, +					       int16_t            mask_x, +					       int16_t            mask_y, +					       int16_t            dest_x, +					       int16_t            dest_y, +					       uint16_t           width, +					       uint16_t           height); + +PIXMAN_API +void          pixman_image_composite          (pixman_op_t        op, +					       pixman_image_t    *src, +					       pixman_image_t    *mask, +					       pixman_image_t    *dest, +					       int16_t            src_x, +					       int16_t            src_y, +					       int16_t            mask_x, +					       int16_t            mask_y, +					       int16_t            dest_x, +					       int16_t            dest_y, +					       uint16_t           width, +					       uint16_t           height); + +PIXMAN_API +void          pixman_image_composite32        (pixman_op_t        op, +					       pixman_image_t    *src, +					       pixman_image_t    *mask, +					       pixman_image_t    *dest, +					       int32_t            src_x, +					       int32_t            src_y, +					       int32_t            mask_x, +					       int32_t            mask_y, +					       int32_t            dest_x, +					       int32_t            dest_y, +					       int32_t            width, +					       int32_t            height); + +/* Executive Summary: This function is a no-op that only exists + * for historical reasons. + * + * There used to be a bug in the X server where it would rely on + * out-of-bounds accesses when it was asked to composite with a + * window as the source. It would create a pixman image pointing + * to some bogus position in memory, but then set a clip region + * to the position where the actual bits were. + * + * Due to a bug in old versions of pixman, where it would not clip + * against the image bounds when a clip region was set, this would + * actually work. So when the pixman bug was fixed, a workaround was + * added to allow certain out-of-bound accesses. This function disabled + * those workarounds. 
+ * + * Since 0.21.2, pixman doesn't do these workarounds anymore, so now this + * function is a no-op. + */ +PIXMAN_API +void pixman_disable_out_of_bounds_workaround (void); + +/* + * Glyphs + */ +typedef struct pixman_glyph_cache_t pixman_glyph_cache_t; +typedef struct +{ +    int		x, y; +    const void *glyph; +} pixman_glyph_t; + +PIXMAN_API +pixman_glyph_cache_t *pixman_glyph_cache_create       (void); + +PIXMAN_API +void                  pixman_glyph_cache_destroy      (pixman_glyph_cache_t *cache); + +PIXMAN_API +void                  pixman_glyph_cache_freeze       (pixman_glyph_cache_t *cache); + +PIXMAN_API +void                  pixman_glyph_cache_thaw         (pixman_glyph_cache_t *cache); + +PIXMAN_API +const void *          pixman_glyph_cache_lookup       (pixman_glyph_cache_t *cache, +						       void                 *font_key, +						       void                 *glyph_key); + +PIXMAN_API +const void *          pixman_glyph_cache_insert       (pixman_glyph_cache_t *cache, +						       void                 *font_key, +						       void                 *glyph_key, +						       int		     origin_x, +						       int                   origin_y, +						       pixman_image_t       *glyph_image); + +PIXMAN_API +void                  pixman_glyph_cache_remove       (pixman_glyph_cache_t *cache, +						       void                 *font_key, +						       void                 *glyph_key); + +PIXMAN_API +void                  pixman_glyph_get_extents        (pixman_glyph_cache_t *cache, +						       int                   n_glyphs, +						       pixman_glyph_t       *glyphs, +						       pixman_box32_t       *extents); + +PIXMAN_API +pixman_format_code_t  pixman_glyph_get_mask_format    (pixman_glyph_cache_t *cache, +						       int		     n_glyphs, +						       const pixman_glyph_t *glyphs); + +PIXMAN_API +void                  pixman_composite_glyphs         (pixman_op_t           op, +						       pixman_image_t       *src, +						       pixman_image_t       *dest, +						       pixman_format_code_t  mask_format, +						       int32_t               src_x, +						       int32_t               src_y, +						       int32_t		     mask_x, +						       int32_t		     mask_y, +						       int32_t               dest_x, +						       int32_t               dest_y, +						       int32_t		     width, +						       int32_t		     height, +						       pixman_glyph_cache_t *cache, +						       int		     n_glyphs, +						       const pixman_glyph_t *glyphs); + +PIXMAN_API +void                  pixman_composite_glyphs_no_mask (pixman_op_t           op, +						       pixman_image_t       *src, +						       pixman_image_t       *dest, +						       int32_t               src_x, +						       int32_t               src_y, +						       int32_t               dest_x, +						       int32_t               dest_y, +						       pixman_glyph_cache_t *cache, +						       int		     n_glyphs, +						       const pixman_glyph_t *glyphs); + +/* + * Trapezoids + */ +typedef struct pixman_edge pixman_edge_t; +typedef struct pixman_trapezoid pixman_trapezoid_t; +typedef struct pixman_trap pixman_trap_t; +typedef struct pixman_span_fix pixman_span_fix_t; +typedef struct pixman_triangle pixman_triangle_t; + +/* + * An edge structure.  
This represents a single polygon edge + * and can be quickly stepped across small or large gaps in the + * sample grid + */ +struct pixman_edge +{ +    pixman_fixed_t	x; +    pixman_fixed_t	e; +    pixman_fixed_t	stepx; +    pixman_fixed_t	signdx; +    pixman_fixed_t	dy; +    pixman_fixed_t	dx; + +    pixman_fixed_t	stepx_small; +    pixman_fixed_t	stepx_big; +    pixman_fixed_t	dx_small; +    pixman_fixed_t	dx_big; +}; + +struct pixman_trapezoid +{ +    pixman_fixed_t	top, bottom; +    pixman_line_fixed_t	left, right; +}; + +struct pixman_triangle +{ +    pixman_point_fixed_t p1, p2, p3; +}; + +/* whether 't' is a well defined not obviously empty trapezoid */ +#define pixman_trapezoid_valid(t)				   \ +    ((t)->left.p1.y != (t)->left.p2.y &&			   \ +     (t)->right.p1.y != (t)->right.p2.y &&			   \ +     ((t)->bottom > (t)->top)) + +struct pixman_span_fix +{ +    pixman_fixed_t	l, r, y; +}; + +struct pixman_trap +{ +    pixman_span_fix_t	top, bot; +}; + +PIXMAN_API +pixman_fixed_t pixman_sample_ceil_y        (pixman_fixed_t             y, +					    int                        bpp); + +PIXMAN_API +pixman_fixed_t pixman_sample_floor_y       (pixman_fixed_t             y, +					    int                        bpp); + +PIXMAN_API +void           pixman_edge_step            (pixman_edge_t             *e, +					    int                        n); + +PIXMAN_API +void           pixman_edge_init            (pixman_edge_t             *e, +					    int                        bpp, +					    pixman_fixed_t             y_start, +					    pixman_fixed_t             x_top, +					    pixman_fixed_t             y_top, +					    pixman_fixed_t             x_bot, +					    pixman_fixed_t             y_bot); + +PIXMAN_API +void           pixman_line_fixed_edge_init (pixman_edge_t             *e, +					    int                        bpp, +					    pixman_fixed_t             y, +					    const pixman_line_fixed_t *line, +					    int                        x_off, +					    int                        y_off); + +PIXMAN_API +void           pixman_rasterize_edges      (pixman_image_t            *image, +					    pixman_edge_t             *l, +					    pixman_edge_t             *r, +					    pixman_fixed_t             t, +					    pixman_fixed_t             b); + +PIXMAN_API +void           pixman_add_traps            (pixman_image_t            *image, +					    int16_t                    x_off, +					    int16_t                    y_off, +					    int                        ntrap, +					    const pixman_trap_t       *traps); + +PIXMAN_API +void           pixman_add_trapezoids       (pixman_image_t            *image, +					    int16_t                    x_off, +					    int                        y_off, +					    int                        ntraps, +					    const pixman_trapezoid_t  *traps); + +PIXMAN_API +void           pixman_rasterize_trapezoid  (pixman_image_t            *image, +					    const pixman_trapezoid_t  *trap, +					    int                        x_off, +					    int                        y_off); + +PIXMAN_API +void          pixman_composite_trapezoids (pixman_op_t		       op, +					   pixman_image_t *	       src, +					   pixman_image_t *	       dst, +					   pixman_format_code_t	       mask_format, +					   int			       x_src, +					   int			       y_src, +					   int			       x_dst, +					   int			       y_dst, +					   int			       n_traps, +					   const pixman_trapezoid_t *  traps); + +PIXMAN_API +void          pixman_composite_triangles (pixman_op_t		       op, +					  pixman_image_t *	       
src, +					  pixman_image_t *	       dst, +					  pixman_format_code_t	       mask_format, +					  int			       x_src, +					  int			       y_src, +					  int			       x_dst, +					  int			       y_dst, +					  int			       n_tris, +					  const pixman_triangle_t *    tris); + +PIXMAN_API +void	      pixman_add_triangles       (pixman_image_t              *image, +					  int32_t	               x_off, +					  int32_t	               y_off, +					  int	                       n_tris, +					  const pixman_triangle_t     *tris); + +PIXMAN_END_DECLS + +#endif /* PIXMAN_H__ */ diff --git a/libs/pixman-0.40.0/pixman/solaris-hwcap.mapfile b/libs/pixman-0.40.0/pixman/solaris-hwcap.mapfile new file mode 100644 index 0000000..87efce1 --- /dev/null +++ b/libs/pixman-0.40.0/pixman/solaris-hwcap.mapfile @@ -0,0 +1,30 @@ +############################################################################### +# +# Copyright 2009, Oracle and/or its affiliates. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### +# +# Override the linker's detection of CMOV/MMX/SSE instructions so this +# library isn't flagged as only usable on CPU's with those ISA's, since it +# checks at runtime for availability before calling them + +hwcap_1 = V0x0 FPU OVERRIDE;  | 
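
For orientation, a minimal sketch (not part of the vendored sources) of how the compositing entry points declared in pixman.h above are typically driven; the image size, color and operator are arbitrary assumptions, and the program assumes the library is linked as pixman-1 (e.g. via pkg-config --cflags --libs pixman-1):

    #include <pixman.h>

    int
    main (void)
    {
        /* 64x64 a8r8g8b8 destination; NULL bits and 0 stride let pixman
         * allocate the pixel storage itself */
        pixman_image_t *dest =
            pixman_image_create_bits (PIXMAN_a8r8g8b8, 64, 64, NULL, 0);

        /* Opaque red solid source */
        pixman_color_t red = { 0xffff, 0x0000, 0x0000, 0xffff };
        pixman_image_t *src = pixman_image_create_solid_fill (&red);

        /* OVER-composite the source onto the upper-left 32x32 quadrant */
        pixman_image_composite32 (PIXMAN_OP_OVER,
                                  src, NULL, dest,
                                  0, 0,    /* src x, y */
                                  0, 0,    /* mask x, y (no mask) */
                                  0, 0,    /* dest x, y */
                                  32, 32); /* width, height */

        pixman_image_unref (src);
        pixman_image_unref (dest);
        return 0;
    }

The trapezoid and glyph variants declared above (pixman_composite_trapezoids, pixman_composite_glyphs) follow the same operator/source/destination/offset pattern, adding a mask format and a geometry list.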
