| author | sanine <sanine.not@pm.me> | 2022-10-12 13:26:36 -0500 |
|---|---|---|
| committer | sanine <sanine.not@pm.me> | 2022-10-12 13:26:36 -0500 |
| commit | f567ea1e2798fd3156a416e61f083ea3e6b95719 (patch) | |
| tree | 53b51bda93aadf9dd13fcd77635b8c4b5c813768 /libs/pixman-0.40.0/pixman/loongson-mmintrin.h | |
| parent | 530ffd0b7d3c39757b20f00716e486b5caf89aff (diff) | |
add pixman and libpngraylib
Diffstat (limited to 'libs/pixman-0.40.0/pixman/loongson-mmintrin.h')
| -rw-r--r-- | libs/pixman-0.40.0/pixman/loongson-mmintrin.h | 412 |
1 file changed, 412 insertions, 0 deletions
diff --git a/libs/pixman-0.40.0/pixman/loongson-mmintrin.h b/libs/pixman-0.40.0/pixman/loongson-mmintrin.h
new file mode 100644
index 0000000..0e79e86
--- /dev/null
+++ b/libs/pixman-0.40.0/pixman/loongson-mmintrin.h
@@ -0,0 +1,412 @@
+/* The gcc-provided loongson intrinsic functions are way too fucking broken
+ * to be of any use, otherwise I'd use them.
+ *
+ * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
+ *   close enough that they could have implemented the _mm_*-style intrinsic
+ *   interface and had a ton of optimized code available to them. Instead they
+ *   implemented something much, much worse.
+ *
+ * - pshuf takes a dead first argument, causing extra instructions to be
+ *   generated.
+ *
+ * - There are no 64-bit shift or logical intrinsics, which means you have
+ *   to implement them with inline assembly, but this is a nightmare because
+ *   gcc doesn't understand that the integer vector datatypes are actually in
+ *   floating-point registers, so you end up with braindead code like
+ *
+ *       punpcklwd   $f9,$f9,$f5
+ *           dmtc1   v0,$f8
+ *       punpcklwd   $f19,$f19,$f5
+ *           dmfc1   t9,$f9
+ *           dmtc1   v0,$f9
+ *           dmtc1   t9,$f20
+ *           dmfc1   s0,$f19
+ *       punpcklbh   $f20,$f20,$f2
+ *
+ * where crap just gets copied back and forth between integer and floating-
+ * point registers ad nauseum.
+ *
+ * Instead of trying to workaround the problems from these crap intrinsics, I
+ * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
+ * assembly.
+ */
+
+#include <stdint.h>
+
+/* vectors are stored in 64-bit floating-point registers */
+typedef double __m64;
+/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
+typedef float __m32;
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si64 (void)
+{
+    return 0.0;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("paddh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi32 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("paddw %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("paddush %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu8 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("paddusb %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si64 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("and %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("pcmpeqw %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_pi16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("pmaddhw %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("pmulhuh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("pmullh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si64 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("or %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pu16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("packushb %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi32 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("packsswh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
+{
+    if (__builtin_constant_p (__w3) &&
+        __builtin_constant_p (__w2) &&
+        __builtin_constant_p (__w1) &&
+        __builtin_constant_p (__w0))
+    {
+        uint64_t val = ((uint64_t)__w3 << 48)
+                     | ((uint64_t)__w2 << 32)
+                     | ((uint64_t)__w1 << 16)
+                     | ((uint64_t)__w0 <<  0);
+        return *(__m64 *)&val;
+    }
+    else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
+    {
+        /* TODO: handle other cases */
+        uint64_t val = __w3;
+        uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
+        __m64 ret;
+        asm("pshufh %0, %1, %2\n\t"
+            : "=f" (ret)
+            : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
+        );
+        return ret;
+    } else {
+        uint64_t val = ((uint64_t)__w3 << 48)
+                     | ((uint64_t)__w2 << 32)
+                     | ((uint64_t)__w1 << 16)
+                     | ((uint64_t)__w0 <<  0);
+        return *(__m64 *)&val;
+    }
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi32 (unsigned __i1, unsigned __i0)
+{
+    if (__builtin_constant_p (__i1) &&
+        __builtin_constant_p (__i0))
+    {
+        uint64_t val = ((uint64_t)__i1 << 32)
+                     | ((uint64_t)__i0 <<  0);
+        return *(__m64 *)&val;
+    }
+    else if (__i1 == __i0)
+    {
+        uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
+        __m64 ret;
+        asm("pshufh %0, %1, %2\n\t"
+            : "=f" (ret)
+            : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+        );
+        return ret;
+    } else {
+        uint64_t val = ((uint64_t)__i1 << 32)
+                     | ((uint64_t)__i0 <<  0);
+        return *(__m64 *)&val;
+    }
+}
+#undef _MM_SHUFFLE
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __m, int64_t __n)
+{
+    __m64 ret;
+    asm("pshufh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m), "f" (*(__m64 *)&__n)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi16 (__m64 __m, int64_t __count)
+{
+    __m64 ret;
+    asm("psllh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m), "f" (*(__m64 *)&__count)
+    );
+    return ret;
+}
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si64 (__m64 __m, int64_t __count)
+{
+    __m64 ret;
+    asm("dsll %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m), "f" (*(__m64 *)&__count)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi16 (__m64 __m, int64_t __count)
+{
+    __m64 ret;
+    asm("psrlh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m), "f" (*(__m64 *)&__count)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi32 (__m64 __m, int64_t __count)
+{
+    __m64 ret;
+    asm("psrlw %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m), "f" (*(__m64 *)&__count)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si64 (__m64 __m, int64_t __count)
+{
+    __m64 ret;
+    asm("dsrl %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m), "f" (*(__m64 *)&__count)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("psubh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("punpckhbh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("punpckhhw %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("punpcklbh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which
+ * allows load8888 to use 32-bit loads */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("punpcklbh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("punpcklhw %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si64 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("xor %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_extract_pi16 (__m64 __m, int64_t __pos)
+{
+    __m64 ret;
+    asm("pextrh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m), "f" (*(__m64 *)&__pos)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
+{
+    __m64 ret;
+    asm("pinsrh_%3 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+    );
+    return ret;
+}
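For context, the point of this header is that pixman-mmx.c can keep calling the familiar MMX-style `_mm_*` names on Loongson, with each call expanding to one of the inline-assembly wrappers above. The sketch below is a hypothetical illustration of that usage pattern, not code from pixman: the function name `scale_x8888_by_alpha` and the cheap `>> 8` stand-in for an exact divide-by-255 are assumptions for the example, and it only builds for a Loongson target where these instructions exist.

```c
/* Hypothetical usage sketch (not part of pixman): scale the four 8-bit
 * channels of an x8888 pixel by an 8-bit alpha using the wrappers above. */
#include <stdint.h>
#include "loongson-mmintrin.h"

static uint32_t
scale_x8888_by_alpha (uint32_t pixel, uint8_t alpha)
{
    __m64 zero = _mm_setzero_si64 ();

    /* Zero-extend the four packed bytes to four 16-bit lanes;
     * the __m32 argument lets the compiler issue a 32-bit load. */
    __m64 px16 = _mm_unpacklo_pi8_f (*(__m32 *)&pixel, zero);

    /* Broadcast alpha into every 16-bit lane; with all four words equal,
     * _mm_set_pi16 takes its pshufh path at runtime. */
    __m64 a16 = _mm_set_pi16 (alpha, alpha, alpha, alpha);

    /* Multiply per channel, then shift right by 8 as a rough stand-in
     * for the exact divide-by-255 real compositing code would use. */
    __m64 scaled = _mm_srli_pi16 (_mm_mullo_pi16 (px16, a16), 8);

    /* Pack the 16-bit lanes back down to bytes with unsigned saturation. */
    __m64 packed = _mm_packs_pu16 (scaled, zero);

    uint64_t out = *(uint64_t *)&packed;
    return (uint32_t)out;
}
```

The pointer-cast type punning between `uint32_t`, `__m32`, `__m64`, and `uint64_t` mirrors what the header itself does in `_mm_set_pi16` and `_mm_set_pi32`, since the vector types are just floating-point registers in disguise.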