diff --git a/src/dsp/enc_msa.c b/src/dsp/enc_msa.c index e4344600..10457f68 100644 --- a/src/dsp/enc_msa.c +++ b/src/dsp/enc_msa.c @@ -15,6 +15,7 @@ #if defined(WEBP_USE_MSA) +#include #include "./msa_macro.h" //------------------------------------------------------------------------------ @@ -165,6 +166,57 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { ST_SH2(out0, out1, out, 8); } +static int TTransform(const uint8_t* in, const uint16_t* w) { + int sum; + uint32_t in0_m, in1_m, in2_m, in3_m; + v16i8 src0; + v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3; + v4i32 dst0, dst1; + const v16i8 zero = { 0 }; + const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; + const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; + const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; + + LW4(in, BPS, in0_m, in1_m, in2_m, in3_m); + INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0); + ILVRL_B2_SH(zero, src0, tmp0, tmp1); + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1); + ADDSUB2(in0, in1, tmp0, tmp1); + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + ADDSUB2(tmp2, tmp3, tmp0, tmp1); + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1); + ADDSUB2(in0, in1, tmp0, tmp1); + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + ADDSUB2(tmp2, tmp3, tmp0, tmp1); + tmp0 = __msa_add_a_h(tmp0, (v8i16)zero); + tmp1 = __msa_add_a_h(tmp1, (v8i16)zero); + LD_SH2(w, 8, tmp2, tmp3); + DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1); + dst0 = dst0 + dst1; + sum = HADD_SW_S32(dst0); + return sum; +} + +static int Disto4x4(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int sum1 = TTransform(a, w); + const int sum2 = TTransform(b, w); + return abs(sum2 - sum1) >> 5; +} + +static int Disto16x16(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + int D = 0; + int x, y; + for (y = 0; y < 16 * BPS; y += 4 * BPS) { + for (x = 0; x < 16; x += 4) { + D += Disto4x4(a + x + y, b + x + y, w); + } + } + return D; +} + //------------------------------------------------------------------------------ // Entry point @@ -174,6 +226,9 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) { VP8ITransform = ITransform; VP8FTransform = FTransform; VP8FTransformWHT = FTransformWHT; + + VP8TDisto4x4 = Disto4x4; + VP8TDisto16x16 = Disto16x16; } #else // !WEBP_USE_MSA diff --git a/src/dsp/msa_macro.h b/src/dsp/msa_macro.h index c5c9d384..0ea1694c 100644 --- a/src/dsp/msa_macro.h +++ b/src/dsp/msa_macro.h @@ -393,6 +393,22 @@ } while (0) #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) +/* Description : Dot product of halfword vector elements + * Arguments : Inputs - mult0, mult1, cnst0, cnst1 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Signed halfword elements from 'mult0' are multiplied with + * signed halfword elements from 'cnst0' producing a result + * twice the size of input i.e. signed word. + * The multiplication result of adjacent odd-even elements + * are added together and written to the 'out0' vector + */ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ +} while (0) +#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) + /* Description : Dot product & addition of halfword vector elements * Arguments : Inputs - mult0, mult1, cnst0, cnst1 * Outputs - out0, out1 @@ -448,6 +464,22 @@ CLIP_SW_0_255(in3); \ } while (0) +/* Description : Horizontal addition of 4 signed word elements of input vector + * Arguments : Input - in (signed word vector) + * Output - sum_m (i32 sum) + * Return Type - signed word (GP) + * Details : 4 signed word elements of 'in' vector are added together and + * the resulting integer sum is returned + */ +static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) { + const v2i64 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); + const v2i64 res1_m = __msa_splati_d(res0_m, 1); + const v2i64 out = res0_m + res1_m; + int32_t sum_m = __msa_copy_s_w((v4i32)out, 0); + return sum_m; +} +#define HADD_SW_S32(in) func_hadd_sw_s32(in) + /* Description : Horizontal addition of 8 unsigned halfword elements * Arguments : Input - in (unsigned halfword vector) * Output - sum_m (u32 sum)