diff --git a/src/dsp/dec_msa.c b/src/dsp/dec_msa.c index 64603d16..8b9997fc 100644 --- a/src/dsp/dec_msa.c +++ b/src/dsp/dec_msa.c @@ -307,6 +307,35 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { ST6x1_UB(in0, start_in0_idx + 3, in1, start_in1_idx + 3, ptmp1, 4); \ } while (0) +#define LPF_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) do { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + FLIP_SIGN4(p1_in, p0_in, q0_in, q1_in, p1_m, p0_m, q0_m, q1_m); \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + FILT_VAL(q0_m, p0_m, mask, filt); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 = SRAI_B(filt1, 3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 = SRAI_B(filt2, 3); \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \ +} while (0) + +#define LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) do { \ + v16u8 p1_a_sub_q1, p0_a_sub_q0; \ + \ + p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \ + p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \ + p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1); \ + p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \ + mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \ + mask = (mask <= b_limit); \ +} while (0) + static void VFilter16(uint8_t *src, int stride, int b_limit_in, int limit_in, int thresh_in) { uint8_t *ptemp = src - 4 * stride; @@ -592,6 +621,55 @@ static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride, ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, src_v, stride); } +static void SimpleVFilter16(uint8_t* src, int stride, int b_limit_in) { + v16u8 p1, p0, q1, q0, mask; + const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in); + + LD_UB4(src - 2 * stride, stride, p1, p0, q0, q1); + LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask); + LPF_SIMPLE_FILT(p1, p0, q0, q1, mask); + ST_UB2(p0, q0, src - stride, stride); +} + +static void SimpleHFilter16(uint8_t* src, int stride, int b_limit_in) { + v16u8 p1, p0, q1, q0, mask, row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1; + const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in); + uint8_t* ptemp_src = src - 2; + + LD_UB8(ptemp_src, stride, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(ptemp_src + 8 * stride, stride, + row8, row9, row10, row11, row12, row13, row14, row15); + TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, + row8, row9, row10, row11, row12, row13, row14, row15, + p1, p0, q0, q1); + LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask); + LPF_SIMPLE_FILT(p1, p0, q0, q1, mask); + ILVRL_B2_SH(q0, p0, tmp1, tmp0); + ptemp_src += 1; + ST2x4_UB(tmp1, 0, ptemp_src, stride); + ptemp_src += 4 * stride; + ST2x4_UB(tmp1, 4, ptemp_src, stride); + ptemp_src += 4 * stride; + ST2x4_UB(tmp0, 0, ptemp_src, stride); + ptemp_src += 4 * stride; + ST2x4_UB(tmp0, 4, ptemp_src, stride); + ptemp_src += 4 * stride; +} + +static void SimpleVFilter16i(uint8_t* src_y, int stride, int b_limit_in) { + SimpleVFilter16(src_y + 4 * stride, stride, b_limit_in); + SimpleVFilter16(src_y + 8 * stride, stride, b_limit_in); + SimpleVFilter16(src_y + 12 * stride, stride, b_limit_in); +} + +static void SimpleHFilter16i(uint8_t* src_y, int stride, int b_limit_in) { + SimpleHFilter16(src_y + 4, stride, b_limit_in); + SimpleHFilter16(src_y + 8, stride, b_limit_in); + SimpleHFilter16(src_y + 12, stride, b_limit_in); +} + //------------------------------------------------------------------------------ // Entry point @@ -611,6 +689,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) { VP8HFilter8 = HFilter8; VP8VFilter8i = VFilter8i; VP8HFilter8i = HFilter8i; + VP8SimpleVFilter16 = SimpleVFilter16; + VP8SimpleHFilter16 = SimpleHFilter16; + VP8SimpleVFilter16i = SimpleVFilter16i; + VP8SimpleHFilter16i = SimpleHFilter16i; } #else // !WEBP_USE_MSA diff --git a/src/dsp/msa_macro.h b/src/dsp/msa_macro.h index 5bf72e2e..2a7dab72 100644 --- a/src/dsp/msa_macro.h +++ b/src/dsp/msa_macro.h @@ -243,6 +243,32 @@ #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__) +/* Description : Store 2x4 byte block to destination memory from input vector + * Arguments : Inputs - in, stidx, pdst, stride + * Details : Index 'stidx' halfword element from 'in' vector is copied to + * the GP register and stored to (pdst) + * Index 'stidx+1' halfword element from 'in' vector is copied to + * the GP register and stored to (pdst + stride) + * Index 'stidx+2' halfword element from 'in' vector is copied to + * the GP register and stored to (pdst + 2 * stride) + * Index 'stidx+3' halfword element from 'in' vector is copied to + * the GP register and stored to (pdst + 3 * stride) + */ +#define ST2x4_UB(in, stidx, pdst, stride) { \ + uint8_t* pblk_2x4_m = (uint8_t*)pdst; \ + const uint16_t out0_m = __msa_copy_s_h((v8i16)in, stidx); \ + const uint16_t out1_m = __msa_copy_s_h((v8i16)in, stidx + 1); \ + const uint16_t out2_m = __msa_copy_s_h((v8i16)in, stidx + 2); \ + const uint16_t out3_m = __msa_copy_s_h((v8i16)in, stidx + 3); \ + SH(out0_m, pblk_2x4_m); \ + pblk_2x4_m += stride; \ + SH(out1_m, pblk_2x4_m); \ + pblk_2x4_m += stride; \ + SH(out2_m, pblk_2x4_m); \ + pblk_2x4_m += stride; \ + SH(out3_m, pblk_2x4_m); \ +} + /* Description : Store 4x4 byte block to destination memory from input vector * Arguments : Inputs - in0, in1, pdst, stride * Details : 'Idx0' word element from input vector 'in0' is copied to the @@ -418,6 +444,22 @@ #define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__) #define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__) +/* Description : Interleave even word elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Even word elements of 'in0' and 'in1' are interleaved + * and written to 'out0' + */ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ +} +#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__) +#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) +#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__) +#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__) + /* Description : Interleave even-odd word elements from vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1 @@ -436,6 +478,24 @@ #define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__) #define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__) +/* Description : Interleave even-odd half-word elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Even half-word elements of 'in0' and 'in1' are interleaved + * and written to 'out0' + * Odd half-word elements of 'in2' and 'in3' are interleaved + * and written to 'out1' + */ +#define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2); \ +} +#define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__) +#define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__) +#define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__) +#define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__) + /* Description : Interleave even double word elements from vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1 @@ -450,6 +510,7 @@ #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) #define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__) #define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__) +#define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__) /* Description : Interleave left half of byte elements from vectors * Arguments : Inputs - in0, in1, in2, in3 @@ -708,6 +769,28 @@ out3 = in0 - in3; \ } +/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + * in8, in9, in10, in11, in12, in13, in14, in15 + * Outputs - out0, out1, out2, out3 + * Return Type - unsigned byte + */ +#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3) { \ + v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m; \ + ILVEV_W2_SD(in0, in4, in8, in12, tmp2_m, tmp3_m); \ + ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \ + ILVEV_D2_UB(tmp2_m, tmp3_m, tmp0_m, tmp1_m, out1, out3); \ + ILVEV_W2_SD(in2, in6, in10, in14, tmp4_m, tmp5_m); \ + ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \ + ILVEV_D2_SD(tmp4_m, tmp5_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \ + ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out0, out2); \ + ILVOD_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \ + ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out1, out3); \ +} + /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, * in8, in9, in10, in11, in12, in13, in14, in15