diff --git a/src/dsp/dec_msa.c b/src/dsp/dec_msa.c index e2f48bc0..f9f3e920 100644 --- a/src/dsp/dec_msa.c +++ b/src/dsp/dec_msa.c @@ -772,6 +772,99 @@ static void LD4(uint8_t* dst) { // Down-Left SW4(val0, val1, val2, val3, dst, BPS); } +// 16x16 + +static void DC16(uint8_t* dst) { // DC + uint32_t dc = 16; + int i; + const v16u8 rtop = LD_UB(dst - BPS); + const v8u16 dctop = __msa_hadd_u_h(rtop, rtop); + v16u8 out; + + for (i = 0; i < 16; ++i) { + dc += dst[-1 + i * BPS]; + } + dc += HADD_UH_U32(dctop); + out = (v16u8)__msa_fill_b(dc >> 5); + ST_UB8(out, out, out, out, out, out, out, out, dst, BPS); + ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); +} + +static void TM16(uint8_t* dst) { + int j; + v8i16 d1, d2; + const v16i8 zero = { 0 }; + const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]); + const v16i8 T = LD_SB(dst - BPS); + + ILVRL_B2_SH(zero, T, d1, d2); + SUB2(d1, TL, d2, TL, d1, d2); + for (j = 0; j < 16; j += 4) { + v16i8 t0, t1, t2, t3; + v8i16 r0, r1, r2, r3, r4, r5, r6, r7; + const v8i16 L0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]); + const v8i16 L1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]); + const v8i16 L2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]); + const v8i16 L3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]); + ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3); + ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7); + CLIP_SH4_0_255(r0, r1, r2, r3); + CLIP_SH4_0_255(r4, r5, r6, r7); + PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3); + ST_SB4(t0, t1, t2, t3, dst, BPS); + dst += 4 * BPS; + } +} + +static void VE16(uint8_t* dst) { // vertical + const v16u8 rtop = LD_UB(dst - BPS); + ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst, BPS); + ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst + 8 * BPS, BPS); +} + +static void HE16(uint8_t* dst) { // horizontal + int j; + for (j = 16; j > 0; j -= 4) { + const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]); + const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]); + const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]); + const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]); + ST_UB4(L0, L1, L2, L3, dst, BPS); + dst += 4 * BPS; + } +} + +static void DC16NoTop(uint8_t* dst) { // DC with top samples not available + int j; + uint32_t dc = 8; + v16u8 out; + + for (j = 0; j < 16; ++j) { + dc += dst[-1 + j * BPS]; + } + out = (v16u8)__msa_fill_b(dc >> 4); + ST_UB8(out, out, out, out, out, out, out, out, dst, BPS); + ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); +} + +static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available + uint32_t dc = 8; + const v16u8 rtop = LD_UB(dst - BPS); + const v8u16 dctop = __msa_hadd_u_h(rtop, rtop); + v16u8 out; + + dc += HADD_UH_U32(dctop); + out = (v16u8)__msa_fill_b(dc >> 4); + ST_UB8(out, out, out, out, out, out, out, out, dst, BPS); + ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); +} + +static void DC16NoTopLeft(uint8_t* dst) { // DC with nothing + const v16u8 out = (v16u8)__msa_fill_b(0x80); + ST_UB8(out, out, out, out, out, out, out, out, dst, BPS); + ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); +} + //------------------------------------------------------------------------------ // Entry point @@ -801,6 +894,13 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) { VP8PredLuma4[2] = VE4; VP8PredLuma4[4] = RD4; VP8PredLuma4[6] = LD4; + VP8PredLuma16[0] = DC16; + VP8PredLuma16[1] = TM16; + VP8PredLuma16[2] = VE16; + VP8PredLuma16[3] = HE16; + VP8PredLuma16[4] = DC16NoTop; + VP8PredLuma16[5] = DC16NoLeft; + VP8PredLuma16[6] = DC16NoTopLeft; } #else // !WEBP_USE_MSA diff --git a/src/dsp/msa_macro.h b/src/dsp/msa_macro.h index 20cae4ee..82bb598e 100644 --- a/src/dsp/msa_macro.h +++ b/src/dsp/msa_macro.h @@ -243,6 +243,13 @@ #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__) +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + pdst, stride) { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, pdst + 4 * stride, stride); \ +} +#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) + /* Description : Store 2x4 byte block to destination memory from input vector * Arguments : Inputs - in, stidx, pdst, stride * Details : Index 'stidx' halfword element from 'in' vector is copied to @@ -359,6 +366,25 @@ CLIP_SW_0_255(in3); \ } +/* Description : Horizontal addition of 8 unsigned halfword elements + * Arguments : Input - in (unsigned halfword vector) + * Output - sum_m (u32 sum) + * Return Type - unsigned word + * Details : 8 unsigned halfword elements of input vector are added + * together and the resulting integer sum is returned + */ +static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) { + uint32_t sum_m; + v2u64 res0_m, res1_m; + const v4u32 res_m = __msa_hadd_u_w(in, in); + res0_m = __msa_hadd_u_d(res_m, res_m); + res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); + res0_m = res0_m + res1_m; + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); + return sum_m; +} +#define HADD_UH_U32(in) func_hadd_uh_u32(in) + /* Description : Set element n input vector to GPR value * Arguments : Inputs - in0, in1, in2, in3 * Output - out @@ -660,6 +686,16 @@ #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__) +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) +#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) +#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__) + /* Description : Arithmetic immediate shift right all elements of word vector * Arguments : Inputs - in0, in1, shift * Outputs - in place operation @@ -745,6 +781,17 @@ ADD2(in4, in5, in6, in7, out2, out3); \ } +/* Description : Subtraction of 2 pairs of vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element in 'in1' is subtracted from 'in0' and result is + * written to 'out0'. + */ +#define SUB2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ +} + /* Description : Sign extend halfword elements from input vector and return * the result in pair of vectors * Arguments : Input - in (halfword vector)