diff --git a/src/dsp/enc_msa.c b/src/dsp/enc_msa.c index 04829dc5..f9becb14 100644 --- a/src/dsp/enc_msa.c +++ b/src/dsp/enc_msa.c @@ -762,6 +762,70 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { return sum; } +//------------------------------------------------------------------------------ +// Quantization + +static int QuantizeBlock(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + int sum; + v8i16 in0, in1, sh0, sh1, out0, out1; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1; + v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3; + const v8i16 zero = { 0 }; + const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 }; + const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 }; + const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL); + + LD_SH2(&in[0], 8, in0, in1); + LD_SH2(&mtx->sharpen_[0], 8, sh0, sh1); + tmp4 = __msa_add_a_h(in0, zero); + tmp5 = __msa_add_a_h(in1, zero); + ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1); + ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3); + HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3); + sign0 = (in0 < zero); + sign1 = (in1 < zero); // sign + LD_SH2(&mtx->iq_[0], 8, tmp0, tmp1); // iq + ILVRL_H2_SW(zero, tmp0, t0, t1); + ILVRL_H2_SW(zero, tmp1, t2, t3); + LD_SW4(&mtx->bias_[0], 4, b0, b1, b2, b3); // bias + MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3); + ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3); + SRAI_W4_SW(b0, b1, b2, b3, 17); + PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3); + tmp0 = (tmp2 > maxlevel); + tmp1 = (tmp3 > maxlevel); + tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0); + tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1); + SUB2(0, tmp2, 0, tmp3, tmp0, tmp1); + tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0); + tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1); + LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh + t0 = (s0 > t0); + t1 = (s1 > t1); + t2 = (s2 > t2); + t3 = (s3 > t3); + PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1); + tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0); + tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1); + LD_SH2(&mtx->q_[0], 8, tmp0, tmp1); + MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1); + VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1); + ST_SH2(in0, in1, &in[0], 8); + ST_SH2(out0, out1, &out[0], 8); + out0 = __msa_add_a_h(out0, out1); + sum = HADD_SH_S32(out0); + return (sum > 0); +} + +static int Quantize2Blocks(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { + int nz; + nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; + nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; + return nz; +} + //------------------------------------------------------------------------------ // Entry point @@ -783,6 +847,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) { VP8SSE16x8 = SSE16x8; VP8SSE8x8 = SSE8x8; VP8SSE4x4 = SSE4x4; + + VP8EncQuantizeBlock = QuantizeBlock; + VP8EncQuantize2Blocks = Quantize2Blocks; + VP8EncQuantizeBlockWHT = QuantizeBlock; } #else // !WEBP_USE_MSA diff --git a/src/dsp/msa_macro.h b/src/dsp/msa_macro.h index 45a7121a..8c3b5bce 100644 --- a/src/dsp/msa_macro.h +++ b/src/dsp/msa_macro.h @@ -235,6 +235,26 @@ #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__) #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) +/* Description : Load vectors with 4 word elements with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1 + * Details : Load 4 word elements in 'out0' from (psrc) + * Load 4 word elements in 'out1' from (psrc + stride) + */ +#define LD_W2(RTYPE, psrc, stride, out0, out1) do { \ + out0 = LD_W(RTYPE, psrc); \ + out1 = LD_W(RTYPE, psrc + stride); \ +} while (0) +#define LD_UW2(...) LD_W2(v4u32, __VA_ARGS__) +#define LD_SW2(...) LD_W2(v4i32, __VA_ARGS__) + +#define LD_W4(RTYPE, psrc, stride, out0, out1, out2, out3) do { \ + LD_W2(RTYPE, psrc, stride, out0, out1); \ + LD_W2(RTYPE, psrc + 2 * stride, stride, out2, out3); \ +} while (0) +#define LD_UW4(...) LD_W4(v4u32, __VA_ARGS__) +#define LD_SW4(...) LD_W4(v4i32, __VA_ARGS__) + /* Description : Store vectors of 16 byte elements with stride * Arguments : Inputs - in0, in1, pdst, stride * Details : Store 16 byte elements from 'in0' to (pdst) @@ -480,6 +500,23 @@ static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) { } #define HADD_SW_S32(in) func_hadd_sw_s32(in) +/* Description : Horizontal addition of 8 signed halfword elements + * Arguments : Input - in (signed halfword vector) + * Output - sum_m (s32 sum) + * Return Type - signed word + * Details : 8 signed halfword elements of input vector are added + * together and the resulting integer sum is returned + */ +static WEBP_INLINE int32_t func_hadd_sh_s32(v8i16 in) { + const v4i32 res = __msa_hadd_s_w(in, in); + const v2i64 res0 = __msa_hadd_s_d(res, res); + const v2i64 res1 = __msa_splati_d(res0, 1); + const v2i64 res2 = res0 + res1; + const int32_t sum_m = __msa_copy_s_w((v4i32)res2, 0); + return sum_m; +} +#define HADD_SH_S32(in) func_hadd_sh_s32(in) + /* Description : Horizontal addition of 8 unsigned halfword elements * Arguments : Input - in (unsigned halfword vector) * Output - sum_m (u32 sum) @@ -498,6 +535,26 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) { } #define HADD_UH_U32(in) func_hadd_uh_u32(in) +/* Description : Horizontal addition of signed half word vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd half word element from 'in0' is added to + even signed half word element from 'in0' (pairwise) and the + halfword result is written in 'out0' +*/ +#define HADD_SH2(RTYPE, in0, in1, out0, out1) do { \ + out0 = (RTYPE)__msa_hadd_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hadd_s_w((v8i16)in1, (v8i16)in1); \ +} while (0) +#define HADD_SH2_SW(...) HADD_SH2(v4i32, __VA_ARGS__) + +#define HADD_SH4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) do { \ + HADD_SH2(RTYPE, in0, in1, out0, out1); \ + HADD_SH2(RTYPE, in2, in3, out2, out3); \ +} while (0) +#define HADD_SH4_SW(...) HADD_SH4(v4i32, __VA_ARGS__) + /* Description : Horizontal subtraction of unsigned byte vector elements * Arguments : Inputs - in0, in1 * Outputs - out0, out1 @@ -990,6 +1047,23 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) { out1 = in0 - in1; \ } while (0) +/* Description : Multiplication of pairs of vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element from 'in0' is multiplied with elements from 'in1' + * and the result is written to 'out0' + */ +#define MUL2(in0, in1, in2, in3, out0, out1) do { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ +} while (0) + +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) do { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ +} while (0) + /* Description : Sign extend halfword elements from right half of the vector * Arguments : Input - in (halfword vector) * Output - out (sign extended word vector)