Add MSA optimized encoder transform functions

We add the following MSA optimized encoder transform functions: - ITransform - FTransform - FTransformWHT Change-Id: Ia6b17556aba5aff2d7a88208905fb45293d080a8
2025-07-15 13:29:54 +02:00 · 2016-07-05 20:00:43 +05:30
parent dce64bfa1b
commit 435308e029
8 changed files with 318 additions and 4 deletions
--- a/Android.mk
+++ b/Android.mk
@ -85,6 +85,7 @@ dsp_enc_srcs := \
    src/dsp/enc_avx2.c \
    src/dsp/enc_mips32.c \
    src/dsp/enc_mips_dsp_r2.c \
    src/dsp/enc_msa.c \
    src/dsp/enc_neon.$(NEON) \
    src/dsp/enc_sse2.c \
    src/dsp/enc_sse41.c \
--- a/Makefile.vc
+++ b/Makefile.vc
@ -233,6 +233,7 @@ DSP_ENC_OBJS = \
    $(DIROBJ)\dsp\enc_avx2.obj \
    $(DIROBJ)\dsp\enc_mips32.obj \
    $(DIROBJ)\dsp\enc_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\enc_msa.obj \
    $(DIROBJ)\dsp\enc_neon.obj \
    $(DIROBJ)\dsp\enc_sse2.obj \
    $(DIROBJ)\dsp\enc_sse41.obj \
--- a/build.gradle
+++ b/build.gradle
@ -165,6 +165,7 @@ model {
            include "enc_avx2.c"
            include "enc_mips32.c"
            include "enc_mips_dsp_r2.c"
 	    include "enc_msa.c"
            include "enc_neon.$NEON"
            include "enc_sse2.c"
            include "enc_sse41.c"
--- a/makefile.unix
+++ b/makefile.unix
@ -177,6 +177,7 @@ DSP_ENC_OBJS = \
    src/dsp/enc_avx2.o \
    src/dsp/enc_mips32.o \
    src/dsp/enc_mips_dsp_r2.o \
    src/dsp/enc_msa.o \
    src/dsp/enc_neon.o \
    src/dsp/enc_sse2.o \
    src/dsp/enc_sse41.o \
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -2,7 +2,7 @@ noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la
 noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
 noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la
 noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la
-noinst_LTLIBRARIES += libwebpdspdecode_msa.la
+noinst_LTLIBRARIES += libwebpdsp_msa.la libwebpdspdecode_msa.la
 if BUILD_LIBWEBPDECODER
  noinst_LTLIBRARIES += libwebpdspdecode.la
@ -86,8 +86,8 @@ libwebpdspdecode_msa_la_SOURCES += dec_msa.c
 libwebpdspdecode_msa_la_SOURCES += filters_msa.c
 libwebpdspdecode_msa_la_SOURCES += lossless_msa.c
 libwebpdspdecode_msa_la_SOURCES += msa_macro.h
-libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
+libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_msa_la_CPPFLAGS)
-libwebpdspdecode_msa_la_CFLAGS = $(AM_CFLAGS)
+libwebpdspdecode_msa_la_CFLAGS = $(libwebpdsp_msa_la_CFLAGS)
 libwebpdsp_sse2_la_SOURCES =
 libwebpdsp_sse2_la_SOURCES += argb_sse2.c
@ -112,6 +112,12 @@ libwebpdsp_neon_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_neon_la_CFLAGS = $(AM_CFLAGS) $(NEON_FLAGS)
 libwebpdsp_neon_la_LIBADD = libwebpdspdecode_neon.la
 libwebpdsp_msa_la_SOURCES =
 libwebpdsp_msa_la_SOURCES += enc_msa.c
 libwebpdsp_msa_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_msa_la_CFLAGS = $(AM_CFLAGS)
 libwebpdsp_msa_la_LIBADD = libwebpdspdecode_msa.la
 libwebpdsp_la_SOURCES = $(COMMON_SOURCES) $(ENC_SOURCES)
 noinst_HEADERS =
@ -126,7 +132,7 @@ libwebpdsp_la_LIBADD =
 libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la
 libwebpdsp_la_LIBADD += libwebpdsp_sse41.la
 libwebpdsp_la_LIBADD += libwebpdsp_neon.la
-libwebpdsp_la_LIBADD += libwebpdspdecode_msa.la
+libwebpdsp_la_LIBADD += libwebpdsp_msa.la
 if BUILD_LIBWEBPDECODER
  libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES)
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -795,6 +795,7 @@ extern void VP8EncDspInitAVX2(void);
 extern void VP8EncDspInitNEON(void);
 extern void VP8EncDspInitMIPS32(void);
 extern void VP8EncDspInitMIPSdspR2(void);
 extern void VP8EncDspInitMSA(void);
 static volatile VP8CPUInfo enc_last_cpuinfo_used =
    (VP8CPUInfo)&enc_last_cpuinfo_used;
@ -857,6 +858,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8EncDspInitMIPSdspR2();
    }
 #endif
 #if defined(WEBP_USE_MSA)
    if (VP8GetCPUInfo(kMSA)) {
      VP8EncDspInitMSA();
    }
 #endif
  }
  enc_last_cpuinfo_used = VP8GetCPUInfo;
--- a/src/dsp/enc_msa.c
+++ b/src/dsp/enc_msa.c
@ -0,0 +1,183 @@
 // Copyright 2016 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // MSA version of encoder dsp functions.
 //
 // Author:  Prashant Patil   (prashant.patil@imgtec.com)
 #include "./dsp.h"
 #if defined(WEBP_USE_MSA)
 #include "./msa_macro.h"
 //------------------------------------------------------------------------------
 // Transforms
 // One 1-D pass of the 4-point inverse transform on four word (32-bit) vectors.
 // With MUL(x, k) = (x * k) >> 16, the intermediate terms are:
 //   a = in0 + in2
 //   b = in0 - in2
 //   c = MUL(in1, 35468) - (MUL(in3, 20091) + in3)
 //   d = (MUL(in1, 20091) + in1) + MUL(in3, 35468)
 // and the four outputs are produced from (a, b, c, d) by BUTTERFLY_4.
 // NOTE(review): BUTTERFLY_4 is defined in msa_macro.h and is not visible
 // here; presumably it yields a + d, b + c, b - c, a - d -- confirm.
 // The inputs are evaluated several times, so pass plain vector variables.
 #define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  v4i32 a1_m, b1_m, c1_m, d1_m;                                     \
  const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091);              \
  const v4i32 sinpi8sqrt2 = __msa_fill_w(35468);                    \
  v4i32 c_tmp1_m = in1 * sinpi8sqrt2;                               \
  v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1;                         \
  v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1;                         \
  v4i32 d_tmp2_m = in3 * sinpi8sqrt2;                               \
                                                                    \
  ADDSUB2(in0, in2, a1_m, b1_m);                                    \
  SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16);                               \
  c_tmp2_m = c_tmp2_m + in3;                                        \
  c1_m = c_tmp1_m - c_tmp2_m;                                       \
  SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16);                               \
  d_tmp1_m = d_tmp1_m + in1;                                        \
  d1_m = d_tmp1_m + d_tmp2_m;                                       \
  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);      \
 } while (0)
 // Inverse-transforms the 16 coefficients at 'in', adds the result to the 4x4
 // block read from 'ref' and stores the clipped result as a 4x4 block at 'dst'.
 // Rows of 'ref' and 'dst' are addressed with a stride of BPS.
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
                                      uint8_t* dst) {
  v8i16 input0, input1;
  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
  v4i32 res0, res1, res2, res3;
  v16i8 dest0, dest1, dest2, dest3;
  const v16i8 zero = { 0 };
  // Load the coefficients as two halfword vectors and widen them to four
  // word vectors.
  LD_SH2(in, 8, input0, input1);
  UNPCK_SH_SW(input0, in0, in1);
  UNPCK_SH_SW(input1, in2, in3);
  // First 1-D pass, transpose, second 1-D pass.
  IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
  TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
  IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
  // Final scaling by 3 bits.
  // NOTE(review): SRARI presumably rounds, i.e. (x + 4) >> 3 -- confirm.
  SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
  TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
  // Load four rows of 'ref' and widen the low bytes to words by interleaving
  // with zero twice (byte -> halfword -> word).
  LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
             res0, res1, res2, res3);
  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
             res0, res1, res2, res3);
  // Add the residual to the reference and clamp to [0, 255].
  ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
  CLIP_SW4_0_255(res0, res1, res2, res3);
  // Pack the words back down to bytes (two pack-even steps) and store 4x4.
  // vt0/vt1 are reused as scratch here.
  PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
  res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
  // The word indices are passed in reverse (3, 2, 1, 0).
  // NOTE(review): presumably this undoes the row order left by the packing
  // above -- confirm against ST4x4_UB.
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }
 // Runs the inverse transform on one 4x4 block, or on two horizontally
 // adjacent blocks when 'do_two' is non-zero. The second block is 4 pixels to
 // the right in 'ref'/'dst' and 16 coefficients further on in 'in'.
 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                        int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int i;
  for (i = 0; i < num_blocks; ++i) {
    ITransformOne(ref + 4 * i, in + 16 * i, dst + 4 * i);
  }
 }
 // Forward 4x4 transform of the difference between the 4x4 blocks at 'src'
 // and 'ref' (rows BPS apart); writes 16 coefficients to 'out'.
 // Per-lane notation in the comments below: "a0+a1" / "a0-a1" are the
 // horizontal add / subtract results, "a2,a3" are the pair fed to the dot
 // products.
 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
  v16u8 srcl0, srcl1, src0, src1;
  // Halfword shuffle controls: mask0/mask1 regroup the 16 values before each
  // add/sub stage; mask2/mask3 interleave the two halves of one vector.
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
  // Multiplier pairs for the dot-product-accumulate steps.
  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
  // Gather four 4-byte rows of 'src' and of 'ref' into one vector each.
  LW4(src, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src0);
  LW4(ref, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src1);
  // Interleave the two and subtract byte pairs to get 16 halfword differences.
  // NOTE(review): presumably src - ref per pixel; confirm the operand order
  // of ILVRL_B2_UB / HSUB.
  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
  HSUB_UB2_SH(srcl0, srcl1, t0, t1);
  // First pass: regroup, then t0 = sums, t1 = differences.
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  // NOTE(review): despite its name, SRLI_H is relied on here as a scale step;
  // its non-clang definition is 'a << b' (i.e. x8), presumably matching the
  // scalar transform. Confirm that every build of the macro is a left shift.
  t0 = SRLI_H(t0, 3);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  // tmp0 / tmp2: horizontal add / subtract of the interleaved sums.
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  // tmp1 = 1812 + dot(t2, cnst1), tmp3 = 937 + dot(t2, cnst0), then >> 9.
  FILL_W2_SW(1812, 937, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 9);
  // Narrow the four word vectors back to two halfword vectors.
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  // Second pass: same regroup / add-sub / interleave sequence, without the
  // scale step.
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  // tmp0, tmp2 = (x + 7) >> 4.
  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
  SRAI_W2_SW(tmp0, tmp2, 4);
  // tmp1 = (12000 + dot(t2, cnst1)) >> 16, tmp3 = (51000 + dot(t2, cnst0)) >> 16.
  FILL_W2_SW(12000, 51000, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 16);
  // tmp1 += 1 in every lane where the sign-extended right half of t1 is
  // non-zero: build an (x == 0) mask, invert it (nor with itself), AND with 1.
  UNPCK_R_SH_SW(t1, tmp4);
  tmp5 = __msa_ceqi_w(tmp4, 0);
  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
  tmp5 = __msa_fill_w(1);
  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
  tmp1 += tmp5;
  // Narrow to halfwords and store the result as four 64-bit pieces.
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  out0 = __msa_copy_s_d((v2i64)t0, 0);
  out1 = __msa_copy_s_d((v2i64)t0, 1);
  out2 = __msa_copy_s_d((v2i64)t1, 0);
  out3 = __msa_copy_s_d((v2i64)t1, 1);
  // NOTE(review): the stride of 8 is presumably in bytes (4 coefficients per
  // store) -- confirm against SD4.
  SD4(out0, out1, out2, out3, out, 8);
 }
 // Walsh-Hadamard-style forward transform over 16 values taken from 'in' at
 // offsets 0, 16, 32, ..., 240 (one value every 16 coefficients). Writes 16
 // results, each shifted right by 1, to out[0..15].
 static void FTransformWHT(const int16_t* in, int16_t* out) {
  v8i16 in0 = { 0 };
  v8i16 in1 = { 0 };
  v8i16 tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1;
  // Halfword shuffle controls: mask0/mask1 pair up the low halves and the
  // high halves of two vectors; mask2/mask3 regroup between the two passes.
  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  // Gather the inputs. For each group base g in {0, 64, 128, 192}:
  //   in0 = { in[g + 0]  x4, in[g + 16] x4 }
  //   in1 = { in[g + 48] x4, in[g + 32] x4 }
  // so the first add/sub pairs offset 0 with 48 and offset 16 with 32.
  in0 = __msa_insert_h(in0, 0, in[  0]);
  in0 = __msa_insert_h(in0, 1, in[ 64]);
  in0 = __msa_insert_h(in0, 2, in[128]);
  in0 = __msa_insert_h(in0, 3, in[192]);
  in0 = __msa_insert_h(in0, 4, in[ 16]);
  in0 = __msa_insert_h(in0, 5, in[ 80]);
  in0 = __msa_insert_h(in0, 6, in[144]);
  in0 = __msa_insert_h(in0, 7, in[208]);
  in1 = __msa_insert_h(in1, 0, in[ 48]);
  in1 = __msa_insert_h(in1, 1, in[112]);
  in1 = __msa_insert_h(in1, 2, in[176]);
  in1 = __msa_insert_h(in1, 3, in[240]);
  in1 = __msa_insert_h(in1, 4, in[ 32]);
  in1 = __msa_insert_h(in1, 5, in[ 96]);
  in1 = __msa_insert_h(in1, 6, in[160]);
  in1 = __msa_insert_h(in1, 7, in[224]);
  // First pass: two add/sub butterfly stages with a regroup in between.
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
  // Regroup for the second pass (in0/in1 are reused as scratch).
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
  // Second pass: the same two butterfly stages.
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, out0, out1);
  // Halve the results (arithmetic shift) and store out[0..7], out[8..15].
  SRAI_H2_SH(out0, out1, 1);
  ST_SH2(out0, out1, out, 8);
 }
 //------------------------------------------------------------------------------
 // Entry point
 extern void VP8EncDspInitMSA(void);
 // Points the encoder's transform function pointers at the MSA
 // implementations defined in this file.
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
  VP8FTransformWHT = FTransformWHT;
  VP8FTransform = FTransform;
  VP8ITransform = ITransform;
 }
 #else  // !WEBP_USE_MSA
 WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
 #endif  // WEBP_USE_MSA
--- a/src/dsp/msa_macro.h
+++ b/src/dsp/msa_macro.h
@ -23,14 +23,18 @@
 /* Immediate add / shift helpers.
  * With CLANG_BUILD the MSA intrinsics are used (the second operand must be
  * an immediate); otherwise the generic vector operators are used.
  *
  * NOTE: despite its name, SRLI_H is a LEFT shift of halfword elements. Its
  * operator form is 'a << b' and FTransform() in enc_msa.c uses
  * SRLI_H(x, 3) to scale values by 8. The intrinsic form therefore has to be
  * __msa_slli_h (shift left), not __msa_srli_h (shift right logical), so that
  * both branches compute the same thing.
  *
  * All operands are parenthesized so that expression arguments expand safely.
  */
 #ifdef CLANG_BUILD
  #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)(a), (b))
  #define ADDVI_W(a, b)  __msa_addvi_w((v4i32)(a), (b))
  #define SRAI_B(a, b)  __msa_srai_b((v16i8)(a), (b))
  #define SRAI_H(a, b)  __msa_srai_h((v8i16)(a), (b))
  #define SRAI_W(a, b)  __msa_srai_w((v4i32)(a), (b))
  #define SRLI_H(a, b)  __msa_slli_h((v8i16)(a), (b))
 #else
  #define ADDVI_H(a, b)  ((a) + (b))
  #define ADDVI_W(a, b)  ((a) + (b))
  #define SRAI_B(a, b)  ((a) >> (b))
  #define SRAI_H(a, b)  ((a) >> (b))
  #define SRAI_W(a, b)  ((a) >> (b))
  #define SRLI_H(a, b)  ((a) << (b))
 #endif
 #define LD_B(RTYPE, psrc) *((RTYPE*)(psrc))
@ -257,6 +261,18 @@
 } while (0)
 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
 /* Description : Store 2 vectors of 8 halfword elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 8 halfword elements from 'in0' to (pdst)
 *               Store 8 halfword elements from 'in1' to (pdst + stride)
 *               'stride' is added to 'pdst' with pointer arithmetic, so it
 *               is counted in elements of the type 'pdst' points to.
 *               'pdst' is evaluated twice; pass an expression without side
 *               effects.
 */
 #define ST_H2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_H(RTYPE, in0, pdst);                          \
  ST_H(RTYPE, in1, (pdst) + (stride));             \
 } while (0)
 #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
 /* Description : Store 2x4 byte block to destination memory from input vector
 * Arguments   : Inputs - in, stidx, pdst, stride
 * Details     : Index 'stidx' halfword element from 'in' vector is copied to
@ -377,6 +393,22 @@
 } while (0)
 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
 /* Description : Dot product & addition of halfword vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               In/Outs - out0, out1 (accumulators: read, then overwritten)
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication results of adjacent odd-even elements
 *               are summed and added to the value already in 'out0'.
 *               'out1' is accumulated the same way from 'mult1' and 'cnst1'.
 */
 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  (out0) = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0),       \
                                  (v8i16)(cnst0));                     \
  (out1) = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1),       \
                                  (v8i16)(cnst1));                     \
 } while (0)
 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
 /* Description : Clips all signed halfword elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output  - val
@ -434,6 +466,22 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
 }
 #define HADD_UH_U32(in) func_hadd_uh_u32(in)
 /* Description : Horizontal subtraction of unsigned byte vector elements
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Within each adjacent byte pair of 'in0', the even-indexed
 *               unsigned byte is subtracted from the odd-indexed unsigned
 *               byte (odd - even) and the halfword result is written to
 *               'out0'. 'out1' is computed the same way from 'in1'.
 *               (Operand order as specified for the MSA HSUB_U.H
 *               instruction; 'in0' is used for both source operands.)
 */
 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) do {           \
  (out0) = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0));  \
  (out1) = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1));  \
 } while (0)
 #define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
 #define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__)
 /* Description : Set element n input vector to GPR value
 * Arguments   : Inputs - in0, in1, in2, in3
 *               Output - out
@ -745,6 +793,23 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
 #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
 /* Description : Pack even halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' are copied to the left half of
 *               'out0' & even halfword elements of 'in1' are copied to the
 *               right half of 'out0'.
 *               'out1' is built the same way from 'in2' (left half) and
 *               'in3' (right half).
 */
 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {    \
  (out0) = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1));    \
  (out1) = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3));    \
 } while (0)
 #define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__)
 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
 #define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)
 /* Description : Arithmetic immediate shift right all elements of word vector
 * Arguments   : Inputs  - in0, in1, shift
 *               Outputs - in place operation
@ -814,6 +879,30 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
 #define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)
 #define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)
 /* Description : Add a value to each of 2 word vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : 'in1' is added to each word element of 'in0' via ADDVI_W and
 *               the result is written to 'out0'.
 *               'in3' is added to each word element of 'in2' and the result
 *               is written to 'out1'.
 *               ADDVI_W expands to an add-immediate intrinsic under
 *               CLANG_BUILD, so 'in1' and 'in3' should be immediate
 *               constants.
 */
 #define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)ADDVI_W(in0, in1);                            \
  out1 = (RTYPE)ADDVI_W(in2, in3);                            \
 } while (0)
 #define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__)
 /* Description : Fill 2 word vectors from GP register values
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : GP register in0 is replicated in each word element of out0
 *               GP register in1 is replicated in each word element of out1
 */
 #define FILL_W2(RTYPE, in0, in1, out0, out1) do {  \
  out0 = (RTYPE)__msa_fill_w(in0);                 \
  out1 = (RTYPE)__msa_fill_w(in1);                 \
 } while (0)
 #define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__)
 /* Description : Addition of 2 pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
@ -842,6 +931,32 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
  out1 = in2 - in3;                                \
 } while (0)
 /* Description : Addition - Subtraction of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 * Details     : Each element in 'in1' is added to 'in0' and result is
 *               written to 'out0'.
 *               Each element in 'in1' is subtracted from 'in0' and result is
 *               written to 'out1'.
 *               'out0' is written before 'out1' is computed, so 'out0' must
 *               not be the same object as 'in0' or 'in1'.
 *               Inputs are parenthesized so that expression arguments
 *               (e.g. 'x << 1') are combined as whole operands.
 */
 #define ADDSUB2(in0, in1, out0, out1) do {  \
  (out0) = (in0) + (in1);                   \
  (out1) = (in0) - (in1);                   \
 } while (0)
 /* Description : Sign extend halfword elements from right half of the vector
 * Arguments   : Input  - in    (halfword vector)
 *               Output - out   (sign extended word vector)
 *               Return Type - signed word
 * Details     : A per-element sign mask of 'in' is computed (in < 0) and
 *               interleaved with the right half of the same vector 'in' to
 *               generate 4 word elements keeping sign intact.
 *               'in' is evaluated twice; pass an expression without side
 *               effects.
 */
 #define UNPCK_R_SH_SW(in, out) do {                     \
  const v8i16 sign_m = __msa_clti_s_h((v8i16)(in), 0);  \
  (out) = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in));     \
 } while (0)
 /* Description : Sign extend halfword elements from input vector and return
 *               the result in pair of vectors
 * Arguments   : Input   - in            (halfword vector)