wasm: Add VFilter16

BUG=webp:352 Change-Id: I97f38aee5de063957c1512f6bd429c0e84c02087
2025-09-01 00:32:07 +02:00 · 2017-07-05 15:50:47 -07:00
parent e6e3ec335c
commit bafa90ccd8
1 changed files with 295 additions and 0 deletions
--- a/src/dsp/dec_wasm.c
+++ b/src/dsp/dec_wasm.c
@@ -305,6 +305,299 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
  }
 }
 //------------------------------------------------------------------------------
 // Loop Filter (Paragraph 15)
 /*
  The saturating add/sub instructions are not supported yet, but they are
  expected to be added.  Until then, we cheat and use the x86 builtins.
  See https://github.com/WebAssembly/meetings/blob/master/2017/CG-05.md
    Poll: Adopt the saturating integer arithmetic operations
    {i8x16,i16x8}.{add,sub}_saturate_[su].
 */
 // Lane-wise addition of unsigned 8-bit values with saturation.
 // Currently forwards to the x86 'paddusb' compiler builtin.
 static WEBP_INLINE uint8x16 uint8x16_add_sat(const uint8x16 a,
                                              const uint8x16 b) {
  // TODO(slavarnway): add generic implementation for non-x86
  const uint8x16 sum = (uint8x16)__builtin_ia32_paddusb128(a, b);
  return sum;
 }
 // Lane-wise addition of signed 8-bit values with saturation.
 // Currently forwards to the x86 'paddsb' compiler builtin.
 static WEBP_INLINE int8x16 int8x16_add_sat(const int8x16 a, const int8x16 b) {
  // TODO(slavarnway): add generic implementation for non-x86
  const int8x16 sum = (int8x16)__builtin_ia32_paddsb128(a, b);
  return sum;
 }
 // Lane-wise subtraction (a - b) of unsigned 8-bit values with saturation.
 // Currently forwards to the x86 'psubusb' compiler builtin.
 static WEBP_INLINE uint8x16 uint8x16_sub_sat(const uint8x16 a,
                                              const uint8x16 b) {
  // TODO(slavarnway): add generic implementation for non-x86
  const uint8x16 diff = (uint8x16)__builtin_ia32_psubusb128(a, b);
  return diff;
 }
 // Lane-wise subtraction (a - b) of signed 8-bit values with saturation.
 // Currently forwards to the x86 'psubsb' compiler builtin.
 static WEBP_INLINE int8x16 int8x16_sub_sat(const int8x16 a, const int8x16 b) {
  // TODO(slavarnway): add generic implementation for non-x86
  const int8x16 diff = (int8x16)__builtin_ia32_psubsb128(a, b);
  return diff;
 }
 // Lane-wise maximum of two vectors of unsigned 8-bit values.
 static WEBP_INLINE uint8x16 _max_u8x16(const uint8x16 a, const uint8x16 b) {
  // Comparison result per lane; used below as a bitwise selection mask.
  const uint8x16 a_wins = (a > b);
  // Bitwise select: start from 'b' and swap in the bits of 'a' wherever the
  // mask is set. Same result as (mask & a) | (~mask & b).
  return b ^ ((a ^ b) & a_wins);
 }
 // Returns abs(p - q) per lane, with the lanes handled as unsigned bytes.
 // Of the two unsigned saturating differences (p - q) and (q - p), at most one
 // is non-zero, so OR-ing them gives the absolute difference.
 static WEBP_INLINE int8x16 abs_diff(int8x16 p, int8x16 q) {
  const uint8x16 up = (uint8x16)p;
  const uint8x16 uq = (uint8x16)q;
  const uint8x16 p_minus_q = uint8x16_sub_sat(up, uq);
  const uint8x16 q_minus_p = uint8x16_sub_sat(uq, up);
  return (int8x16)(p_minus_q | q_minus_p);
 }
 // int16 to int8 with saturation.
 static inline int8x16 _pack_sw_2_sb(const int16x8 lo, const int16x8 hi) {
 #if 1
  const int16x8 k7f = splat_int16(0x007f);
  const int16x8 kff80 = splat_int16(0xff80);
  const int16x8 s1_lo = (lo < k7f);
  const int16x8 a_lo = (s1_lo & lo) | (~s1_lo & k7f);
  const int16x8 s2_lo = (a_lo > kff80);
  const int16x8 a2_lo = (s2_lo & a_lo) | (~s2_lo & kff80);
  const int16x8 s1_hi = (hi < k7f);
  const int16x8 a_hi = (s1_hi & hi) | (~s1_hi & k7f);
  const int16x8 s2_hi = (a_hi > kff80);
  const int16x8 a2_hi = (s2_hi & a_hi) | (~s2_hi & kff80);
  return (int8x16)__builtin_shufflevector((int8x16)a2_lo, (int8x16)a2_hi, 0, 2,
                                          4, 6, 8, 10, 12, 14, 16, 18, 20, 22,
                                          24, 26, 28, 30);
 #else
  return (int8x16)__builtin_ia32_packsswb128(lo, hi);
 #endif
 }
 // Shifts each byte of "x" right by 3 bits while preserving the sign bit.
 // Every byte is interleaved with a zero byte to form a 16-bit lane (zero byte
 // first, data byte second), the lanes are shifted right by 8 + 3, and the
 // results are packed back to bytes.
 static WEBP_INLINE void SignedShift8b(int8x16* const x) {
  const int8x16 kZero = {0};
  const int16x8 kShift = splat_int16(8 + 3);
  const int8x16 bytes = *x;
  // Shuffle index 16 selects a byte of kZero; indices 0..15 select 'bytes'.
  const int16x8 wide_lo = (int16x8)__builtin_shufflevector(
      bytes, kZero, 16, 0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7);
  const int16x8 wide_hi = (int16x8)__builtin_shufflevector(
      bytes, kZero, 16, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16,
      15);
  *x = _pack_sw_2_sb(wide_lo >> kShift, wide_hi >> kShift);
 }
 // XORs 'a' and 'b' in place with a variable named 'sign_bit', which must be in
 // scope at the point of use (the callers in this file define it as a splat of
 // 0x80). Arguments must be modifiable lvalues and are evaluated twice.
 // Wrapped in do/while(0) so an invocation behaves as a single statement,
 // including inside an unbraced if/else.
 #define FLIP_SIGN_BIT2(a, b) \
  do {                       \
    (a) = (a) ^ sign_bit;    \
    (b) = (b) ^ sign_bit;    \
  } while (0)
 // Same as FLIP_SIGN_BIT2, applied to four values.
 #define FLIP_SIGN_BIT4(a, b, c, d) \
  do {                             \
    FLIP_SIGN_BIT2(a, b);          \
    FLIP_SIGN_BIT2(c, d);          \
  } while (0)
 // input/output is uint8_t
 // Builds the "not high edge variance" mask: a lane of *not_hev is set when
 // max(abs(p1 - p0), abs(q1 - q0)) <= hev_thresh, and is zero otherwise.
 static WEBP_INLINE void GetNotHEV(const int8x16* const p1,
                                   const int8x16* const p0,
                                   const int8x16* const q0,
                                   const int8x16* const q1, int hev_thresh,
                                   int8x16* const not_hev) {
  const int8x16 kZero = {0};
  const int8x16 limit = splat_uint8(hev_thresh);
  const int8x16 diff_p = abs_diff(*p1, *p0);
  const int8x16 diff_q = abs_diff(*q1, *q0);
  const int8x16 worst = _max_u8x16(diff_p, diff_q);
  // An unsigned saturating subtraction is zero exactly where worst <= limit.
  const int8x16 excess = uint8x16_sub_sat(worst, limit);
  *not_hev = (excess == kZero);
 }
 // input pixels are int8_t
 // Writes p1 - q1 + 3 * (q0 - p0) to *delta, built up one signed saturating
 // operation at a time.
 static WEBP_INLINE void GetBaseDelta(const int8x16* const p1,
                                      const int8x16* const p0,
                                      const int8x16* const q0,
                                      const int8x16* const q1,
                                      int8x16* const delta) {
  // beware of addition order, for saturation!
  const int8x16 outer = int8x16_sub_sat(*p1, *q1);  // p1 - q1
  const int8x16 inner = int8x16_sub_sat(*q0, *p0);  // q0 - p0
  int8x16 acc = int8x16_add_sat(outer, inner);      // outer + 1 * inner
  acc = int8x16_add_sat(inner, acc);                // outer + 2 * inner
  acc = int8x16_add_sat(inner, acc);                // outer + 3 * inner
  *delta = acc;
 }
 // input and output are int8_t
 // Applies the simple filter step: q0 -= (fl + 4) >> 3 and p0 += (fl + 3) >> 3,
 // with signed saturation on every addition and subtraction.
 static WEBP_INLINE void DoSimpleFilter(int8x16* const p0, int8x16* const q0,
                                        const int8x16* const fl) {
  const int8x16 three = splat_uint8(3);
  const int8x16 four = splat_uint8(4);
  int8x16 p_step = int8x16_add_sat(*fl, three);  // fl + 3
  int8x16 q_step = int8x16_add_sat(*fl, four);   // fl + 4
  SignedShift8b(&q_step);                        // (fl + 4) >> 3
  SignedShift8b(&p_step);                        // (fl + 3) >> 3
  *q0 = int8x16_sub_sat(*q0, q_step);
  *p0 = int8x16_add_sat(*p0, p_step);
 }
 // Updates values of 2 pixels at MB edge during complex filtering.
 // The 16-bit inputs are shifted right by 7 and packed (with saturation) into a
 // byte-wide delta; then p = p + delta and q = q - delta, both saturating.
 // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
 static WEBP_INLINE void Update2Pixels(int8x16* const pi, int8x16* const qi,
                                       const int16x8* const a0_lo,
                                       const int16x8* const a0_hi) {
  const int16x8 kShift = splat_int16(7);
  const int8x16 kSignBit = (int8x16)splat_uint8(0x80);
  const int8x16 delta = _pack_sw_2_sb(*a0_lo >> kShift, *a0_hi >> kShift);
  *pi = int8x16_add_sat(*pi, delta);
  *qi = int8x16_sub_sat(*qi, delta);
  // Flip the top bit of every byte.
  *pi = *pi ^ kSignBit;
  *qi = *qi ^ kSignBit;
 }
 // input pixels are uint8_t
 // Sets a lane of *mask where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= thresh
 // (the sum being computed with unsigned saturation), and clears it otherwise.
 static WEBP_INLINE void NeedsFilter(const int8x16* const p1,
                                     const int8x16* const p0,
                                     const int8x16* const q0,
                                     const int8x16* const q1, int thresh,
                                     int8x16* const mask) {
  const int8x16 kZero = {0};
  const int16x8 kOne = {1, 1, 1, 1, 1, 1, 1, 1};
  const uint8x16 kClearLsb = splat_uint8(0xFE);
  const int8x16 limit = splat_uint8(thresh);
  // abs(p1 - q1) / 2. The shift is done on 16-bit lanes, so the lsb of every
  // byte is cleared first to keep it from leaking into the neighbouring byte.
  const int8x16 outer = abs_diff(*p1, *q1);
  const uint16x8 outer_even = (uint16x8)((uint8x16)outer & kClearLsb);
  const uint16x8 outer_half = outer_even >> kOne;
  // abs(p0 - q0) * 2
  const int8x16 inner = abs_diff(*p0, *q0);
  const int8x16 inner_x2 = uint8x16_add_sat(inner, inner);
  // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
  const int8x16 total = uint8x16_add_sat(inner_x2, outer_half);
  // Zero exactly where total <= limit.
  const int8x16 excess = uint8x16_sub_sat(total, limit);
  *mask = (excess == kZero);
 }
 //------------------------------------------------------------------------------
 // Edge filtering functions
 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
 // Lanes selected by *mask are filtered: those with high edge variance get the
 // simple filter on p0/q0 only, the others get the strong filter on all six
 // pixels. The six vectors are flipped with sign_bit before filtering and
 // flipped back by Update2Pixels().
 static WEBP_INLINE void DoFilter6(int8x16* const p2, int8x16* const p1,
                                   int8x16* const p0, int8x16* const q0,
                                   int8x16* const q1, int8x16* const q2,
                                   const int8x16* const mask, int hev_thresh) {
  const int8x16 zero = {0};
  // The FLIP_SIGN_BIT* macros refer to this variable by name.
  const int8x16 sign_bit = splat_uint8(0x80);
  int8x16 base, not_hev;

  // The hev mask is computed before the sign flip below.
  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
  GetBaseDelta(p1, p0, q0, q1, &base);

  {  // simple filter, restricted to the lanes with hev
    const int8x16 hev_lanes = (~not_hev) & *mask;
    const int8x16 simple = base & hev_lanes;
    DoSimpleFilter(p0, q0, &simple);
  }

  {  // strong filter, restricted to the lanes without hev
    const int32x4 k9 = {0x0900, 0x0900, 0x0900, 0x0900};
    const int16x8 k63 = splat_int16(63);
    const int8x16 strong_lanes = not_hev & *mask;
    const int8x16 strong = base & strong_lanes;
    // Widen to 16-bit lanes: a zero byte followed by the filter byte.
    const int16x8 wide_lo = (int16x8)__builtin_shufflevector(
        strong, zero, 16, 0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7);
    const int16x8 wide_hi = (int16x8)__builtin_shufflevector(
        strong, zero, 16, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16,
        15);
    // NOTE(review): _mulhi_int16x8 is defined outside this section; the
    // "* 9" reading follows the original comments -- confirm against it.
    const int16x8 f9_lo = _mulhi_int16x8(wide_lo, k9);  // Filter (lo) * 9
    const int16x8 f9_hi = _mulhi_int16x8(wide_hi, k9);  // Filter (hi) * 9
    const int16x8 w9_lo = f9_lo + k63;                  // Filter * 9 + 63
    const int16x8 w9_hi = f9_hi + k63;
    const int16x8 w18_lo = w9_lo + f9_lo;               // Filter * 18 + 63
    const int16x8 w18_hi = w9_hi + f9_hi;
    const int16x8 w27_lo = w18_lo + f9_lo;              // Filter * 27 + 63
    const int16x8 w27_hi = w18_hi + f9_hi;
    Update2Pixels(p2, q2, &w9_lo, &w9_hi);
    Update2Pixels(p1, q1, &w18_lo, &w18_hi);
    Update2Pixels(p0, q0, &w27_lo, &w27_hi);
  }
 }
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 // Sets 'm' to the maximum of abs_diff(p1, p0), abs_diff(p3, p2) and
 // abs_diff(p2, p1). Any previous value of 'm' is discarded.
 #define MAX_DIFF1(p3, p2, p1, p0, m)                                       \
  do {                                                                     \
    (m) = _max_u8x16(abs_diff((p1), (p0)), abs_diff((p3), (p2)));          \
    (m) = _max_u8x16((m), abs_diff((p2), (p1)));                           \
  } while (0)
 // Raises 'm' to the maximum of its current value, abs_diff(p1, p0),
 // abs_diff(p3, p2) and abs_diff(p2, p1).
 #define MAX_DIFF2(p3, p2, p1, p0, m)                                       \
  do {                                                                     \
    (m) = _max_u8x16(                                                      \
        _max_u8x16(_max_u8x16((m), abs_diff((p1), (p0))),                  \
                   abs_diff((p3), (p2))),                                  \
        abs_diff((p2), (p1)));                                             \
  } while (0)
 // Copies four 16-byte rows into e1..e4, reading from &p[0 * stride],
 // &p[1 * stride], &p[2 * stride] and &p[3 * stride].
 // 'stride' is parenthesized so that an expression argument (e.g. a + b) is
 // multiplied as a whole; 'p' and 'stride' are evaluated four times each.
 // e1..e4 must name objects of at least 16 bytes.
 // Wrapped in do/while(0) so an invocation behaves as a single statement,
 // including inside an unbraced if/else.
 #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) \
  do {                                           \
    memcpy(&(e1), &(p)[0 * (stride)], 16);       \
    memcpy(&(e2), &(p)[1 * (stride)], 16);       \
    memcpy(&(e3), &(p)[2 * (stride)], 16);       \
    memcpy(&(e4), &(p)[3 * (stride)], 16);       \
  } while (0)
 // Finalizes the filtering mask. On entry *mask holds per-lane difference
 // values (the caller here fills it with MAX_DIFF1/MAX_DIFF2); on exit a lane
 // is set only if that value is <= ithresh and NeedsFilter() also selects the
 // lane for 'thresh'.
 static WEBP_INLINE void ComplexMask(const int8x16* const p1,
                                     const int8x16* const p0,
                                     const int8x16* const q0,
                                     const int8x16* const q1, int thresh,
                                     int ithresh, int8x16* const mask) {
  const int8x16 kZero = {0};
  const uint8x16 limit = splat_uint8(ithresh);
  // Zero exactly where the incoming value is <= ithresh.
  const int8x16 excess = uint8x16_sub_sat(*mask, limit);
  const int8x16 within_limit = (excess == kZero);
  int8x16 edge_mask;
  NeedsFilter(p1, p0, q0, q1, thresh, &edge_mask);
  *mask = within_limit & edge_mask;
 }
 // on macroblock edges
 // Filters 16 bytes per row around the row boundary just above 'p': reads the
 // four rows p[-4 * stride .. -1 * stride] and the four rows
 // p[0 .. 3 * stride], then writes back the three rows on each side of the
 // boundary.
 static void VFilter16(uint8_t* p, int stride, int thresh, int ithresh,
                       int hev_thresh) {
  int8x16 p3, p2, p1, p0;
  int8x16 q0, q1, q2, q3;
  int8x16 mask;
  uint8_t* const above = p - 4 * stride;

  // Rows before the boundary; they start the running max of differences.
  LOAD_H_EDGES4(above, stride, p3, p2, p1, p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

  // Rows from the boundary on; they are folded into the same max.
  LOAD_H_EDGES4(p, stride, q0, q1, q2, q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store; the outermost rows (p3, q3) are never written back.
  memcpy(p - 3 * stride, &p2, 16);
  memcpy(p - 2 * stride, &p1, 16);
  memcpy(p - 1 * stride, &p0, 16);
  memcpy(p, &q0, 16);
  memcpy(p + 1 * stride, &q1, 16);
  memcpy(p + 2 * stride, &q2, 16);
 }
 //------------------------------------------------------------------------------
 // 4x4 predictions
@@ -700,6 +993,8 @@ extern void VP8DspInitWASM(void);
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitWASM(void) {
  VP8Transform = Transform;
  VP8VFilter16 = VFilter16;
  VP8PredLuma4[1] = TM4;
  VP8PredLuma4[2] = VE4;
  VP8PredLuma4[4] = RD4;