From bafa90ccd8c3e5b40cf3e8e3745abcf5dc93d423 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Wed, 5 Jul 2017 15:50:47 -0700
Subject: [PATCH] wasm: Add VFilter16

BUG=webp:352

Change-Id: I97f38aee5de063957c1512f6bd429c0e84c02087
---
 src/dsp/dec_wasm.c | 295 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)

diff --git a/src/dsp/dec_wasm.c b/src/dsp/dec_wasm.c
index b8937f09..f3f9c2fe 100644
--- a/src/dsp/dec_wasm.c
+++ b/src/dsp/dec_wasm.c
@@ -305,6 +305,299 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
   }
 }
 
+//------------------------------------------------------------------------------
+// Loop Filter (Paragraph 15)
+
+/*
+  The saturating add/sub instructions are not supported yet, but they are
+  expected to be adopted. For now, we cheat and use the x86 builtins.
+
+  See https://github.com/WebAssembly/meetings/blob/master/2017/CG-05.md
+  Poll: Adopt the saturating integer arithmetic operations
+  {i8x16,i16x8}.{add,sub}_saturate_[su].
+*/
+
+static WEBP_INLINE uint8x16 uint8x16_add_sat(const uint8x16 a,
+                                             const uint8x16 b) {
+  // TODO(slavarnway): add generic implementation for non-x86
+  return (uint8x16)__builtin_ia32_paddusb128(a, b);
+}
+
+static WEBP_INLINE int8x16 int8x16_add_sat(const int8x16 a, const int8x16 b) {
+  // TODO(slavarnway): add generic implementation for non-x86
+  return (int8x16)__builtin_ia32_paddsb128(a, b);
+}
+
+static WEBP_INLINE uint8x16 uint8x16_sub_sat(const uint8x16 a,
+                                             const uint8x16 b) {
+  // TODO(slavarnway): add generic implementation for non-x86
+  return (uint8x16)__builtin_ia32_psubusb128(a, b);
+}
+
+static WEBP_INLINE int8x16 int8x16_sub_sat(const int8x16 a, const int8x16 b) {
+  // TODO(slavarnway): add generic implementation for non-x86
+  return (int8x16)__builtin_ia32_psubsb128(a, b);
+}
+
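+// Illustrative sketch only (nothing in this change uses it): a possible
+// generic fallback for the unsigned helpers above, for targets without the
+// x86 builtins. The *_generic names are placeholders; the code relies on the
+// same clang vector extensions used throughout this file.
+static WEBP_INLINE uint8x16 uint8x16_add_sat_generic(const uint8x16 a,
+                                                     const uint8x16 b) {
+  const uint8x16 sum = a + b;                    // wraps around on overflow
+  const uint8x16 wrapped = (uint8x16)(sum < a);  // 0xff where a + b wrapped
+  return sum | wrapped;                          // clamp wrapped lanes to 255
+}
+
+static WEBP_INLINE uint8x16 uint8x16_sub_sat_generic(const uint8x16 a,
+                                                     const uint8x16 b) {
+  const uint8x16 diff = a - b;                   // wraps around on underflow
+  const uint8x16 no_wrap = (uint8x16)(b <= a);   // 0xff where a >= b
+  return diff & no_wrap;                         // clamp wrapped lanes to 0
+}
+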
+static WEBP_INLINE uint8x16 _max_u8x16(const uint8x16 a, const uint8x16 b) {
+  const uint8x16 s1 = (a > b);
+  return (s1 & a) | (~s1 & b);
+}
+
+// Compute abs(p - q) = subs(p - q) OR subs(q - p)
+static WEBP_INLINE int8x16 abs_diff(int8x16 p, int8x16 q) {
+  const int8x16 a = uint8x16_sub_sat(p, q);
+  const int8x16 b = uint8x16_sub_sat(q, p);
+  return a | b;
+}
+
+// int16 to int8 with saturation.
+static WEBP_INLINE int8x16 _pack_sw_2_sb(const int16x8 lo, const int16x8 hi) {
+#if 1
+  const int16x8 k7f = splat_int16(0x007f);
+  const int16x8 kff80 = splat_int16(0xff80);
+  const int16x8 s1_lo = (lo < k7f);
+  const int16x8 a_lo = (s1_lo & lo) | (~s1_lo & k7f);
+  const int16x8 s2_lo = (a_lo > kff80);
+  const int16x8 a2_lo = (s2_lo & a_lo) | (~s2_lo & kff80);
+  const int16x8 s1_hi = (hi < k7f);
+  const int16x8 a_hi = (s1_hi & hi) | (~s1_hi & k7f);
+  const int16x8 s2_hi = (a_hi > kff80);
+  const int16x8 a2_hi = (s2_hi & a_hi) | (~s2_hi & kff80);
+  return (int8x16)__builtin_shufflevector((int8x16)a2_lo, (int8x16)a2_hi, 0, 2,
+                                          4, 6, 8, 10, 12, 14, 16, 18, 20, 22,
+                                          24, 26, 28, 30);
+#else
+  return (int8x16)__builtin_ia32_packsswb128(lo, hi);
+#endif
+}
+
+// Shift each byte of "x" by 3 bits while preserving the sign bit.
+static WEBP_INLINE void SignedShift8b(int8x16* const x) {
+  const int8x16 zero = {0};
+  const int16x8 eleven = splat_int16(3 + 8);
+  const int16x8 lo_0 = (int16x8)__builtin_shufflevector(
+      *x, zero, 16, 0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7);
+  const int16x8 hi_0 = (int16x8)__builtin_shufflevector(
+      *x, zero, 16, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15);
+  const int16x8 lo_1 = lo_0 >> eleven;
+  const int16x8 hi_1 = hi_0 >> eleven;
+  *x = _pack_sw_2_sb(lo_1, hi_1);
+}
+
+#define FLIP_SIGN_BIT2(a, b) \
+  {                          \
+    a = a ^ sign_bit;        \
+    b = b ^ sign_bit;        \
+  }
+
+#define FLIP_SIGN_BIT4(a, b, c, d) \
+  {                                \
+    FLIP_SIGN_BIT2(a, b);          \
+    FLIP_SIGN_BIT2(c, d);          \
+  }
+
+// input/output is uint8_t
+static WEBP_INLINE void GetNotHEV(const int8x16* const p1,
+                                  const int8x16* const p0,
+                                  const int8x16* const q0,
+                                  const int8x16* const q1, int hev_thresh,
+                                  int8x16* const not_hev) {
+  const int8x16 zero = {0};
+  const int8x16 t_1 = abs_diff(*p1, *p0);
+  const int8x16 t_2 = abs_diff(*q1, *q0);
+  const int8x16 h = splat_uint8(hev_thresh);
+  const int8x16 t_max = _max_u8x16(t_1, t_2);
+  const int8x16 t_max_h = uint8x16_sub_sat(t_max, h);
+  *not_hev = (t_max_h == zero);  // not_hev <= t1 && not_hev <= t2
+}
+
+// input pixels are int8_t
+static WEBP_INLINE void GetBaseDelta(const int8x16* const p1,
+                                     const int8x16* const p0,
+                                     const int8x16* const q0,
+                                     const int8x16* const q1,
+                                     int8x16* const delta) {
+  // beware of addition order, for saturation!
+  const int8x16 p1_q1 = int8x16_sub_sat(*p1, *q1);    // p1 - q1
+  const int8x16 q0_p0 = int8x16_sub_sat(*q0, *p0);    // q0 - p0
+  const int8x16 s1 = int8x16_add_sat(p1_q1, q0_p0);   // p1 - q1 + 1 * (q0 - p0)
+  const int8x16 s2 = int8x16_add_sat(q0_p0, s1);      // p1 - q1 + 2 * (q0 - p0)
+  const int8x16 s3 = int8x16_add_sat(q0_p0, s2);      // p1 - q1 + 3 * (q0 - p0)
+  *delta = s3;
+}
+
+// input and output are int8_t
+static WEBP_INLINE void DoSimpleFilter(int8x16* const p0, int8x16* const q0,
+                                       const int8x16* const fl) {
+  const int8x16 k3 = splat_uint8(3);
+  const int8x16 k4 = splat_uint8(4);
+  int8x16 v3 = int8x16_add_sat(*fl, k3);
+  int8x16 v4 = int8x16_add_sat(*fl, k4);
+  SignedShift8b(&v4);              // v4 >> 3
+  SignedShift8b(&v3);              // v3 >> 3
+  *q0 = int8x16_sub_sat(*q0, v4);  // q0 -= v4
+  *p0 = int8x16_add_sat(*p0, v3);  // p0 += v3
+}
+
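+// For reference, GetBaseDelta() and DoSimpleFilter() above compute, in scalar
+// terms and with every intermediate step saturated to the int8_t range:
+//   delta = (p1 - q1) + 3 * (q0 - p0)
+//   p0 += (delta + 3) >> 3;
+//   q0 -= (delta + 4) >> 3;
+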
+// Updates values of 2 pixels at MB edge during complex filtering.
+// Update operations:
+// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
+// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
+static WEBP_INLINE void Update2Pixels(int8x16* const pi, int8x16* const qi,
+                                      const int16x8* const a0_lo,
+                                      const int16x8* const a0_hi) {
+  const int16x8 _7 = splat_int16(7);
+  const int16x8 a1_lo = *a0_lo >> _7;
+  const int16x8 a1_hi = *a0_hi >> _7;
+  const int8x16 delta = _pack_sw_2_sb(a1_lo, a1_hi);
+  const int8x16 sign_bit = (int8x16)splat_uint8(0x80);
+  *pi = int8x16_add_sat(*pi, delta);
+  *qi = int8x16_sub_sat(*qi, delta);
+  FLIP_SIGN_BIT2(*pi, *qi);
+}
+
+// input pixels are uint8_t
+static WEBP_INLINE void NeedsFilter(const int8x16* const p1,
+                                    const int8x16* const p0,
+                                    const int8x16* const q0,
+                                    const int8x16* const q1, int thresh,
+                                    int8x16* const mask) {
+  const int8x16 zero = {0};
+  const int16x8 one = {1, 1, 1, 1, 1, 1, 1, 1};
+  const int8x16 m_thresh = splat_uint8(thresh);
+  const int8x16 t1 = abs_diff(*p1, *q1);        // abs(p1 - q1)
+  const uint8x16 kFE = splat_uint8(0xFE);
+  const uint16x8 t2 = t1 & kFE;                 // set lsb of each byte to zero
+  const uint16x8 t3 = t2 >> one;                // abs(p1 - q1) / 2
+  const int8x16 t4 = abs_diff(*p0, *q0);        // abs(p0 - q0)
+  const int8x16 t5 = uint8x16_add_sat(t4, t4);  // abs(p0 - q0) * 2
+  const int8x16 t6 = uint8x16_add_sat(t5, t3);  // abs(p0-q0)*2 + abs(p1-q1)/2
+  const int8x16 t7 = uint8x16_sub_sat(t6, m_thresh);  // mask <= m_thresh
+  *mask = (t7 == zero);
+}
+
+//------------------------------------------------------------------------------
+// Edge filtering functions
+
+// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
+static WEBP_INLINE void DoFilter6(int8x16* const p2, int8x16* const p1,
+                                  int8x16* const p0, int8x16* const q0,
+                                  int8x16* const q1, int8x16* const q2,
+                                  const int8x16* const mask, int hev_thresh) {
+  const int8x16 zero = {0};
+  const int8x16 sign_bit = splat_uint8(0x80);
+  int8x16 a, not_hev;
+
+  // compute hev mask
+  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+  FLIP_SIGN_BIT2(*p2, *q2);
+  GetBaseDelta(p1, p0, q0, q1, &a);
+
+  {  // do simple filter on pixels with hev
+    const int8x16 m = (~not_hev) & *mask;
+    const int8x16 f = a & m;
+    DoSimpleFilter(p0, q0, &f);
+  }
+
+  {  // do strong filter on pixels with not hev
+    const int32x4 k9 = {0x0900, 0x0900, 0x0900, 0x0900};
+    const int16x8 k63 = splat_int16(63);
+
+    const int16x8 m = not_hev & *mask;
+    const int16x8 f = a & m;
+    const int16x8 f_lo =
+        (int16x8)__builtin_shufflevector((int8x16)f, zero, 16, 0, 16, 1, 16, 2,
+                                         16, 3, 16, 4, 16, 5, 16, 6, 16, 7);
+    const int16x8 f_hi = (int16x8)__builtin_shufflevector(
+        (int8x16)f, zero, 16, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14,
+        16, 15);
+
+    const int16x8 f9_lo = _mulhi_int16x8(f_lo, k9);  // Filter (lo) * 9
+    const int16x8 f9_hi = _mulhi_int16x8(f_hi, k9);  // Filter (hi) * 9
+
+    const int16x8 a2_lo = f9_lo + k63;  // Filter * 9 + 63
+    const int16x8 a2_hi = f9_hi + k63;  // Filter * 9 + 63
+
+    const int16x8 a1_lo = a2_lo + f9_lo;  // Filter * 18 + 63
+    const int16x8 a1_hi = a2_hi + f9_hi;  // Filter * 18 + 63
+
+    const int16x8 a0_lo = a1_lo + f9_lo;  // Filter * 27 + 63
+    const int16x8 a0_hi = a1_hi + f9_hi;  // Filter * 27 + 63
+
+    Update2Pixels(p2, q2, &a2_lo, &a2_hi);
+    Update2Pixels(p1, q1, &a1_lo, &a1_hi);
+    Update2Pixels(p0, q0, &a0_lo, &a0_hi);
+  }
+}
+
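+// In scalar terms, the strong-filter branch above updates the six pixels as
+//   a  = (p1 - q1) + 3 * (q0 - p0)    (from GetBaseDelta, masked to not-hev)
+//   p2 += ( 9 * a + 63) >> 7;   q2 -= ( 9 * a + 63) >> 7;
+//   p1 += (18 * a + 63) >> 7;   q1 -= (18 * a + 63) >> 7;
+//   p0 += (27 * a + 63) >> 7;   q0 -= (27 * a + 63) >> 7;
+// The "Filter * 9 + 63" .. "Filter * 27 + 63" vectors hold these sums before
+// Update2Pixels() applies the final ">> 7".
+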
+//------------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+#define MAX_DIFF1(p3, p2, p1, p0, m)       \
+  do {                                     \
+    m = abs_diff(p1, p0);                  \
+    m = _max_u8x16(m, abs_diff(p3, p2));   \
+    m = _max_u8x16(m, abs_diff(p2, p1));   \
+  } while (0)
+
+#define MAX_DIFF2(p3, p2, p1, p0, m)       \
+  do {                                     \
+    m = _max_u8x16(m, abs_diff(p1, p0));   \
+    m = _max_u8x16(m, abs_diff(p3, p2));   \
+    m = _max_u8x16(m, abs_diff(p2, p1));   \
+  } while (0)
+
+#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) \
+  {                                              \
+    memcpy(&e1, &(p)[0 * stride], 16);           \
+    memcpy(&e2, &(p)[1 * stride], 16);           \
+    memcpy(&e3, &(p)[2 * stride], 16);           \
+    memcpy(&e4, &(p)[3 * stride], 16);           \
+  }
+
+static WEBP_INLINE void ComplexMask(const int8x16* const p1,
+                                    const int8x16* const p0,
+                                    const int8x16* const q0,
+                                    const int8x16* const q1, int thresh,
+                                    int ithresh, int8x16* const mask) {
+  const int8x16 zero = {0};
+  const uint8x16 it = splat_uint8(ithresh);
+  const int8x16 diff = uint8x16_sub_sat(*mask, it);
+  const int8x16 thresh_mask = (diff == zero);
+  int8x16 filter_mask;
+  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
+  *mask = thresh_mask & filter_mask;
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride, int thresh, int ithresh,
+                      int hev_thresh) {
+  int8x16 t1;
+  int8x16 mask;
+  int8x16 p2, p1, p0, q0, q1, q2;
+
+  // Load p3, p2, p1, p0
+  LOAD_H_EDGES4(p - 4 * stride, stride, t1, p2, p1, p0);
+  MAX_DIFF1(t1, p2, p1, p0, mask);
+
+  // Load q0, q1, q2, q3
+  LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
+  MAX_DIFF2(t1, q2, q1, q0, mask);
+
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  // Store
+  memcpy(&p[-3 * stride], &p2, 16);
+  memcpy(&p[-2 * stride], &p1, 16);
+  memcpy(&p[-1 * stride], &p0, 16);
+  memcpy(&p[+0 * stride], &q0, 16);
+  memcpy(&p[+1 * stride], &q1, 16);
+  memcpy(&p[+2 * stride], &q2, 16);
+}
+
 //------------------------------------------------------------------------------
 // 4x4 predictions
 
@@ -700,6 +993,8 @@ extern void VP8DspInitWASM(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitWASM(void) {
   VP8Transform = Transform;
+  VP8VFilter16 = VFilter16;
+
   VP8PredLuma4[1] = TM4;
   VP8PredLuma4[2] = VE4;
   VP8PredLuma4[4] = RD4;
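Usage note: the second hunk above installs the new function in the dsp dispatch table. A minimal caller sketch follows; it assumes the "./dsp.h" include path used inside src/dsp/, and the wrapper name, row pointer and threshold values are illustrative only, not decoder code:

    #include <stdint.h>
    #include "./dsp.h"             // declares the VP8VFilter16 function pointer

    extern void VP8DspInitWASM(void);  // as declared in the hunk above

    // Filters a horizontal luma macroblock edge. 'q0_row' points at the first
    // row below the edge: VFilter16 reads four rows above and four rows below
    // it, 16 pixels wide, and rewrites the six rows from -3 to +2.
    static void FilterLumaEdge(uint8_t* const q0_row, int stride) {
      VP8DspInitWASM();  // installs VFilter16 into VP8VFilter16
      VP8VFilter16(q0_row, stride, 12 /* thresh */, 8 /* ithresh */,
                   2 /* hev_thresh */);
    }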