From bafa90ccd8c3e5b40cf3e8e3745abcf5dc93d423 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Wed, 5 Jul 2017 15:50:47 -0700
Subject: [PATCH] wasm: Add VFilter16

BUG=webp:352

Change-Id: I97f38aee5de063957c1512f6bd429c0e84c02087
---
 src/dsp/dec_wasm.c | 295 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)

diff --git a/src/dsp/dec_wasm.c b/src/dsp/dec_wasm.c
index b8937f09..f3f9c2fe 100644
--- a/src/dsp/dec_wasm.c
+++ b/src/dsp/dec_wasm.c
@@ -305,6 +305,299 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
   }
 }
 
+//------------------------------------------------------------------------------
+// Loop Filter (Paragraph 15)
+
+/*
+  The saturating add/sub instructions are not supported yet, but they are
+  expected to be adopted. For now, we cheat and use the x86 builtins.
+
+  See https://github.com/WebAssembly/meetings/blob/master/2017/CG-05.md
+  Poll: Adopt the saturating integer arithmetic operations
+  {i8x16,i16x8}.{add,sub}_saturate_[su].
+*/
+
+static WEBP_INLINE uint8x16 uint8x16_add_sat(const uint8x16 a,
+                                             const uint8x16 b) {
+  // TODO(slavarnway): add generic implementation for non-x86
+  return (uint8x16)__builtin_ia32_paddusb128(a, b);
+}
+
+static WEBP_INLINE int8x16 int8x16_add_sat(const int8x16 a, const int8x16 b) {
+  // TODO(slavarnway): add generic implementation for non-x86
+  return (int8x16)__builtin_ia32_paddsb128(a, b);
+}
+
+static WEBP_INLINE uint8x16 uint8x16_sub_sat(const uint8x16 a,
+                                             const uint8x16 b) {
+  // TODO(slavarnway): add generic implementation for non-x86
+  return (uint8x16)__builtin_ia32_psubusb128(a, b);
+}
+
+static WEBP_INLINE int8x16 int8x16_sub_sat(const int8x16 a, const int8x16 b) {
+  // TODO(slavarnway): add generic implementation for non-x86
+  return (int8x16)__builtin_ia32_psubsb128(a, b);
+}
+
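+// Illustrative sketch only (nothing in this change uses it): a possible
+// generic fallback for the unsigned helpers above, for targets without the
+// x86 builtins. The *_generic names are placeholders; the code relies on the
+// same clang vector extensions used throughout this file.
+static WEBP_INLINE uint8x16 uint8x16_add_sat_generic(const uint8x16 a,
+                                                     const uint8x16 b) {
+  const uint8x16 sum = a + b;                    // wraps around on overflow
+  const uint8x16 wrapped = (uint8x16)(sum < a);  // 0xff where a + b wrapped
+  return sum | wrapped;                          // clamp wrapped lanes to 255
+}
+
+static WEBP_INLINE uint8x16 uint8x16_sub_sat_generic(const uint8x16 a,
+                                                     const uint8x16 b) {
+  const uint8x16 diff = a - b;                   // wraps around on underflow
+  const uint8x16 no_wrap = (uint8x16)(b <= a);   // 0xff where a >= b
+  return diff & no_wrap;                         // clamp wrapped lanes to 0
+}
+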
+static WEBP_INLINE uint8x16 _max_u8x16(const uint8x16 a, const uint8x16 b) {
+  const uint8x16 s1 = (a > b);
+  return (s1 & a) | (~s1 & b);
+}
+
+// Compute abs(p - q) = subs(p - q) OR subs(q - p)
+static WEBP_INLINE int8x16 abs_diff(int8x16 p, int8x16 q) {
+  const int8x16 a = uint8x16_sub_sat(p, q);
+  const int8x16 b = uint8x16_sub_sat(q, p);
+  return a | b;
+}
+
+// int16 to int8 with saturation.
+static WEBP_INLINE int8x16 _pack_sw_2_sb(const int16x8 lo, const int16x8 hi) {
+#if 1
+  const int16x8 k7f = splat_int16(0x007f);
+  const int16x8 kff80 = splat_int16(0xff80);
+  const int16x8 s1_lo = (lo < k7f);
+  const int16x8 a_lo = (s1_lo & lo) | (~s1_lo & k7f);
+  const int16x8 s2_lo = (a_lo > kff80);
+  const int16x8 a2_lo = (s2_lo & a_lo) | (~s2_lo & kff80);
+  const int16x8 s1_hi = (hi < k7f);
+  const int16x8 a_hi = (s1_hi & hi) | (~s1_hi & k7f);
+  const int16x8 s2_hi = (a_hi > kff80);
+  const int16x8 a2_hi = (s2_hi & a_hi) | (~s2_hi & kff80);
+  return (int8x16)__builtin_shufflevector((int8x16)a2_lo, (int8x16)a2_hi, 0, 2,
+                                          4, 6, 8, 10, 12, 14, 16, 18, 20, 22,
+                                          24, 26, 28, 30);
+#else
+  return (int8x16)__builtin_ia32_packsswb128(lo, hi);
+#endif
+}
+
+// Shift each byte of "x" by 3 bits while preserving the sign bit.
+static WEBP_INLINE void SignedShift8b(int8x16* const x) {
+  const int8x16 zero = {0};
+  const int16x8 eleven = splat_int16(3 + 8);
+  const int16x8 lo_0 = (int16x8)__builtin_shufflevector(
+      *x, zero, 16, 0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7);
+  const int16x8 hi_0 = (int16x8)__builtin_shufflevector(
+      *x, zero, 16, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15);
+  const int16x8 lo_1 = lo_0 >> eleven;
+  const int16x8 hi_1 = hi_0 >> eleven;
+  *x = _pack_sw_2_sb(lo_1, hi_1);
+}
+
+#define FLIP_SIGN_BIT2(a, b) \
+  {                          \
+    a = a ^ sign_bit;        \
+    b = b ^ sign_bit;        \
+  }
+
+#define FLIP_SIGN_BIT4(a, b, c, d) \
+  {                                \
+    FLIP_SIGN_BIT2(a, b);          \
+    FLIP_SIGN_BIT2(c, d);          \
+  }
+
+// input/output is uint8_t
+static WEBP_INLINE void GetNotHEV(const int8x16* const p1,
+                                  const int8x16* const p0,
+                                  const int8x16* const q0,
+                                  const int8x16* const q1, int hev_thresh,
+                                  int8x16* const not_hev) {
+  const int8x16 zero = {0};
+  const int8x16 t_1 = abs_diff(*p1, *p0);
+  const int8x16 t_2 = abs_diff(*q1, *q0);
+  const int8x16 h = splat_uint8(hev_thresh);
+  const int8x16 t_max = _max_u8x16(t_1, t_2);
+  const int8x16 t_max_h = uint8x16_sub_sat(t_max, h);
+  *not_hev = (t_max_h == zero);  // not_hev <= t1 && not_hev <= t2
+}
+
+// input pixels are int8_t
+static WEBP_INLINE void GetBaseDelta(const int8x16* const p1,
+                                     const int8x16* const p0,
+                                     const int8x16* const q0,
+                                     const int8x16* const q1,
+                                     int8x16* const delta) {
+  // beware of addition order, for saturation!
+  const int8x16 p1_q1 = int8x16_sub_sat(*p1, *q1);    // p1 - q1
+  const int8x16 q0_p0 = int8x16_sub_sat(*q0, *p0);    // q0 - p0
+  const int8x16 s1 = int8x16_add_sat(p1_q1, q0_p0);   // p1 - q1 + 1 * (q0 - p0)
+  const int8x16 s2 = int8x16_add_sat(q0_p0, s1);      // p1 - q1 + 2 * (q0 - p0)
+  const int8x16 s3 = int8x16_add_sat(q0_p0, s2);      // p1 - q1 + 3 * (q0 - p0)
+  *delta = s3;
+}
+
+// input and output are int8_t
+static WEBP_INLINE void DoSimpleFilter(int8x16* const p0, int8x16* const q0,
+                                       const int8x16* const fl) {
+  const int8x16 k3 = splat_uint8(3);
+  const int8x16 k4 = splat_uint8(4);
+  int8x16 v3 = int8x16_add_sat(*fl, k3);
+  int8x16 v4 = int8x16_add_sat(*fl, k4);
+  SignedShift8b(&v4);              // v4 >> 3
+  SignedShift8b(&v3);              // v3 >> 3
+  *q0 = int8x16_sub_sat(*q0, v4);  // q0 -= v4
+  *p0 = int8x16_add_sat(*p0, v3);  // p0 += v3
+}
+
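+// For reference, GetBaseDelta() and DoSimpleFilter() above compute, in scalar
+// terms and with every intermediate step saturated to the int8_t range:
+//   delta = (p1 - q1) + 3 * (q0 - p0)
+//   p0 += (delta + 3) >> 3;
+//   q0 -= (delta + 4) >> 3;
+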
+// Updates values of 2 pixels at MB edge during complex filtering.
+// Update operations:
+// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
+// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
+static WEBP_INLINE void Update2Pixels(int8x16* const pi, int8x16* const qi,
+                                      const int16x8* const a0_lo,
+                                      const int16x8* const a0_hi) {
+  const int16x8 _7 = splat_int16(7);
+  const int16x8 a1_lo = *a0_lo >> _7;
+  const int16x8 a1_hi = *a0_hi >> _7;
+  const int8x16 delta = _pack_sw_2_sb(a1_lo, a1_hi);
+  const int8x16 sign_bit = (int8x16)splat_uint8(0x80);
+  *pi = int8x16_add_sat(*pi, delta);
+  *qi = int8x16_sub_sat(*qi, delta);
+  FLIP_SIGN_BIT2(*pi, *qi);
+}
+
+// input pixels are uint8_t
+static WEBP_INLINE void NeedsFilter(const int8x16* const p1,
+                                    const int8x16* const p0,
+                                    const int8x16* const q0,
+                                    const int8x16* const q1, int thresh,
+                                    int8x16* const mask) {
+  const int8x16 zero = {0};
+  const int16x8 one = {1, 1, 1, 1, 1, 1, 1, 1};
+  const int8x16 m_thresh = splat_uint8(thresh);
+  const int8x16 t1 = abs_diff(*p1, *q1);        // abs(p1 - q1)
+  const uint8x16 kFE = splat_uint8(0xFE);
+  const uint16x8 t2 = t1 & kFE;                 // set lsb of each byte to zero
+  const uint16x8 t3 = t2 >> one;                // abs(p1 - q1) / 2
+  const int8x16 t4 = abs_diff(*p0, *q0);        // abs(p0 - q0)
+  const int8x16 t5 = uint8x16_add_sat(t4, t4);  // abs(p0 - q0) * 2
+  const int8x16 t6 = uint8x16_add_sat(t5, t3);  // abs(p0-q0)*2 + abs(p1-q1)/2
+  const int8x16 t7 = uint8x16_sub_sat(t6, m_thresh);  // mask <= m_thresh
+  *mask = (t7 == zero);
+}
+
+//------------------------------------------------------------------------------
+// Edge filtering functions
+
+// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
+static WEBP_INLINE void DoFilter6(int8x16* const p2, int8x16* const p1,
+                                  int8x16* const p0, int8x16* const q0,
+                                  int8x16* const q1, int8x16* const q2,
+                                  const int8x16* const mask, int hev_thresh) {
+  const int8x16 zero = {0};
+  const int8x16 sign_bit = splat_uint8(0x80);
+  int8x16 a, not_hev;
+
+  // compute hev mask
+  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+  FLIP_SIGN_BIT2(*p2, *q2);
+  GetBaseDelta(p1, p0, q0, q1, &a);
+
+  {  // do simple filter on pixels with hev
+    const int8x16 m = (~not_hev) & *mask;
+    const int8x16 f = a & m;
+    DoSimpleFilter(p0, q0, &f);
+  }
+
+  {  // do strong filter on pixels with not hev
+    const int32x4 k9 = {0x0900, 0x0900, 0x0900, 0x0900};
+    const int16x8 k63 = splat_int16(63);
+
+    const int16x8 m = not_hev & *mask;
+    const int16x8 f = a & m;
+    const int16x8 f_lo =
+        (int16x8)__builtin_shufflevector((int8x16)f, zero, 16, 0, 16, 1, 16, 2,
+                                         16, 3, 16, 4, 16, 5, 16, 6, 16, 7);
+    const int16x8 f_hi = (int16x8)__builtin_shufflevector(
+        (int8x16)f, zero, 16, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14,
+        16, 15);
+
+    const int16x8 f9_lo = _mulhi_int16x8(f_lo, k9);  // Filter (lo) * 9
+    const int16x8 f9_hi = _mulhi_int16x8(f_hi, k9);  // Filter (hi) * 9
+
+    const int16x8 a2_lo = f9_lo + k63;  // Filter * 9 + 63
+    const int16x8 a2_hi = f9_hi + k63;  // Filter * 9 + 63
+
+    const int16x8 a1_lo = a2_lo + f9_lo;  // Filter * 18 + 63
+    const int16x8 a1_hi = a2_hi + f9_hi;  // Filter * 18 + 63
+
+    const int16x8 a0_lo = a1_lo + f9_lo;  // Filter * 27 + 63
+    const int16x8 a0_hi = a1_hi + f9_hi;  // Filter * 27 + 63
+
+    Update2Pixels(p2, q2, &a2_lo, &a2_hi);
+    Update2Pixels(p1, q1, &a1_lo, &a1_hi);
+    Update2Pixels(p0, q0, &a0_lo, &a0_hi);
+  }
+}
+
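+// In scalar terms, the strong-filter branch above updates the six pixels as
+//   a  = (p1 - q1) + 3 * (q0 - p0)    (from GetBaseDelta, masked to not-hev)
+//   p2 += ( 9 * a + 63) >> 7;   q2 -= ( 9 * a + 63) >> 7;
+//   p1 += (18 * a + 63) >> 7;   q1 -= (18 * a + 63) >> 7;
+//   p0 += (27 * a + 63) >> 7;   q0 -= (27 * a + 63) >> 7;
+// The "Filter * 9 + 63" .. "Filter * 27 + 63" vectors hold these sums before
+// Update2Pixels() applies the final ">> 7".
+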
+//------------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+#define MAX_DIFF1(p3, p2, p1, p0, m)       \
+  do {                                     \
+    m = abs_diff(p1, p0);                  \
+    m = _max_u8x16(m, abs_diff(p3, p2));   \
+    m = _max_u8x16(m, abs_diff(p2, p1));   \
+  } while (0)
+
+#define MAX_DIFF2(p3, p2, p1, p0, m)       \
+  do {                                     \
+    m = _max_u8x16(m, abs_diff(p1, p0));   \
+    m = _max_u8x16(m, abs_diff(p3, p2));   \
+    m = _max_u8x16(m, abs_diff(p2, p1));   \
+  } while (0)
+
+#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) \
+  {                                              \
+    memcpy(&e1, &(p)[0 * stride], 16);           \
+    memcpy(&e2, &(p)[1 * stride], 16);           \
+    memcpy(&e3, &(p)[2 * stride], 16);           \
+    memcpy(&e4, &(p)[3 * stride], 16);           \
+  }
+
+static WEBP_INLINE void ComplexMask(const int8x16* const p1,
+                                    const int8x16* const p0,
+                                    const int8x16* const q0,
+                                    const int8x16* const q1, int thresh,
+                                    int ithresh, int8x16* const mask) {
+  const int8x16 zero = {0};
+  const uint8x16 it = splat_uint8(ithresh);
+  const int8x16 diff = uint8x16_sub_sat(*mask, it);
+  const int8x16 thresh_mask = (diff == zero);
+  int8x16 filter_mask;
+  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
+  *mask = thresh_mask & filter_mask;
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride, int thresh, int ithresh,
+                      int hev_thresh) {
+  int8x16 t1;
+  int8x16 mask;
+  int8x16 p2, p1, p0, q0, q1, q2;
+
+  // Load p3, p2, p1, p0
+  LOAD_H_EDGES4(p - 4 * stride, stride, t1, p2, p1, p0);
+  MAX_DIFF1(t1, p2, p1, p0, mask);
+
+  // Load q0, q1, q2, q3
+  LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
+  MAX_DIFF2(t1, q2, q1, q0, mask);
+
+  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  // Store
+  memcpy(&p[-3 * stride], &p2, 16);
+  memcpy(&p[-2 * stride], &p1, 16);
+  memcpy(&p[-1 * stride], &p0, 16);
+  memcpy(&p[+0 * stride], &q0, 16);
+  memcpy(&p[+1 * stride], &q1, 16);
+  memcpy(&p[+2 * stride], &q2, 16);
+}
+
 //------------------------------------------------------------------------------
 // 4x4 predictions
 
@@ -700,6 +993,8 @@ extern void VP8DspInitWASM(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitWASM(void) {
   VP8Transform = Transform;
+  VP8VFilter16 = VFilter16;
+
   VP8PredLuma4[1] = TM4;
   VP8PredLuma4[2] = VE4;
   VP8PredLuma4[4] = RD4;
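Usage note: the second hunk above installs the new function in the dsp dispatch table. A minimal caller sketch follows; it assumes the "./dsp.h" include path used inside src/dsp/, and the wrapper name, row pointer and threshold values are illustrative only, not decoder code:

    #include <stdint.h>
    #include "./dsp.h"             // declares the VP8VFilter16 function pointer

    extern void VP8DspInitWASM(void);  // as declared in the hunk above

    // Filters a horizontal luma macroblock edge. 'q0_row' points at the first
    // row below the edge: VFilter16 reads four rows above and four rows below
    // it, 16 pixels wide, and rewrites the six rows from -3 to +2.
    static void FilterLumaEdge(uint8_t* const q0_row, int stride) {
      VP8DspInitWASM();  // installs VFilter16 into VP8VFilter16
      VP8VFilter16(q0_row, stride, 12 /* thresh */, 8 /* ithresh */,
                   2 /* hev_thresh */);
    }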