From 26029568b71c7334086c5fb5007255ffb0f2c7f3 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 29 Mar 2014 19:40:35 -0700 Subject: [PATCH] dec_neon: add strong loopfilter intrinsics vertical only currently, 2.5-3% faster placed under USE_INTRINSICS as this change depends on the simple loopfilter improves the simple loopfilter slightly thanks to some reorganization Change-Id: I6611441fa54228549b21ea74c013cb78d53c7155 --- src/dsp/dec_neon.c | 166 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 16 deletions(-) diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c index 92fe1a5d..d7bff65e 100644 --- a/src/dsp/dec_neon.c +++ b/src/dsp/dec_neon.c @@ -169,22 +169,13 @@ static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0, return s3; } -static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - uint8x16_t* const op0, uint8x16_t* const oq0, - int thresh) { - const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); - const int8x16_t p1s = FlipSign(p1); - const int8x16_t p0s = FlipSign(p0); - const int8x16_t q0s = FlipSign(q0); - const int8x16_t q1s = FlipSign(q1); - const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); - const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); - // DoSimpleFilter: +static void DoSimpleFilter(const int8x16_t p0s, const int8x16_t q0s, + const int8x16_t delta, + uint8x16_t* const op0, uint8x16_t* const oq0) { const int8x16_t kCst3 = vdupq_n_s8(0x03); const int8x16_t kCst4 = vdupq_n_s8(0x04); - const int8x16_t delta_p3 = vqaddq_s8(delta1, kCst3); - const int8x16_t delta_p4 = vqaddq_s8(delta1, kCst4); + const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); + const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4); const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); const int8x16_t sp0 = vqaddq_s8(p0s, delta3); @@ -192,6 +183,19 @@ static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, *op0 = FlipSignBack(sp0); *oq0 = FlipSignBack(sq0); } + +static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + const uint8x16_t mask, + uint8x16_t* const op0, uint8x16_t* const oq0) { + const int8x16_t p1s = FlipSign(p1); + const int8x16_t p0s = FlipSign(p0); + const int8x16_t q0s = FlipSign(q0); + const int8x16_t q1s = FlipSign(q1); + const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); + const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); + DoSimpleFilter(p0s, q0s, delta1, op0, oq0); +} #endif // USE_INTRINSICS // Load/Store vertical edge @@ -266,7 +270,10 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) { static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) { uint8x16_t p1, p0, q0, q1, op0, oq0; Load16x4(p, stride, &p1, &p0, &q0, &q1); - DoFilter2(p1, p0, q0, q1, &op0, &oq0, thresh); + { + const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); + DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); + } Store16x2(op0, oq0, p, stride); } @@ -280,7 +287,10 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) { static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) { uint8x16_t p1, p0, q0, q1, oq0, op0; Load4x16(p, stride, &p1, &p0, &q0, &q1); - DoFilter2(p1, p0, q0, q1, &op0, &oq0, thresh); + { + const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); + DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); + } Store2x16(op0, oq0, p, stride); } @@ -330,6 +340,127 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { } } +//------------------------------------------------------------------------------ +// Complex In-loop filtering (Paragraph 15.3) + +#if defined(USE_INTRINSICS) +static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + int hev_thresh) { + const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); + const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) + const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) + const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v); + const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v); + const uint8x16_t mask = vorrq_u8(mask1, mask2); + return mask; +} + +static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, + const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + const uint8x16_t q2, const uint8x16_t q3, + int ithresh) { + const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); + const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) + const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1) + const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) + const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2) + const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1) + const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) + const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1); + const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2); + const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0); + const uint8x16_t max12 = vmaxq_u8(max1, max2); + const uint8x16_t max = vmaxq_u8(max12, max3); + const uint8x16_t mask = vcgeq_u8(ithresh_v, max); + return mask; +} + +static void DoFilter6( + const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, + const uint8x16_t mask, const uint8x16_t hev_mask, + uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, + uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { + const int8x16_t p2s = FlipSign(p2); + const int8x16_t p1s = FlipSign(p1); + int8x16_t p0s = FlipSign(p0); + int8x16_t q0s = FlipSign(q0); + const int8x16_t q1s = FlipSign(q1); + const int8x16_t q2s = FlipSign(q2); + const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); + const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); + const int8x16_t simple_lf_delta = + vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask)); + uint8x16_t tmp_p0, tmp_q0; + + // Use the simple loopfilter on pixels with hev. + DoSimpleFilter(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0); + p0s = FlipSign(tmp_p0); + q0s = FlipSign(tmp_q0); + + // Use the complex loopfilter on pixels without hev. + { + const uint8x16_t not_hev = vmvnq_u8(hev_mask); + const uint8x16_t complex_lf_mask = vandq_u8(mask, not_hev); + const int8x16_t complex_lf_delta = + vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); + const int16x8_t kCst63 = vdupq_n_s16(63); + const int8x8_t kCst27 = vdup_n_s8(27); + const int8x8_t kCst18 = vdup_n_s8(18); + const int8x8_t kCst9 = vdup_n_s8(9); + const int8x8_t delta_lo = vget_low_s8(complex_lf_delta); + const int8x8_t delta_hi = vget_high_s8(complex_lf_delta); + const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo); // 63 + 27 * a + const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi); // 63 + 27 * a + const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo); // 63 + 18 * a + const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi); // 63 + 18 * a + const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo); // 63 + 9 * a + const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi); // 63 + 9 * a + const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7); + const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7); + const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7); + const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7); + const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7); + const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7); + const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi); + const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); + const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); + + *op0 = FlipSignBack(vqaddq_s8(p0s, a1)); // clip(p0 + a1) + *oq0 = FlipSignBack(vqsubq_s8(q0s, a1)); // clip(q0 - q1) + *oq1 = FlipSignBack(vqsubq_s8(q1s, a2)); // clip(q1 - a2) + *op1 = FlipSignBack(vqaddq_s8(p1s, a2)); // clip(p1 + a2) + *oq2 = FlipSignBack(vqsubq_s8(q2s, a3)); // clip(q2 - a3) + *op2 = FlipSignBack(vqaddq_s8(p2s, a3)); // clip(p2 + a3) + } +} + +// on macroblock edges +static void VFilter16(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { + uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; + Load16x4(p - 2 * stride, stride, &p3, &p2, &p1, &p0); + Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3); + { + const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh); + const uint8x16_t mask2 = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh); + const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = vandq_u8(mask1, mask2); + uint8x16_t op2, op1, op0, oq0, oq1, oq2; + + DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, + &op2, &op1, &op0, &oq0, &oq1, &oq2); + Store16x2(op2, op1, p - 2 * stride, stride); + Store16x2(op0, oq0, p + 0 * stride, stride); + Store16x2(oq1, oq2, p + 2 * stride, stride); + } +} + +#endif // USE_INTRINSICS + //----------------------------------------------------------------------------- // Inverse transforms (Paragraph 14.4) @@ -634,6 +765,9 @@ void VP8DspInitNEON(void) { VP8TransformDC = TransformDC; VP8TransformWHT = TransformWHT; +#if defined(USE_INTRINSICS) + VP8VFilter16 = VFilter16; +#endif VP8SimpleVFilter16 = SimpleVFilter16NEON; VP8SimpleHFilter16 = SimpleHFilter16NEON; VP8SimpleVFilter16i = SimpleVFilter16iNEON;