mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-28 14:38:21 +01:00
dec_neon: add strong loopfilter intrinsics
vertical only currently, 2.5-3% faster placed under USE_INTRINSICS as this change depends on the simple loopfilter improves the simple loopfilter slightly thanks to some reorganization Change-Id: I6611441fa54228549b21ea74c013cb78d53c7155
This commit is contained in:
parent
cca7d7ef0f
commit
26029568b7
@ -169,22 +169,13 @@ static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
|
||||
return s3;
|
||||
}
|
||||
|
||||
static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
|
||||
const uint8x16_t q0, const uint8x16_t q1,
|
||||
uint8x16_t* const op0, uint8x16_t* const oq0,
|
||||
int thresh) {
|
||||
const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
|
||||
const int8x16_t p1s = FlipSign(p1);
|
||||
const int8x16_t p0s = FlipSign(p0);
|
||||
const int8x16_t q0s = FlipSign(q0);
|
||||
const int8x16_t q1s = FlipSign(q1);
|
||||
const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
|
||||
const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
|
||||
// DoSimpleFilter:
|
||||
static void DoSimpleFilter(const int8x16_t p0s, const int8x16_t q0s,
|
||||
const int8x16_t delta,
|
||||
uint8x16_t* const op0, uint8x16_t* const oq0) {
|
||||
const int8x16_t kCst3 = vdupq_n_s8(0x03);
|
||||
const int8x16_t kCst4 = vdupq_n_s8(0x04);
|
||||
const int8x16_t delta_p3 = vqaddq_s8(delta1, kCst3);
|
||||
const int8x16_t delta_p4 = vqaddq_s8(delta1, kCst4);
|
||||
const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
|
||||
const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
|
||||
const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
|
||||
const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
|
||||
const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
|
||||
@ -192,6 +183,19 @@ static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
|
||||
*op0 = FlipSignBack(sp0);
|
||||
*oq0 = FlipSignBack(sq0);
|
||||
}
|
||||
|
||||
static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
|
||||
const uint8x16_t q0, const uint8x16_t q1,
|
||||
const uint8x16_t mask,
|
||||
uint8x16_t* const op0, uint8x16_t* const oq0) {
|
||||
const int8x16_t p1s = FlipSign(p1);
|
||||
const int8x16_t p0s = FlipSign(p0);
|
||||
const int8x16_t q0s = FlipSign(q0);
|
||||
const int8x16_t q1s = FlipSign(q1);
|
||||
const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
|
||||
const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
|
||||
DoSimpleFilter(p0s, q0s, delta1, op0, oq0);
|
||||
}
|
||||
#endif // USE_INTRINSICS
|
||||
|
||||
// Load/Store vertical edge
|
||||
@ -266,7 +270,10 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
|
||||
static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
|
||||
uint8x16_t p1, p0, q0, q1, op0, oq0;
|
||||
Load16x4(p, stride, &p1, &p0, &q0, &q1);
|
||||
DoFilter2(p1, p0, q0, q1, &op0, &oq0, thresh);
|
||||
{
|
||||
const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
|
||||
DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
|
||||
}
|
||||
Store16x2(op0, oq0, p, stride);
|
||||
}
|
||||
|
||||
@ -280,7 +287,10 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
|
||||
static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
|
||||
uint8x16_t p1, p0, q0, q1, oq0, op0;
|
||||
Load4x16(p, stride, &p1, &p0, &q0, &q1);
|
||||
DoFilter2(p1, p0, q0, q1, &op0, &oq0, thresh);
|
||||
{
|
||||
const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
|
||||
DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
|
||||
}
|
||||
Store2x16(op0, oq0, p, stride);
|
||||
}
|
||||
|
||||
@ -330,6 +340,127 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Complex In-loop filtering (Paragraph 15.3)
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
|
||||
const uint8x16_t q0, const uint8x16_t q1,
|
||||
int hev_thresh) {
|
||||
const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
|
||||
const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
|
||||
const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
|
||||
const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v);
|
||||
const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v);
|
||||
const uint8x16_t mask = vorrq_u8(mask1, mask2);
|
||||
return mask;
|
||||
}
|
||||
|
||||
static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
|
||||
const uint8x16_t p1, const uint8x16_t p0,
|
||||
const uint8x16_t q0, const uint8x16_t q1,
|
||||
const uint8x16_t q2, const uint8x16_t q3,
|
||||
int ithresh) {
|
||||
const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
|
||||
const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
|
||||
const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)
|
||||
const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
|
||||
const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)
|
||||
const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)
|
||||
const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
|
||||
const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
|
||||
const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
|
||||
const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
|
||||
const uint8x16_t max12 = vmaxq_u8(max1, max2);
|
||||
const uint8x16_t max = vmaxq_u8(max12, max3);
|
||||
const uint8x16_t mask = vcgeq_u8(ithresh_v, max);
|
||||
return mask;
|
||||
}
|
||||
|
||||
static void DoFilter6(
|
||||
const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
|
||||
const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
|
||||
const uint8x16_t mask, const uint8x16_t hev_mask,
|
||||
uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
|
||||
uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
|
||||
const int8x16_t p2s = FlipSign(p2);
|
||||
const int8x16_t p1s = FlipSign(p1);
|
||||
int8x16_t p0s = FlipSign(p0);
|
||||
int8x16_t q0s = FlipSign(q0);
|
||||
const int8x16_t q1s = FlipSign(q1);
|
||||
const int8x16_t q2s = FlipSign(q2);
|
||||
const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
|
||||
const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
|
||||
const int8x16_t simple_lf_delta =
|
||||
vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
|
||||
uint8x16_t tmp_p0, tmp_q0;
|
||||
|
||||
// Use the simple loopfilter on pixels with hev.
|
||||
DoSimpleFilter(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
|
||||
p0s = FlipSign(tmp_p0);
|
||||
q0s = FlipSign(tmp_q0);
|
||||
|
||||
// Use the complex loopfilter on pixels without hev.
|
||||
{
|
||||
const uint8x16_t not_hev = vmvnq_u8(hev_mask);
|
||||
const uint8x16_t complex_lf_mask = vandq_u8(mask, not_hev);
|
||||
const int8x16_t complex_lf_delta =
|
||||
vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
|
||||
const int16x8_t kCst63 = vdupq_n_s16(63);
|
||||
const int8x8_t kCst27 = vdup_n_s8(27);
|
||||
const int8x8_t kCst18 = vdup_n_s8(18);
|
||||
const int8x8_t kCst9 = vdup_n_s8(9);
|
||||
const int8x8_t delta_lo = vget_low_s8(complex_lf_delta);
|
||||
const int8x8_t delta_hi = vget_high_s8(complex_lf_delta);
|
||||
const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo); // 63 + 27 * a
|
||||
const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi); // 63 + 27 * a
|
||||
const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo); // 63 + 18 * a
|
||||
const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi); // 63 + 18 * a
|
||||
const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo); // 63 + 9 * a
|
||||
const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi); // 63 + 9 * a
|
||||
const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7);
|
||||
const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7);
|
||||
const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7);
|
||||
const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7);
|
||||
const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7);
|
||||
const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7);
|
||||
const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
|
||||
const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
|
||||
const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
|
||||
|
||||
*op0 = FlipSignBack(vqaddq_s8(p0s, a1)); // clip(p0 + a1)
|
||||
*oq0 = FlipSignBack(vqsubq_s8(q0s, a1)); // clip(q0 - q1)
|
||||
*oq1 = FlipSignBack(vqsubq_s8(q1s, a2)); // clip(q1 - a2)
|
||||
*op1 = FlipSignBack(vqaddq_s8(p1s, a2)); // clip(p1 + a2)
|
||||
*oq2 = FlipSignBack(vqsubq_s8(q2s, a3)); // clip(q2 - a3)
|
||||
*op2 = FlipSignBack(vqaddq_s8(p2s, a3)); // clip(p2 + a3)
|
||||
}
|
||||
}
|
||||
|
||||
// on macroblock edges
|
||||
static void VFilter16(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
|
||||
Load16x4(p - 2 * stride, stride, &p3, &p2, &p1, &p0);
|
||||
Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
|
||||
{
|
||||
const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
|
||||
const uint8x16_t mask2 = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
|
||||
ithresh);
|
||||
const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
|
||||
const uint8x16_t mask = vandq_u8(mask1, mask2);
|
||||
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
|
||||
|
||||
DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
|
||||
&op2, &op1, &op0, &oq0, &oq1, &oq2);
|
||||
Store16x2(op2, op1, p - 2 * stride, stride);
|
||||
Store16x2(op0, oq0, p + 0 * stride, stride);
|
||||
Store16x2(oq1, oq2, p + 2 * stride, stride);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // USE_INTRINSICS
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Inverse transforms (Paragraph 14.4)
|
||||
|
||||
@ -634,6 +765,9 @@ void VP8DspInitNEON(void) {
|
||||
VP8TransformDC = TransformDC;
|
||||
VP8TransformWHT = TransformWHT;
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
VP8VFilter16 = VFilter16;
|
||||
#endif
|
||||
VP8SimpleVFilter16 = SimpleVFilter16NEON;
|
||||
VP8SimpleHFilter16 = SimpleHFilter16NEON;
|
||||
VP8SimpleVFilter16i = SimpleVFilter16iNEON;
|
||||
|
Loading…
Reference in New Issue
Block a user