From 1b92fe75a14b6a11a04695eb52a651f5d958450a Mon Sep 17 00:00:00 2001
From: James Zern
Date: Thu, 5 Dec 2019 21:00:45 -0800
Subject: [PATCH] DC16_NEON,aarch64: use vaddlv

saves 3 instructions, neutral to mildly faster on a pixel 3a

Change-Id: I6ae57e8e38d4149167ea14e27cd2b32113b4f8e7
---
 src/dsp/dec_neon.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
index 0b99f00a..37e2e69d 100644
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -1511,11 +1511,16 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
 
   if (do_top) {
     const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
+#if defined(__aarch64__)
+    const uint16_t p3 = vaddlvq_u8(A);
+    sum_top = vdupq_n_u16(p3);
+#else
     const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
     const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
     const uint16x4_t p2 = vpadd_u16(p1, p1);
     const uint16x4_t p3 = vpadd_u16(p2, p2);
     sum_top = vcombine_u16(p3, p3);
+#endif
   }
 
   if (do_left) {