DC16_NEON,aarch64: use vaddlv

Saves 3 instructions; neutral to mildly faster on a Pixel 3a.

Change-Id: I6ae57e8e38d4149167ea14e27cd2b32113b4f8e7
2025-08-31 08:12:09 +02:00 · 2019-12-05 21:00:45 -08:00
parent 53f3d8cf7e
commit 1b92fe75a1
1 changed files with 5 additions and 0 deletions
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -1511,11 +1511,16 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
+#if defined(__aarch64__)
+    const uint16_t p3 = vaddlvq_u8(A);
+    sum_top = vdupq_n_u16(p3);
+#else
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
+#endif
  }

  if (do_left) {