From e68765af4256287b55334c362fcf0f177f4bbb7c Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 9 Sep 2022 22:17:46 -0700 Subject: [PATCH] dsp,neon: use vaddv in a few more places SumToInt_NEON horizontal_add_uint32x4 Change-Id: I881831a7b2bab35a1810b0d83fee761470f3e09f --- src/dsp/enc_neon.c | 9 +++++++-- src/dsp/quant.h | 13 +++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 601962ba..3a04111c 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -764,9 +764,14 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a, // Horizontal sum of all four uint32_t values in 'sum'. static int SumToInt_NEON(uint32x4_t sum) { +#if defined(__aarch64__) + return (int)vaddvq_u32(sum); +#else const uint64x2_t sum2 = vpaddlq_u32(sum); - const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); - return (int)sum3; + const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)), + vreinterpret_u32_u64(vget_high_u64(sum2))); + return (int)vget_lane_u32(sum3, 0); +#endif } static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) { diff --git a/src/dsp/quant.h b/src/dsp/quant.h index 5e8dba8d..fc099bf9 100644 --- a/src/dsp/quant.h +++ b/src/dsp/quant.h @@ -21,10 +21,15 @@ #define IsFlat IsFlat_NEON -static uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { +static uint32_t horizontal_add_uint32x4(const uint32x4_t a) { +#if defined(__aarch64__) + return vaddvq_u32(a); +#else const uint64x2_t b = vpaddlq_u32(a); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif } static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, @@ -45,7 +50,7 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, levels += 16; } - return thresh >= (int32_t)vget_lane_u32(horizontal_add_uint32x4(sum), 0); + return thresh >= (int)horizontal_add_uint32x4(sum); } #else