dsp,neon: use vaddv in a few more places

SumToInt_NEON
horizontal_add_uint32x4

Change-Id: I881831a7b2bab35a1810b0d83fee761470f3e09f
This commit is contained in:
James Zern 2022-09-09 22:17:46 -07:00
parent e8f83de286
commit e68765af42
2 changed files with 16 additions and 6 deletions

View File

@ -764,9 +764,14 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
#if defined(__aarch64__)
return (int)vaddvq_u32(sum);
#else
const uint64x2_t sum2 = vpaddlq_u32(sum);
const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
return (int)sum3;
const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
vreinterpret_u32_u64(vget_high_u64(sum2)));
return (int)vget_lane_u32(sum3, 0);
#endif
}
static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {

View File

@ -21,10 +21,15 @@
#define IsFlat IsFlat_NEON
static uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
static uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
#if defined(__aarch64__)
return vaddvq_u32(a);
#else
const uint64x2_t b = vpaddlq_u32(a);
return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
vreinterpret_u32_u64(vget_high_u64(b)));
return vget_lane_u32(c, 0);
#endif
}
static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
@ -45,7 +50,7 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
levels += 16;
}
return thresh >= (int32_t)vget_lane_u32(horizontal_add_uint32x4(sum), 0);
return thresh >= (int)horizontal_add_uint32x4(sum);
}
#else