DC8_NEON,aarch64: use vaddv

results in one fewer instruction for both DC8uv_NEON and
DC8uvNoLeft_NEON

Change-Id: Ia4e6f4dbc070079cdc2496a698bd4b34198ea164
This commit is contained in:
James Zern 2019-12-05 21:00:10 -08:00
parent b0e09e346f
commit e2575e05cb

View File

@ -1428,10 +1428,16 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) { if (do_top) {
const uint8x8_t A = vld1_u8(dst - BPS); // top row const uint8x8_t A = vld1_u8(dst - BPS); // top row
#if defined(__aarch64__)
const uint16x8_t B = vmovl_u8(A);
const uint16_t p2 = vaddvq_u16(B);
sum_top = vdupq_n_u16(p2);
#else
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p1 = vpadd_u16(p0, p0);
const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p2 = vpadd_u16(p1, p1);
sum_top = vcombine_u16(p2, p2); sum_top = vcombine_u16(p2, p2);
#endif
} }
if (do_left) { if (do_left) {