From ca221bbc488575905cef363d826a5e03b75995a2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 28 Jul 2015 19:37:06 -0700 Subject: [PATCH 1/2] ll_enc_neon: enable VP8LSubtractGreenFromBlueAndRed this moves the function outside the WEBP_USE_INTRINSICS check. there's no alternative version and it's ~54% faster at the function level and mildly faster overall Change-Id: Ibc648e9ee35021d48901e05aa596aa01067796a2 --- src/dsp/lossless_enc_neon.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/dsp/lossless_enc_neon.c b/src/dsp/lossless_enc_neon.c index 8cd5a7ad..ee634b93 100644 --- a/src/dsp/lossless_enc_neon.c +++ b/src/dsp/lossless_enc_neon.c @@ -20,8 +20,6 @@ #include "./lossless.h" #include "./neon.h" -#ifdef WEBP_USE_INTRINSICS - //------------------------------------------------------------------------------ // Subtract-Green Transform @@ -72,17 +70,13 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { #undef USE_VTBLQ -#endif // WEBP_USE_INTRINSICS - //------------------------------------------------------------------------------ // Entry point extern void VP8LEncDspInitNEON(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) { -#ifdef WEBP_USE_INTRINSICS VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; -#endif } #else // !WEBP_USE_NEON From 2a010f992ade1d1cf3cf39a2bc5278c653205223 Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 28 Jul 2015 19:44:58 -0700 Subject: [PATCH 2/2] lossless_neon: remove predictors 5-13 operating on single uint32's isn't helped by NEON. this improves aarch64 performance by ~4% Change-Id: I9fb25a8962de7b80e893e756ee7c76393cfd40c7 --- src/dsp/lossless_neon.c | 131 ---------------------------------------- 1 file changed, 131 deletions(-) diff --git a/src/dsp/lossless_neon.c b/src/dsp/lossless_neon.c index 41388c01..94a96201 100644 --- a/src/dsp/lossless_neon.c +++ b/src/dsp/lossless_neon.c @@ -139,125 +139,6 @@ static void ConvertBGRAToRGB(const uint32_t* src, #endif // !WORK_AROUND_GCC -//------------------------------------------------------------------------------ - -#ifdef WEBP_USE_INTRINSICS - -static WEBP_INLINE uint32_t Average2(const uint32_t* const a, - const uint32_t* const b) { - const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a)); - const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b)); - const uint8x8_t avg = vhadd_u8(a0, b0); - return vget_lane_u32(vreinterpret_u32_u8(avg), 0); -} - -static WEBP_INLINE uint32_t Average3(const uint32_t* const a, - const uint32_t* const b, - const uint32_t* const c) { - const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a)); - const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b)); - const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c)); - const uint8x8_t avg1 = vhadd_u8(a0, c0); - const uint8x8_t avg2 = vhadd_u8(avg1, b0); - return vget_lane_u32(vreinterpret_u32_u8(avg2), 0); -} - -static WEBP_INLINE uint32_t Average4(const uint32_t* const a, - const uint32_t* const b, - const uint32_t* const c, - const uint32_t* const d) { - const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a)); - const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b)); - const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c)); - const uint8x8_t d0 = vreinterpret_u8_u64(vcreate_u64(*d)); - const uint8x8_t avg1 = vhadd_u8(a0, b0); - const uint8x8_t avg2 = vhadd_u8(c0, d0); - const uint8x8_t avg3 = vhadd_u8(avg1, avg2); - return vget_lane_u32(vreinterpret_u32_u8(avg3), 0); -} - -static uint32_t Predictor5(uint32_t left, const uint32_t* const top) { - return Average3(&left, top + 0, top + 1); -} - -static uint32_t Predictor6(uint32_t left, const uint32_t* const top) { - return Average2(&left, top - 1); -} - -static uint32_t Predictor7(uint32_t left, const uint32_t* const top) { - return Average2(&left, top + 0); -} - -static uint32_t Predictor8(uint32_t left, const uint32_t* const top) { - (void)left; - return Average2(top - 1, top + 0); -} - -static uint32_t Predictor9(uint32_t left, const uint32_t* const top) { - (void)left; - return Average2(top + 0, top + 1); -} - -static uint32_t Predictor10(uint32_t left, const uint32_t* const top) { - return Average4(&left, top - 1, top + 0, top + 1); -} - -//------------------------------------------------------------------------------ - -static WEBP_INLINE uint32_t Select(const uint32_t* const c0, - const uint32_t* const c1, - const uint32_t* const c2) { - const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); - const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); - const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); - const uint8x8_t bc = vabd_u8(p1, p2); // |b-c| - const uint8x8_t ac = vabd_u8(p0, p2); // |a-c| - const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc)); - const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac)); - const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac)); - const int32_t pa_minus_pb = vget_lane_s32(diff, 0); - return (pa_minus_pb <= 0) ? *c0 : *c1; -} - -static uint32_t Predictor11(uint32_t left, const uint32_t* const top) { - return Select(top + 0, &left, top - 1); -} - -static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0, - const uint32_t* const c1, - const uint32_t* const c2) { - const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); - const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); - const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); - const uint16x8_t sum0 = vaddl_u8(p0, p1); // add and widen - const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2)); // widen and subtract - const uint8x8_t out = vqmovn_u16(sum1); // narrow and clamp - return vget_lane_u32(vreinterpret_u32_u8(out), 0); -} - -static uint32_t Predictor12(uint32_t left, const uint32_t* const top) { - return ClampedAddSubtractFull(&left, top + 0, top - 1); -} - -static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0, - const uint32_t* const c1, - const uint32_t* const c2) { - const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); - const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); - const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); - const uint8x8_t avg = vhadd_u8(p0, p1); // Average(c0,c1) - const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1); // (a-b)>>1 saturated - const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1); // (b-a)>>1 saturated - const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba); - return vget_lane_u32(vreinterpret_u32_u8(out), 0); -} - -static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { - return ClampedAddSubtractHalf(&left, top + 0, top - 1); -} - -#endif // WEBP_USE_INTRINSICS - //------------------------------------------------------------------------------ // Subtract-Green Transform @@ -318,18 +199,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) { VP8LConvertBGRAToBGR = ConvertBGRAToBGR; VP8LConvertBGRAToRGB = ConvertBGRAToRGB; -#ifdef WEBP_USE_INTRINSICS - VP8LPredictors[5] = Predictor5; - VP8LPredictors[6] = Predictor6; - VP8LPredictors[7] = Predictor7; - VP8LPredictors[8] = Predictor8; - VP8LPredictors[9] = Predictor9; - VP8LPredictors[10] = Predictor10; - VP8LPredictors[11] = Predictor11; - VP8LPredictors[12] = Predictor12; - VP8LPredictors[13] = Predictor13; -#endif - VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; }