From 32de385eca6506286ffbe87a75006ac20c60f665 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 21 Oct 2014 18:06:08 +0200
Subject: [PATCH] dec_neon: add VE4 intra predictor

based on SSE2 version, ~59% faster

Change-Id: Iaa2181eb51bd975de0e9fe5c7b66ed18188f0e3b
---
 src/dsp/dec_neon.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
index 39be3d3a..3ea8e74c 100644
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -1258,6 +1258,25 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 }
 #undef MUL
 
+//------------------------------------------------------------------------------
+// 4x4
+
+static void VE4(uint8_t* dst) {    // vertical: 3-tap filtered top row, x4 rows
+  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);   // lanes shifted by 1 byte
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);  // lanes shifted by 2 bytes
+  const uint8x8_t ABCDEFG = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFG_ = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFG__ = vreinterpret_u8_u64(A2);
+  const uint8x8_t b = vhadd_u8(ABCDEFG, CDEFG__);  // (A + C) >> 1, truncating
+  const uint8x8_t avg = vrhadd_u8(b, BCDEFG_);  // == (A + 2 * B + C + 2) >> 2
+  int i;
+  for (i = 0; i < 4; ++i) {  // store the same 4 filtered bytes on each row
+    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
+  }
+}
+
 #endif   // WEBP_USE_NEON
 
 //------------------------------------------------------------------------------
@@ -1288,5 +1307,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
   VP8SimpleHFilter16 = SimpleHFilter16;
   VP8SimpleVFilter16i = SimpleVFilter16i;
   VP8SimpleHFilter16i = SimpleHFilter16i;
+
+  VP8PredLuma4[2] = VE4;
 #endif   // WEBP_USE_NEON
 }