From 0339fa26eb233d2a756625e507ee51dd62c0562f Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 31 Jan 2015 09:47:15 -0800 Subject: [PATCH] lossless_neon: enable subtract green for aarch64 similar to: 1ba61b0 enable NEON intrinsics in aarch64 builds vtbl1_u8 is available everywhere but Xcode-based iOS arm64 builds, use vtbl1q_u8 there. performance varies based on the input, 1-3% on encode was observed (cherry picked from commit 416e1cea9b7f7a626341005cced947add7da5c54) Change-Id: Ifec35b37eb856acfcf69ed7f16fa078cd40b7034 --- src/dsp/lossless_neon.c | 48 ++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/src/dsp/lossless_neon.c b/src/dsp/lossless_neon.c index 987767b5..9f8fa9da 100644 --- a/src/dsp/lossless_neon.c +++ b/src/dsp/lossless_neon.c @@ -259,20 +259,44 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { //------------------------------------------------------------------------------ // Subtract-Green Transform -// vtbl? are unavailable in iOS/arm64 builds. -#if !defined(__aarch64__) +// vtbl?_u8 are marked unavailable for iOS arm64, use wider versions there. +#if defined(__APPLE__) && defined(__aarch64__) && \ + defined(__apple_build_version__) +#define USE_VTBLQ +#endif -// 255 = byte will be zero'd +#ifdef USE_VTBLQ +// 255 = byte will be zeroed +static const uint8_t kGreenShuffle[16] = { + 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255 +}; + +static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, + const uint8x16_t shuffle) { + return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)), + vtbl1q_u8(argb, vget_high_u8(shuffle))); +} +#else // !USE_VTBLQ +// 255 = byte will be zeroed static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; +static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, + const uint8x8_t shuffle) { + return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), + vtbl1_u8(vget_high_u8(argb), shuffle)); +} +#endif // USE_VTBLQ + static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { const uint32_t* const end = argb_data + (num_pixels & ~3); +#ifdef USE_VTBLQ + const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); +#else const uint8x8_t shuffle = vld1_u8(kGreenShuffle); +#endif for (; argb_data < end; argb_data += 4) { const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); - const uint8x16_t greens = - vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), - vtbl1_u8(vget_high_u8(argb), shuffle)); + const uint8x16_t greens = DoGreenShuffle(argb, shuffle); vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens)); } // fallthrough and finish off with plain-C @@ -281,19 +305,21 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { const uint32_t* const end = argb_data + (num_pixels & ~3); +#ifdef USE_VTBLQ + const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); +#else const uint8x8_t shuffle = vld1_u8(kGreenShuffle); +#endif for (; argb_data < end; argb_data += 4) { const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); - const uint8x16_t greens = - vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), - vtbl1_u8(vget_high_u8(argb), shuffle)); + const uint8x16_t greens = DoGreenShuffle(argb, shuffle); vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); } // fallthrough and finish off with plain-C VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); } -#endif // !__aarch64__ +#undef USE_VTBLQ #endif // USE_INTRINSICS @@ -320,11 +346,9 @@ void VP8LDspInitNEON(void) { VP8LPredictors[12] = Predictor12; VP8LPredictors[13] = Predictor13; -#if !defined(__aarch64__) VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; #endif -#endif #endif // WEBP_USE_NEON }