diff --git a/src/dsp/lossless_neon.c b/src/dsp/lossless_neon.c index 927873e9..d182c5ad 100644 --- a/src/dsp/lossless_neon.c +++ b/src/dsp/lossless_neon.c @@ -13,7 +13,7 @@ #include "./dsp.h" -#if 0 // defined(WEBP_USE_NEON) +#if defined(WEBP_USE_NEON) #include <arm_neon.h> @@ -22,10 +22,17 @@ //------------------------------------------------------------------------------ // Colorspace conversion functions +#if defined(__GNUC__) +#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__) +#endif + +#if defined(LOCAL_GCC_VERSION) && (LOCAL_GCC_VERSION >= 0x408) +// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it with +// gcc 4.8 or later. static void ConvertBGRAToRGBA(const uint32_t* src, int num_pixels, uint8_t* dst) { - const uint32_t* const end = src + num_pixels - 16; - for (; src <= end; src += 16) { + const uint32_t* const end = src + (num_pixels & ~15); + for (; src < end; src += 16) { uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!) 
const uint8x16_t tmp = pixel.val[0]; @@ -34,36 +41,105 @@ static void ConvertBGRAToRGBA(const uint32_t* src, vst4q_u8(dst, pixel); dst += 64; } - num_pixels &= 15; - VP8LConvertBGRAToRGBA_C(src, num_pixels, dst); // left-overs + VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs } static void ConvertBGRAToBGR(const uint32_t* src, int num_pixels, uint8_t* dst) { - const uint32_t* const end = src + num_pixels - 16; - for (; src <= end; src += 16) { + const uint32_t* const end = src + (num_pixels & ~15); + for (; src < end; src += 16) { const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } }; vst3q_u8(dst, tmp); dst += 48; } - num_pixels &= 15; - VP8LConvertBGRAToBGR_C(src, num_pixels, dst); // left-overs + VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs } static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels, uint8_t* dst) { - const uint32_t* const end = src + num_pixels - 16; - for (; src <= end; src += 16) { + const uint32_t* const end = src + (num_pixels & ~15); + for (; src < end; src += 16) { const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } }; vst3q_u8(dst, tmp); dst += 48; } - num_pixels &= 15; - VP8LConvertBGRAToRGB_C(src, num_pixels, dst); // left-overs + VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst); // left-overs } +#else + +// gcc-4.6.0 fallback + +static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 }; + +static void ConvertBGRAToRGBA(const uint32_t* src, + int num_pixels, uint8_t* dst) { + const uint32_t* const end = src + (num_pixels & ~1); + const uint8x8_t shuffle = vld1_u8(kRGBAShuffle); + for (; src < end; src += 2) { + const uint8x8_t pixels = vld1_u8((uint8_t*)src); + vst1_u8(dst, vtbl1_u8(pixels, shuffle)); + dst += 8; + } + VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst); // left-overs +} + +static const uint8_t kBGRShuffle[3][8] = { + { 0, 1, 
2, 4, 5, 6, 8, 9 }, + { 10, 12, 13, 14, 16, 17, 18, 20 }, + { 21, 22, 24, 25, 26, 28, 29, 30 } +}; + +static void ConvertBGRAToBGR(const uint32_t* src, + int num_pixels, uint8_t* dst) { + const uint32_t* const end = src + (num_pixels & ~7); + const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]); + const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]); + const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]); + for (; src < end; src += 8) { + const uint8x8x4_t pixels = {{ + vld1_u8((const uint8_t*)(src + 0)), + vld1_u8((const uint8_t*)(src + 2)), + vld1_u8((const uint8_t*)(src + 4)), + vld1_u8((const uint8_t*)(src + 6)) }}; + vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0)); + vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); + vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); + dst += 8 * 3; + } + VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst); // left-overs +} + +static const uint8_t kRGBShuffle[3][8] = { + { 2, 1, 0, 6, 5, 4, 10, 9 }, + { 8, 14, 13, 12, 18, 17, 16, 22 }, + { 21, 20, 26, 25, 24, 30, 29, 28 } +}; + +static void ConvertBGRAToRGB(const uint32_t* src, + int num_pixels, uint8_t* dst) { + const uint32_t* const end = src + (num_pixels & ~7); + const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]); + const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]); + const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]); + for (; src < end; src += 8) { + const uint8x8x4_t pixels = {{ + vld1_u8((const uint8_t*)(src + 0)), + vld1_u8((const uint8_t*)(src + 2)), + vld1_u8((const uint8_t*)(src + 4)), + vld1_u8((const uint8_t*)(src + 6)) }}; + vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0)); + vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); + vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); + dst += 8 * 3; + } + VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs +} + +#endif // gcc-4.8 + #endif // WEBP_USE_NEON //------------------------------------------------------------------------------ @@ -71,9 +147,7 @@ static void ConvertBGRAToRGB(const uint32_t* src, extern void VP8LDspInitNEON(void); 
void VP8LDspInitNEON(void) { -// TODO(jzern): these are producing incorrect results with a gcc-4.6/NDK -// build. -#if 0 // defined(WEBP_USE_NEON) +#if defined(WEBP_USE_NEON) VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; VP8LConvertBGRAToBGR = ConvertBGRAToBGR; VP8LConvertBGRAToRGB = ConvertBGRAToRGB;