mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-27 22:28:22 +01:00
Fix the gcc-4.6.0 bug by implementing an alternative method.
The previous functions are a bit faster with gcc-4.8, so we keep them for now. Change-Id: I4081e5af66fbf606295d8a83875c1b889729b4dc
This commit is contained in:
parent
2b1b4d5ae9
commit
abe6f48709
@ -13,7 +13,7 @@
|
||||
|
||||
#include "./dsp.h"
|
||||
|
||||
#if 0 // defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_USE_NEON)
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
@ -22,10 +22,17 @@
|
||||
//------------------------------------------------------------------------------
|
||||
// Colorspace conversion functions
|
||||
|
||||
// Packs the GCC major and minor version numbers into a single integer
// (major shifted left by 8 bits, OR-ed with minor) so the version can be
// compared against a constant such as 0x408.
#if defined(__GNUC__)
|
||||
#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
|
||||
#endif
|
||||
|
||||
#if defined(LOCAL_GCC_VERSION) && (LOCAL_GCC_VERSION >= 0x408)
|
||||
// gcc 4.6.0 (NDK-r9) had some trouble with this code, so we only use it with
|
||||
// gcc 4.8 or later.
|
||||
static void ConvertBGRAToRGBA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + num_pixels - 16;
|
||||
for (; src <= end; src += 16) {
|
||||
const uint32_t* const end = src + (num_pixels & ~15);
|
||||
for (; src < end; src += 16) {
|
||||
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||
// swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
|
||||
const uint8x16_t tmp = pixel.val[0];
|
||||
@ -34,36 +41,105 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
|
||||
vst4q_u8(dst, pixel);
|
||||
dst += 64;
|
||||
}
|
||||
num_pixels &= 15;
|
||||
VP8LConvertBGRAToRGBA_C(src, num_pixels, dst); // left-overs
|
||||
VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
|
||||
}
|
||||
|
||||
static void ConvertBGRAToBGR(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + num_pixels - 16;
|
||||
for (; src <= end; src += 16) {
|
||||
const uint32_t* const end = src + (num_pixels & ~15);
|
||||
for (; src < end; src += 16) {
|
||||
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||
const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
|
||||
vst3q_u8(dst, tmp);
|
||||
dst += 48;
|
||||
}
|
||||
num_pixels &= 15;
|
||||
VP8LConvertBGRAToBGR_C(src, num_pixels, dst); // left-overs
|
||||
VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGB(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + num_pixels - 16;
|
||||
for (; src <= end; src += 16) {
|
||||
const uint32_t* const end = src + (num_pixels & ~15);
|
||||
for (; src < end; src += 16) {
|
||||
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||
const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
|
||||
vst3q_u8(dst, tmp);
|
||||
dst += 48;
|
||||
}
|
||||
num_pixels &= 15;
|
||||
VP8LConvertBGRAToRGB_C(src, num_pixels, dst); // left-overs
|
||||
VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst); // left-overs
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// gcc-4.6.0 fallback
|
||||
|
||||
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
|
||||
|
||||
static void ConvertBGRAToRGBA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + (num_pixels & ~1);
|
||||
const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
|
||||
for (; src < end; src += 2) {
|
||||
const uint8x8_t pixels = vld1_u8((uint8_t*)src);
|
||||
vst1_u8(dst, vtbl1_u8(pixels, shuffle));
|
||||
dst += 8;
|
||||
}
|
||||
VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst); // left-overs
|
||||
}
|
||||
|
||||
static const uint8_t kBGRShuffle[3][8] = {
|
||||
{ 0, 1, 2, 4, 5, 6, 8, 9 },
|
||||
{ 10, 12, 13, 14, 16, 17, 18, 20 },
|
||||
{ 21, 22, 24, 25, 26, 28, 29, 30 }
|
||||
};
|
||||
|
||||
static void ConvertBGRAToBGR(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + (num_pixels & ~7);
|
||||
const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
|
||||
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
|
||||
const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
|
||||
for (; src < end; src += 8) {
|
||||
const uint8x8x4_t pixels = {{
|
||||
vld1_u8((const uint8_t*)(src + 0)),
|
||||
vld1_u8((const uint8_t*)(src + 2)),
|
||||
vld1_u8((const uint8_t*)(src + 4)),
|
||||
vld1_u8((const uint8_t*)(src + 6)) }};
|
||||
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
|
||||
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
|
||||
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
|
||||
dst += 8 * 3;
|
||||
}
|
||||
VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst); // left-overs
|
||||
}
|
||||
|
||||
static const uint8_t kRGBShuffle[3][8] = {
|
||||
{ 2, 1, 0, 6, 5, 4, 10, 9 },
|
||||
{ 8, 14, 13, 12, 18, 17, 16, 22 },
|
||||
{ 21, 20, 26, 25, 24, 30, 29, 28 }
|
||||
};
|
||||
|
||||
static void ConvertBGRAToRGB(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + (num_pixels & ~7);
|
||||
const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
|
||||
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
|
||||
const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
|
||||
for (; src < end; src += 8) {
|
||||
const uint8x8x4_t pixels = {{
|
||||
vld1_u8((const uint8_t*)(src + 0)),
|
||||
vld1_u8((const uint8_t*)(src + 2)),
|
||||
vld1_u8((const uint8_t*)(src + 4)),
|
||||
vld1_u8((const uint8_t*)(src + 6)) }};
|
||||
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
|
||||
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
|
||||
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
|
||||
dst += 8 * 3;
|
||||
}
|
||||
VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs
|
||||
}
|
||||
|
||||
#endif // gcc-4.8
|
||||
|
||||
#endif // WEBP_USE_NEON
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@ -71,9 +147,7 @@ static void ConvertBGRAToRGB(const uint32_t* src,
|
||||
extern void VP8LDspInitNEON(void);
|
||||
|
||||
void VP8LDspInitNEON(void) {
|
||||
// TODO(jzern): these are producing incorrect results with a gcc-4.6/NDK
|
||||
// build.
|
||||
#if 0 // defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_USE_NEON)
|
||||
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
|
||||
|
Loading…
Reference in New Issue
Block a user