Merge "fix the gcc-4.6.0 bug by implementing alternative method"

This commit is contained in:
skal 2014-04-08 23:25:59 -07:00 committed by Gerrit Code Review
commit c503b485b6

View File

@ -13,7 +13,7 @@
#include "./dsp.h" #include "./dsp.h"
#if 0 // defined(WEBP_USE_NEON) #if defined(WEBP_USE_NEON)
#include <arm_neon.h> #include <arm_neon.h>
@ -22,10 +22,17 @@
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Colorspace conversion functions // Colorspace conversion functions
// Packs the GCC major and minor version numbers into a single comparable
// value: the major number is shifted left by 8 bits and OR-ed with the minor
// number (e.g. 4.8 gives 0x408). Left undefined when __GNUC__ is not defined.
#if defined(__GNUC__)
#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
#endif
#if defined(LOCAL_GCC_VERSION) && (LOCAL_GCC_VERSION >= 0x408)
// gcc 4.6.0 (NDK-r9) had some trouble with this code, so we only use it with
// gcc-4.8.x or later.
static void ConvertBGRAToRGBA(const uint32_t* src, static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) { int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + num_pixels - 16; const uint32_t* const end = src + (num_pixels & ~15);
for (; src <= end; src += 16) { for (; src < end; src += 16) {
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
// swap B and R. (VSWP d0,d2 has no intrinsics equivalent!) // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
const uint8x16_t tmp = pixel.val[0]; const uint8x16_t tmp = pixel.val[0];
@ -34,36 +41,105 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
vst4q_u8(dst, pixel); vst4q_u8(dst, pixel);
dst += 64; dst += 64;
} }
num_pixels &= 15; VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
VP8LConvertBGRAToRGBA_C(src, num_pixels, dst); // left-overs
} }
static void ConvertBGRAToBGR(const uint32_t* src, static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) { int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + num_pixels - 16; const uint32_t* const end = src + (num_pixels & ~15);
for (; src <= end; src += 16) { for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } }; const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
vst3q_u8(dst, tmp); vst3q_u8(dst, tmp);
dst += 48; dst += 48;
} }
num_pixels &= 15; VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
VP8LConvertBGRAToBGR_C(src, num_pixels, dst); // left-overs
} }
static void ConvertBGRAToRGB(const uint32_t* src, static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) { int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + num_pixels - 16; const uint32_t* const end = src + (num_pixels & ~15);
for (; src <= end; src += 16) { for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } }; const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
vst3q_u8(dst, tmp); vst3q_u8(dst, tmp);
dst += 48; dst += 48;
} }
num_pixels &= 15; VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst); // left-overs
VP8LConvertBGRAToRGB_C(src, num_pixels, dst); // left-overs
} }
#else
// gcc-4.6.0 fallback
// Table-lookup indices covering two 4-byte pixels (8 bytes): bytes 0 and 2 of
// each pixel are exchanged (the B and R channels, going by the function name)
// while bytes 1 and 3 stay in place.
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };

// Converts 'num_pixels' 4-byte pixels from 'src' and writes 4 bytes per pixel
// to 'dst', with the first and third byte of every pixel swapped.
// Pixels are handled two at a time (8 bytes) through a NEON table lookup; an
// odd trailing pixel is delegated to VP8LConvertBGRAToRGBA_C.
static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~1);
  const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
  for (; src < end; src += 2) {
    // vld1_u8() only reads its argument, so keep the const qualifier instead
    // of casting it away.
    const uint8x8_t pixels = vld1_u8((const uint8_t*)src);
    vst1_u8(dst, vtbl1_u8(pixels, shuffle));
    dst += 8;
  }
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst);  // left-overs
}
static const uint8_t kBGRShuffle[3][8] = {
{ 0, 1, 2, 4, 5, 6, 8, 9 },
{ 10, 12, 13, 14, 16, 17, 18, 20 },
{ 21, 22, 24, 25, 26, 28, 29, 30 }
};
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
for (; src < end; src += 8) {
const uint8x8x4_t pixels = {{
vld1_u8((const uint8_t*)(src + 0)),
vld1_u8((const uint8_t*)(src + 2)),
vld1_u8((const uint8_t*)(src + 4)),
vld1_u8((const uint8_t*)(src + 6)) }};
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
dst += 8 * 3;
}
VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst); // left-overs
}
static const uint8_t kRGBShuffle[3][8] = {
{ 2, 1, 0, 6, 5, 4, 10, 9 },
{ 8, 14, 13, 12, 18, 17, 16, 22 },
{ 21, 20, 26, 25, 24, 30, 29, 28 }
};
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
for (; src < end; src += 8) {
const uint8x8x4_t pixels = {{
vld1_u8((const uint8_t*)(src + 0)),
vld1_u8((const uint8_t*)(src + 2)),
vld1_u8((const uint8_t*)(src + 4)),
vld1_u8((const uint8_t*)(src + 6)) }};
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
dst += 8 * 3;
}
VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs
}
#endif // gcc-4.8
#endif // WEBP_USE_NEON #endif // WEBP_USE_NEON
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
@ -71,9 +147,7 @@ static void ConvertBGRAToRGB(const uint32_t* src,
extern void VP8LDspInitNEON(void); extern void VP8LDspInitNEON(void);
void VP8LDspInitNEON(void) { void VP8LDspInitNEON(void) {
// TODO(jzern): these are producing incorrect results with a gcc-4.6/NDK #if defined(WEBP_USE_NEON)
// build.
#if 0 // defined(WEBP_USE_NEON)
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR; VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB; VP8LConvertBGRAToRGB = ConvertBGRAToRGB;