10% faster table-less SSE2/NEON version of YUV->RGB conversion

* Precision is slightly different from the previous implementation
* Also implemented in SSE2 the previously missing WebPUpsamplers for MODE_ARGB, MODE_Argb, MODE_RGB565, etc.
* Removing yuv_tables_sse2.h saved ~8k of binary size
* The mips32/mips_dsp_r2 code is disabled for now, since it has drifted out of sync with the rest of the code
* The NEON code is somewhat tricky

Change-Id: Icf205faa62cf46c2825d79f3af6725dc1ec7f052
This commit is contained in:
Pascal Massimino
2015-12-08 12:31:47 +01:00
committed by James Zern
parent bd91af200a
commit ac761a3738
11 changed files with 286 additions and 887 deletions

View File

@ -89,7 +89,8 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
//-----------------------------------------------------------------------------
// YUV->RGB conversion
static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
// Note: the constant 33050 does not fit in an int16_t, so it is represented as 32768 + 282.
static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
#define v255 vdup_n_u8(255)
@ -117,38 +118,35 @@ static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
vst4_u8(out, b_g_r_v255); \
} while (0)
#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \
#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) do { \
int i; \
for (i = 0; i < N; i += 8) { \
const int off = ((cur_x) + i) * XSTEP; \
uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \
uint8x8_t u = vld1_u8((src_uv) + i); \
uint8x8_t v = vld1_u8((src_uv) + i + 16); \
const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \
const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \
const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \
int32x4_t yl = vmull_lane_s16(vget_low_s16(yy), cf16, 0); \
int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0); \
const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv), cf16, 1);\
const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu), cf16, 2); \
int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2); \
const int32x4_t bl = vmovl_s16(vget_low_s16(uu)); \
const int32x4_t bh = vmovl_s16(vget_high_s16(uu)); \
gl = vmlsl_lane_s16(gl, vget_low_s16(vv), cf16, 3); \
gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3); \
yl = vmlaq_lane_s32(yl, bl, cf32, 0); \
yh = vmlaq_lane_s32(yh, bh, cf32, 0); \
/* vrshrn_n_s32() already incorporates the rounding constant */ \
y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2), \
vrshrn_n_s32(rh, YUV_FIX2))); \
u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2), \
vrshrn_n_s32(gh, YUV_FIX2))); \
v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2), \
vrshrn_n_s32(yh, YUV_FIX2))); \
STORE_ ## FMT(out + off, y, u, v); \
const uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \
const uint8x8_t u = vld1_u8((src_uv) + i + 0); \
const uint8x8_t v = vld1_u8((src_uv) + i + 16); \
const int16x8_t Y0 = vreinterpretq_s16_u16(vshll_n_u8(y, 7)); \
const int16x8_t U0 = vreinterpretq_s16_u16(vshll_n_u8(u, 7)); \
const int16x8_t V0 = vreinterpretq_s16_u16(vshll_n_u8(v, 7)); \
const int16x8_t Y1 = vqdmulhq_lane_s16(Y0, coeff1, 0); \
const int16x8_t R0 = vqdmulhq_lane_s16(V0, coeff1, 1); \
const int16x8_t G0 = vqdmulhq_lane_s16(U0, coeff1, 2); \
const int16x8_t G1 = vqdmulhq_lane_s16(V0, coeff1, 3); \
const int16x8_t B0 = vqdmulhq_n_s16(U0, 282); \
const int16x8_t R1 = vqaddq_s16(Y1, R_Rounder); \
const int16x8_t G2 = vqaddq_s16(Y1, G_Rounder); \
const int16x8_t B1 = vqaddq_s16(Y1, B_Rounder); \
const int16x8_t R2 = vqaddq_s16(R0, R1); \
const int16x8_t G3 = vqaddq_s16(G0, G1); \
const int16x8_t B2 = vqaddq_s16(B0, B1); \
const int16x8_t G4 = vqsubq_s16(G2, G3); \
const int16x8_t B3 = vqaddq_s16(B2, U0); \
const uint8x8_t R = vqshrun_n_s16(R2, YUV_FIX2); \
const uint8x8_t G = vqshrun_n_s16(G4, YUV_FIX2); \
const uint8x8_t B = vqshrun_n_s16(B3, YUV_FIX2); \
STORE_ ## FMT(out + off, R, G, B); \
} \
}
} while (0)
#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) { \
int i; \
@ -163,9 +161,9 @@ static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv, \
top_dst, bottom_dst, cur_x, len) { \
CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x) \
CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x); \
if (bottom_y != NULL) { \
CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x) \
CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
} \
}
@ -195,10 +193,10 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
\
const int16x4_t cf16 = vld1_s16(kCoeffs); \
const int32x2_t cf32 = vdup_n_s32(kUToB); \
const uint8x8_t u16 = vdup_n_u8(16); \
const uint8x8_t u128 = vdup_n_u8(128); \
const int16x4_t coeff1 = vld1_s16(kCoeffs1); \
const int16x8_t R_Rounder = vdupq_n_s16(-14234); \
const int16x8_t G_Rounder = vdupq_n_s16(8708); \
const int16x8_t B_Rounder = vdupq_n_s16(-17685); \
\
/* Treat the first pixel in regular way */ \
assert(top_y != NULL); \