mirror of
https://github.com/webmproject/libwebp.git
synced 2025-07-16 05:49:51 +02:00
5-7% faster SSE2 versions of YUV->RGB conversion functions
The C-version gets ~7-8% slower in order to match the SSE2 output exactly. The old (now off-by-1) code is kept under the WEBP_YUV_USE_TABLE flag for reference. (note that calc rounding precision is slightly better ~= +0.02dB) on ARM-neon, we somehow recover the ~4% speed that was lost by mimicking the initial C-version (see https://gerrit.chromium.org/gerrit/#/c/41610) Change-Id: Ia4363c5ed9b4c9edff5d932b002e57bb7814bf6f
This commit is contained in:
@ -87,8 +87,8 @@ extern "C" {
|
||||
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
|
||||
\
|
||||
/* pack the alternate pixels */ \
|
||||
PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]); \
|
||||
PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]); \
|
||||
PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \
|
||||
PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \
|
||||
}
|
||||
|
||||
// Turn the macro into a function for reducing code-size when non-critical
|
||||
@ -108,68 +108,68 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
|
||||
Upsample32Pixels(r1, r2, out); \
|
||||
}
|
||||
|
||||
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv, \
|
||||
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \
|
||||
top_dst, bottom_dst, cur_x, num_pixels) { \
|
||||
int n; \
|
||||
for (n = 0; n < (num_pixels); ++n) { \
|
||||
FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n], \
|
||||
FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \
|
||||
top_dst + ((cur_x) + n) * XSTEP); \
|
||||
} \
|
||||
if (bottom_y != NULL) { \
|
||||
for (n = 0; n < (num_pixels); ++n) { \
|
||||
FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n], \
|
||||
FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
|
||||
bottom_dst + ((cur_x) + n) * XSTEP); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
|
||||
top_dst, bottom_dst, cur_x) do { \
|
||||
FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \
|
||||
if (bottom_y != NULL) { \
|
||||
FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \
|
||||
bottom_dst + (cur_x) * XSTEP); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
|
||||
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
|
||||
const uint8_t* top_u, const uint8_t* top_v, \
|
||||
const uint8_t* cur_u, const uint8_t* cur_v, \
|
||||
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
|
||||
int block; \
|
||||
/* 16 byte aligned array to cache reconstructed u and v */ \
|
||||
int uv_pos, pos; \
|
||||
/* 16byte-aligned array to cache reconstructed u and v */ \
|
||||
uint8_t uv_buf[4 * 32 + 15]; \
|
||||
uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
|
||||
const int uv_len = (len + 1) >> 1; \
|
||||
/* 17 pixels must be read-able for each block */ \
|
||||
const int num_blocks = (uv_len - 1) >> 4; \
|
||||
const int leftover = uv_len - num_blocks * 16; \
|
||||
const int last_pos = 1 + 32 * num_blocks; \
|
||||
uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
|
||||
uint8_t* const r_v = r_u + 32; \
|
||||
\
|
||||
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
|
||||
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
|
||||
\
|
||||
assert(len > 0); \
|
||||
/* Treat the first pixel in regular way */ \
|
||||
assert(top_y != NULL); \
|
||||
{ \
|
||||
const int u0 = (top_u[0] + u_diag) >> 1; \
|
||||
const int v0 = (top_v[0] + v_diag) >> 1; \
|
||||
FUNC(top_y[0], u0, v0, top_dst); \
|
||||
{ /* Treat the first pixel in regular way */ \
|
||||
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
|
||||
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
|
||||
const int u0_t = (top_u[0] + u_diag) >> 1; \
|
||||
const int v0_t = (top_v[0] + v_diag) >> 1; \
|
||||
FUNC(top_y[0], u0_t, v0_t, top_dst); \
|
||||
if (bottom_y != NULL) { \
|
||||
const int u0_b = (cur_u[0] + u_diag) >> 1; \
|
||||
const int v0_b = (cur_v[0] + v_diag) >> 1; \
|
||||
FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \
|
||||
} \
|
||||
} \
|
||||
if (bottom_y != NULL) { \
|
||||
const int u0 = (cur_u[0] + u_diag) >> 1; \
|
||||
const int v0 = (cur_v[0] + v_diag) >> 1; \
|
||||
FUNC(bottom_y[0], u0, v0, bottom_dst); \
|
||||
/* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \
|
||||
for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \
|
||||
UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \
|
||||
UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \
|
||||
CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \
|
||||
} \
|
||||
\
|
||||
for (block = 0; block < num_blocks; ++block) { \
|
||||
UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \
|
||||
UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \
|
||||
CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \
|
||||
32 * block + 1, 32) \
|
||||
top_u += 16; \
|
||||
cur_u += 16; \
|
||||
top_v += 16; \
|
||||
cur_v += 16; \
|
||||
if (len > 1) { \
|
||||
const int left_over = ((len + 1) >> 1) - (pos >> 1); \
|
||||
assert(left_over > 0); \
|
||||
UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \
|
||||
UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \
|
||||
CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \
|
||||
pos, len - pos); \
|
||||
} \
|
||||
\
|
||||
UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \
|
||||
UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \
|
||||
CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \
|
||||
last_pos, len - last_pos); \
|
||||
}
|
||||
|
||||
// SSE2 variants of the fancy upsampler.
|
||||
@ -183,6 +183,7 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
|
||||
#undef UPSAMPLE_32PIXELS
|
||||
#undef UPSAMPLE_LAST_BLOCK
|
||||
#undef CONVERT2RGB
|
||||
#undef CONVERT2RGB_32
|
||||
#undef SSE2_UPSAMPLE_FUNC
|
||||
|
||||
#endif // FANCY_UPSAMPLING
|
||||
@ -197,6 +198,7 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
|
||||
|
||||
void WebPInitUpsamplersSSE2(void) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
VP8YUVInitSSE2();
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;
|
||||
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;
|
||||
@ -214,7 +216,7 @@ void WebPInitPremultiplySSE2(void) {
|
||||
#else
|
||||
|
||||
// this empty function is to avoid an empty .o
|
||||
void WebPInitPremultiplySSE2(void);
|
||||
void WebPInitPremultiplySSE2(void) {}
|
||||
|
||||
#endif // FANCY_UPSAMPLING
|
||||
|
||||
|
Reference in New Issue
Block a user