SSE2 implementation of ImportRowShrink

some limitations: only for RGBA output,
and if reduction factor is not too small (dst_width > src_width / 128)

20-25% faster, ~4-6% global improvement total decoding.

Change-Id: I95366ddaa4a38e0a96bed754dfe790126f7bb84a
This commit is contained in:
Pascal Massimino 2015-10-07 21:34:22 -07:00
parent b4e731cd93
commit 932fd4df61

View File

@ -25,6 +25,63 @@
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_sub = wrk->x_sub;
int accum = 0;
const __m128i zero = _mm_setzero_si128();
const __m128i mult0 = _mm_set1_epi16(x_sub);
const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
__m128i sum = zero;
rescaler_t* frow = wrk->frow;
const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;
if (wrk->num_channels != 4) {
return WebPRescalerImportRowShrinkC(wrk, src);
}
if (wrk->x_add > (x_sub << 7)) {
return WebPRescalerImportRowShrinkC(wrk, src);
}
assert(!WebPRescalerInputDone(wrk));
assert(!wrk->x_expand);
for (; frow < frow_end; frow += 4) {
__m128i base = zero;
accum += wrk->x_add;
while (accum > 0) {
const __m128i A = _mm_cvtsi32_si128(*(int*)src);
src += 4;
base = _mm_unpacklo_epi8(A, zero);
// To avoid overflow, we need: base * x_add / x_sub < 32768
// => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
sum = _mm_add_epi16(sum, base);
accum -= x_sub;
}
{ // Emit next horizontal pixel.
const __m128i mult = _mm_set1_epi16(-accum);
const __m128i frac0 = _mm_mullo_epi16(base, mult); // 16b x 16b -> 32b
const __m128i frac1 = _mm_mulhi_epu16(base, mult);
const __m128i frac = _mm_unpacklo_epi16(frac0, frac1); // frac is 32b
const __m128i A0 = _mm_mullo_epi16(sum, mult0);
const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // sum * x_sub
const __m128i frow_out = _mm_sub_epi32(B0, frac); // sum * x_sub - frac
const __m128i D0 = _mm_srli_epi64(frac, 32);
const __m128i D1 = _mm_mul_epu32(frac, mult1); // 32b x 16b -> 64b
const __m128i D2 = _mm_mul_epu32(D0, mult1);
const __m128i E1 = _mm_add_epi64(D1, rounder);
const __m128i E2 = _mm_add_epi64(D2, rounder);
const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
const __m128i G = _mm_unpacklo_epi32(F1, F2);
sum = _mm_packs_epi32(G, zero);
_mm_storeu_si128((__m128i*)frow, frow_out);
}
}
assert(accum == 0);
}
//------------------------------------------------------------------------------
// Row export
@ -222,6 +279,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
}