mirror of
https://github.com/webmproject/libwebp.git
synced 2024-11-20 04:18:26 +01:00
SSE2 implementation of ImportRowShrink
Some limitations: it is only used for RGBA output, and only if the reduction factor is not too small (dst_width > src_width / 128). It is 20-25% faster, for a ~4-6% global improvement in total decoding time. Change-Id: I95366ddaa4a38e0a96bed754dfe790126f7bb84a
This commit is contained in:
parent
b4e731cd93
commit
932fd4df61
@ -25,6 +25,63 @@
|
||||
// Half of WEBP_RESCALER_ONE. MULT_FIX adds it before shifting so that the
// result is rounded rather than truncated (presumably WEBP_RESCALER_ONE is
// 1 << WEBP_RESCALER_RFIX -- confirm against the header that defines both).
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
|
||||
// Multiplies 'x' (widened to uint64_t) by 'y', adds ROUNDER and shifts the
// product right by WEBP_RESCALER_RFIX bits. Both arguments are parenthesized,
// and each is evaluated exactly once.
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
|
||||
|
||||
// SSE2 version of the horizontal 'shrink' row import.
//
// Reads 4-channel source pixels from 'src' and writes wrk->dst_width output
// pixels (4 rescaler_t values each) into wrk->frow. Every output pixel is the
// sum of the source pixels it covers, multiplied by wrk->x_sub, minus the
// fraction of the last source pixel that belongs to the next output pixel.
// That fraction, scaled by wrk->fx_scale, seeds the next pixel's sum.
//
// Falls back to WebPRescalerImportRowShrinkC() when:
//  - wrk->num_channels is not 4, or
//  - wrk->x_add > (wrk->x_sub << 7), i.e. the reduction is too strong for the
//    16-bit accumulators used below.
static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
                                        const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  // ROUNDER sits in the low 32 bits of each 64-bit lane.
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  // A 'return' statement with an expression is not allowed in a function
  // returning void (ISO C), so call the fallback and return separately.
  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrinkC(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    while (accum > 0) {
      // Assemble the four channel bytes explicitly (little-endian order, which
      // is what a 32-bit load yields on x86) instead of dereferencing 'src'
      // through an int pointer: that cast breaks strict-aliasing rules and
      // assumes an alignment 'src' does not guarantee.
      const uint32_t pixel = (uint32_t)src[0] | ((uint32_t)src[1] << 8) |
                             ((uint32_t)src[2] << 16) |
                             ((uint32_t)src[3] << 24);
      const __m128i A = _mm_cvtsi32_si128((int)pixel);
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);   // 4 x 8b -> 4 x 16b
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {   // Emit next horizontal pixel.
      // -accum is the weight of the last source pixel ('base') that spills
      // over into the next output pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      // _mm_mul_epu32 only multiplies 32-bit lanes 0 and 2, so lanes 1 and 3
      // of 'frac' are shifted down and handled by a second multiply.
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      // Keep the upper 32 bits of each 64-bit result (a right shift by 32).
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      // Re-interleave into the original channel order: 0, 1, 2, 3.
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);   // carried over to the next pixel
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Row export
|
||||
|
||||
@ -222,6 +279,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
|
||||
extern void WebPRescalerDspInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
|
||||
WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
|
||||
WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
|
||||
WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user