diff --git a/src/dsp/rescaler_sse2.c b/src/dsp/rescaler_sse2.c index 01337cf8..20f8d9c5 100644 --- a/src/dsp/rescaler_sse2.c +++ b/src/dsp/rescaler_sse2.c @@ -25,6 +25,63 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk, + const uint8_t* src) { + const int x_sub = wrk->x_sub; + int accum = 0; + const __m128i zero = _mm_setzero_si128(); + const __m128i mult0 = _mm_set1_epi16(x_sub); + const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale); + const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); + __m128i sum = zero; + rescaler_t* frow = wrk->frow; + const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width; + + if (wrk->num_channels != 4) { + return WebPRescalerImportRowShrinkC(wrk, src); + } + if (wrk->x_add > (x_sub << 7)) { + return WebPRescalerImportRowShrinkC(wrk, src); + } + assert(!WebPRescalerInputDone(wrk)); + assert(!wrk->x_expand); + + for (; frow < frow_end; frow += 4) { + __m128i base = zero; + accum += wrk->x_add; + while (accum > 0) { + const __m128i A = _mm_cvtsi32_si128(*(int*)src); + src += 4; + base = _mm_unpacklo_epi8(A, zero); + // To avoid overflow, we need: base * x_add / x_sub < 32768 + // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit. + sum = _mm_add_epi16(sum, base); + accum -= x_sub; + } + { // Emit next horizontal pixel. + const __m128i mult = _mm_set1_epi16(-accum); + const __m128i frac0 = _mm_mullo_epi16(base, mult); // 16b x 16b -> 32b + const __m128i frac1 = _mm_mulhi_epu16(base, mult); + const __m128i frac = _mm_unpacklo_epi16(frac0, frac1); // frac is 32b + const __m128i A0 = _mm_mullo_epi16(sum, mult0); + const __m128i A1 = _mm_mulhi_epu16(sum, mult0); + const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // sum * x_sub + const __m128i frow_out = _mm_sub_epi32(B0, frac); // sum * x_sub - frac + const __m128i D0 = _mm_srli_epi64(frac, 32); + const __m128i D1 = _mm_mul_epu32(frac, mult1); // 32b x 16b -> 64b + const __m128i D2 = _mm_mul_epu32(D0, mult1); + const __m128i E1 = _mm_add_epi64(D1, rounder); + const __m128i E2 = _mm_add_epi64(D2, rounder); + const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2)); + const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2)); + const __m128i G = _mm_unpacklo_epi32(F1, F2); + sum = _mm_packs_epi32(G, zero); + _mm_storeu_si128((__m128i*)frow, frow_out); + } + } + assert(accum == 0); +} + //------------------------------------------------------------------------------ // Row export @@ -222,6 +279,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { extern void WebPRescalerDspInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) { + WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2; WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2; WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2; }