From 690b491af16507701fe2da6ec7d51b6c2c1c126b Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Thu, 11 Sep 2014 22:35:08 +0200 Subject: [PATCH] fix loop bug in DispatchAlpha() * We were re-doing most of the work in plain-C as 'left-over'. * we were always returning has_alpha = true because of a bad mask all_0xff These bugs were conservative and silent, in the sense that we were 'just' doing more work than necessary. Now, the SSE2 version is really 2x faster than the C version. Change-Id: I6c8132a267fe3c7a3d1fa70e7a5fcd10719543fa --- src/dec/io.c | 2 +- src/dsp/alpha_processing_sse2.c | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/dec/io.c b/src/dec/io.c index 623af93c..afedf26b 100644 --- a/src/dec/io.c +++ b/src/dec/io.c @@ -210,7 +210,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w, num_rows, dst, buf->stride); - // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with. + // has_alpha is true if there's non-trivial alpha to premultiply with. if (has_alpha && WebPIsPremultipliedMode(colorspace)) { WebPApplyAlphaMultiply(base_rgba, alpha_first, mb_w, num_rows, buf->stride); diff --git a/src/dsp/alpha_processing_sse2.c b/src/dsp/alpha_processing_sse2.c index bc6adf80..5d6dcaf4 100644 --- a/src/dsp/alpha_processing_sse2.c +++ b/src/dsp/alpha_processing_sse2.c @@ -27,20 +27,19 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride, int i, j; const __m128i zero = _mm_setzero_si128(); const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB - const __m128i all_0xff = _mm_set_epi32(~0u, ~0u, 0, 0); + const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); __m128i all_alphas = all_0xff; // We must be able to access 3 extra bytes after the last written byte // 'dst[4 * width - 4]', because we don't know if alpha is the first or the // last byte of the quadruplet. - const int limit = (width - 1) >> 3; + const int limit = (width - 1) & ~7; for (j = 0; j < height; ++j) { - const uint8_t* in = alpha; __m128i* out = (__m128i*)dst; - for (i = 0; i < limit; ++i) { + for (i = 0; i < limit; i += 8) { // load 8 alpha bytes - const __m128i a0 = _mm_loadl_epi64((__m128i*)in); // zeroes upper bytes + const __m128i a0 = _mm_loadl_epi64((__m128i*)&alpha[i]); const __m128i a1 = _mm_unpacklo_epi8(a0, zero); const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero); const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero); @@ -59,7 +58,6 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride, // accumulate eight alpha 'and' in parallel all_alphas = _mm_and_si128(all_alphas, a0); out += 2; - in += 8; } for (; i < width; ++i) { const uint32_t alpha_value = alpha[i];