From 690b491af16507701fe2da6ec7d51b6c2c1c126b Mon Sep 17 00:00:00 2001
From: Pascal Massimino <pascal.massimino@gmail.com>
Date: Thu, 11 Sep 2014 22:35:08 +0200
Subject: [PATCH] fix loop bug in DispatchAlpha()

* We were redoing most of the work in plain C as 'left-over'.
* We were always returning has_alpha = true because of a bad all_0xff mask.

These bugs were conservative and silent, in the sense that we were 'just' doing
more work than necessary.

Now, the SSE2 version is really 2x faster than the C version.

Change-Id: I6c8132a267fe3c7a3d1fa70e7a5fcd10719543fa
---
 src/dec/io.c                    |  2 +-
 src/dsp/alpha_processing_sse2.c | 10 ++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/dec/io.c b/src/dec/io.c
index 623af93c..afedf26b 100644
--- a/src/dec/io.c
+++ b/src/dec/io.c
@@ -210,7 +210,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
     const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
                                             num_rows, dst, buf->stride);
 
-    // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
+    // has_alpha is true if there's non-trivial alpha to premultiply with.
     if (has_alpha && WebPIsPremultipliedMode(colorspace)) {
       WebPApplyAlphaMultiply(base_rgba, alpha_first,
                              mb_w, num_rows, buf->stride);
diff --git a/src/dsp/alpha_processing_sse2.c b/src/dsp/alpha_processing_sse2.c
index bc6adf80..5d6dcaf4 100644
--- a/src/dsp/alpha_processing_sse2.c
+++ b/src/dsp/alpha_processing_sse2.c
@@ -27,20 +27,19 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
   int i, j;
   const __m128i zero = _mm_setzero_si128();
   const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
-  const __m128i all_0xff = _mm_set_epi32(~0u, ~0u, 0, 0);
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
   __m128i all_alphas = all_0xff;
 
   // We must be able to access 3 extra bytes after the last written byte
   // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
   // last byte of the quadruplet.
-  const int limit = (width - 1) >> 3;
+  const int limit = (width - 1) & ~7;
 
   for (j = 0; j < height; ++j) {
-    const uint8_t* in = alpha;
     __m128i* out = (__m128i*)dst;
-    for (i = 0; i < limit; ++i) {
+    for (i = 0; i < limit; i += 8) {
       // load 8 alpha bytes
-      const __m128i a0 = _mm_loadl_epi64((__m128i*)in);   // zeroes upper bytes
+      const __m128i a0 = _mm_loadl_epi64((__m128i*)&alpha[i]);
       const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
       const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
       const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
@@ -59,7 +58,6 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
       // accumulate eight alpha 'and' in parallel
       all_alphas = _mm_and_si128(all_alphas, a0);
       out += 2;
-      in += 8;
     }
     for (; i < width; ++i) {
       const uint32_t alpha_value = alpha[i];