fix loop bug in DispatchAlpha()

* We were re-doing most of the work in plain C as 'left-over'.
* We were always returning has_alpha = true because of a bad mask all_0xff.

These bugs were conservative and silent, in the sense that we were 'just' doing more work than necessary. Now, the SSE2 version is really 2x faster than the C version.

Change-Id: I6c8132a267fe3c7a3d1fa70e7a5fcd10719543fa
2025-06-07 14:34:22 +02:00 · 2014-09-11 22:35:08 +02:00 · 2014-09-11 22:35:08 +02:00 · 690b491af1
commit 690b491af1
parent 3101f53720
2 changed files with 5 additions and 7 deletions
--- a/src/dec/io.c
+++ b/src/dec/io.c
@@ -210,7 +210,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
    const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
                                            num_rows, dst, buf->stride);
-    // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
+    // has_alpha is true if there's non-trivial alpha to premultiply with.
    if (has_alpha && WebPIsPremultipliedMode(colorspace)) {
      WebPApplyAlphaMultiply(base_rgba, alpha_first,
                             mb_w, num_rows, buf->stride);
--- a/src/dsp/alpha_processing_sse2.c
+++ b/src/dsp/alpha_processing_sse2.c
@@ -27,20 +27,19 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
-  const __m128i all_0xff = _mm_set_epi32(~0u, ~0u, 0, 0);
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;
  // We must be able to access 3 extra bytes after the last written byte
  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
-  const int limit = (width - 1) >> 3;
+  const int limit = (width - 1) & ~7;
  for (j = 0; j < height; ++j) {
-    const uint8_t* in = alpha;
    __m128i* out = (__m128i*)dst;
-    for (i = 0; i < limit; ++i) {
+    for (i = 0; i < limit; i += 8) {
      // load 8 alpha bytes
-      const __m128i a0 = _mm_loadl_epi64((__m128i*)in);   // zeroes upper bytes
+      const __m128i a0 = _mm_loadl_epi64((__m128i*)&alpha[i]);
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
@@ -59,7 +58,6 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, a0);
      out += 2;
-      in += 8;
    }
    for (; i < width; ++i) {
      const uint32_t alpha_value = alpha[i];