SSE2 version of GradientUnfilter

somewhat 1-2% faster decoder for lossy+alpha Change-Id: Ib317e26e9fcb8d37af02668ffbfccc4664e659fe
2025-08-11 02:20:33 +02:00 · 2015-01-31 23:16:10 +01:00
parent 72831f6b28
commit 77724f70e9
1 changed files with 45 additions and 15 deletions
--- a/src/dsp/filters_sse2.c
+++ b/src/dsp/filters_sse2.c
@ -36,6 +36,7 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
                           uint8_t* dst, int length, int inverse) {
  int i;
  const int max_pos = length & ~31;
+  assert(length >= 0);
  if (inverse) {
    for (i = 0; i < max_pos; i += 32) {
      const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i +  0]);
@ -67,10 +68,10 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
 static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length,
                            int inverse) {
  int i;
+  if (length <= 0) return;
  if (inverse) {
    const int max_pos = length & ~7;
-    __m128i last =
-        _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, dst[-1]);
+    __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]);
    for (i = 0; i < max_pos; i += 8) {
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i));
      const __m128i A1 = _mm_add_epi8(A0, last);
@ -218,45 +219,74 @@ static void GradientPredictDirect(const uint8_t* const row,
  }
 }

+static void GradientPredictInverse(const uint8_t* const in,
+                                   const uint8_t* const top,
+                                   uint8_t* const row, int length) {
+  if (length > 0) {
+    int i;
+    const int max_pos = length & ~7;
+    const __m128i zero = _mm_setzero_si128();
+    __m128i A = _mm_set_epi32(0, 0, 0, row[-1]);   // left sample
+    for (i = 0; i < max_pos; i += 8) {
+      const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]);
+      const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
+      const __m128i B = _mm_unpacklo_epi8(tmp0, zero);
+      const __m128i C = _mm_unpacklo_epi8(tmp1, zero);
+      const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]);
+      const __m128i D = _mm_unpacklo_epi8(tmp2, zero);   // base input
+      const __m128i E = _mm_sub_epi16(B, C);  // unclipped gradient basis B - C
+      __m128i out = zero;                     // accumulator for output
+      __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff);
+      int k = 8;
+      while (1) {
+        const __m128i tmp3 = _mm_add_epi16(A, E);        // delta = A + B - C
+        const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi);
+        const __m128i tmp5 = _mm_max_epi16(tmp4, zero);  // clipped delta
+        const __m128i tmp6 = _mm_add_epi16(tmp5, D);     // add to in[] values
+        A = _mm_and_si128(tmp6, mask_hi);                // 1-complement clip
+        out = _mm_or_si128(out, A);                      // accumulate output
+        if (--k == 0) break;
+        A = _mm_slli_si128(A, 2);                        // rotate left sample
+        mask_hi = _mm_slli_si128(mask_hi, 2);            // rotate mask
+      }
+      A = _mm_srli_si128(A, 14);       // prepare left sample for next iteration
+      _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero));
+    }
+    for (; i < length; ++i) {
+      row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+    }
+  }
+}
+
 static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
                                         int width, int height, int stride,
                                         int row, int num_rows,
                                         int inverse, uint8_t* out) {
-  const uint8_t* preds;
  const size_t start_offset = row * stride;
  const int last_row = row + num_rows;
  SANITY_CHECK(in, out);
  in += start_offset;
  out += start_offset;
-  preds = inverse ? out : in;

  // left prediction for top scan-line
  if (row == 0) {
    out[0] = in[0];
    PredictLineLeft(in + 1, out + 1, width - 1, inverse);
    row = 1;
-    preds += stride;
    in += stride;
    out += stride;
  }

  // Filter line-by-line.
  while (row < last_row) {
-    int w;
-    // leftmost pixel: predict from above.
-    PredictLineC(in, preds - stride, out, 1, inverse);
    if (inverse) {
-      for (w = 1; w < width; ++w) {
-        const int pred = GradientPredictorC(out[w - 1],
-                                            out[w - stride],
-                                            out[w - stride - 1]);
-        out[w] = in[w] + pred;
-      }
+      PredictLineC(in, out - stride, out, 1, inverse);  // predict from above
+      GradientPredictInverse(in + 1, out + 1 - stride, out + 1, width - 1);
    } else {
+      PredictLineC(in, in - stride, out, 1, inverse);
      GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
    }
    ++row;
-    preds += stride;
    in += stride;
    out += stride;
  }