Implement true-motion prediction in SSE2

(along with DC/HE/VE for chroma/luma16) Overall effect is ~1% faster decoding. Change-Id: I90917e050d61874cbc8da0e88f26b5dd6131c265
2025-07-13 06:24:27 +02:00 · 2015-02-04 17:02:22 +01:00
parent 774d4cb758
commit 19f0ba0eb9
1 changed files with 178 additions and 0 deletions
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@ -1069,6 +1069,166 @@ static void RD4(uint8_t* dst) {   // Down-right
 #undef DST
 #undef AVG3

+//------------------------------------------------------------------------------
+// Luma 16x16
+
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+  const uint8_t* top = dst - BPS;
+  const __m128i zero = _mm_setzero_si128();
+  int y;
+  if (size == 4) {
+    const __m128i top_values = _mm_cvtsi32_si128(*(int*)top);
+    const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+    for (y = 0; y < 4; ++y, dst += BPS) {
+      const int val = dst[-1] - top[-1];
+      const __m128i base = _mm_set1_epi16(val);
+      const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+      *(int*)dst = _mm_cvtsi128_si32(out);
+    }
+  } else if (size == 8) {
+    const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+    const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+    for (y = 0; y < 8; ++y, dst += BPS) {
+      const int val = dst[-1] - top[-1];
+      const __m128i base = _mm_set1_epi16(val);
+      const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+      _mm_storel_epi64((__m128i*)dst, out);
+    }
+  } else {
+    const __m128i top_values = _mm_loadu_si128((const __m128i*)top);
+    const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
+    const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
+    for (y = 0; y < 16; ++y, dst += BPS) {
+      const int val = dst[-1] - top[-1];
+      const __m128i base = _mm_set1_epi16(val);
+      const __m128i out_0 = _mm_add_epi16(base, top_base_0);
+      const __m128i out_1 = _mm_add_epi16(base, top_base_1);
+      const __m128i out = _mm_packus_epi16(out_0, out_1);
+      _mm_storeu_si128((__m128i*)dst, out);
+    }
+  }
+}
+
+static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }
+
+static void VE16(uint8_t* dst) {
+  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
+  int j;
+  for (j = 0; j < 16; ++j) {
+    _mm_storeu_si128((__m128i*)(dst + j * BPS), top);
+  }
+}
+
+static void HE16(uint8_t* dst) {     // horizontal
+  int j;
+  for (j = 16; j > 0; --j) {
+    const __m128i values = _mm_set1_epi8(dst[-1]);
+    _mm_storeu_si128((__m128i*)dst, values);
+    dst += BPS;
+  }
+}
+
+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+  int j;
+  const __m128i values = _mm_set1_epi8(v);
+  for (j = 0; j < 16; ++j) {
+    _mm_storeu_si128((__m128i*)(dst + j * BPS), values);
+  }
+}
+
+static void DC16(uint8_t* dst) {    // DC
+  int DC = 16;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    DC += dst[-1 + j * BPS] + dst[j - BPS];
+  }
+  Put16(DC >> 5, dst);
+}
+
+static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+  int DC = 8;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    DC += dst[-1 + j * BPS];
+  }
+  Put16(DC >> 4, dst);
+}
+
+static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
+  int DC = 8;
+  int i;
+  for (i = 0; i < 16; ++i) {
+    DC += dst[i - BPS];
+  }
+  Put16(DC >> 4, dst);
+}
+
+static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
+  Put16(0x80, dst);
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t* dst) {    // vertical
+  int j;
+  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  for (j = 0; j < 8; ++j) {
+    _mm_storel_epi64((__m128i*)(dst + j * BPS), top);
+  }
+}
+
+static void HE8uv(uint8_t* dst) {    // horizontal
+  int j;
+  for (j = 0; j < 8; ++j) {
+    const __m128i values = _mm_set1_epi8(dst[-1]);
+    _mm_storel_epi64((__m128i*)dst, values);
+    dst += BPS;
+  }
+}
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+  int j;
+  const __m128i values = _mm_set1_epi8(v);
+  for (j = 0; j < 8; ++j) {
+    _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
+  }
+}
+
+static void DC8uv(uint8_t* dst) {     // DC
+  int dc0 = 8;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[i - BPS] + dst[-1 + i * BPS];
+  }
+  Put8x8uv(dc0 >> 4, dst);
+}
+
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+  int dc0 = 4;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[i - BPS];
+  }
+  Put8x8uv(dc0 >> 3, dst);
+}
+
+static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+  int dc0 = 4;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[-1 + i * BPS];
+  }
+  Put8x8uv(dc0 >> 3, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
+  Put8x8uv(0x80, dst);
+}
+
 #endif   // WEBP_USE_SSE2

 //------------------------------------------------------------------------------
@ -1097,10 +1257,28 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;

+  VP8PredLuma4[1] = TM4;
  VP8PredLuma4[2] = VE4;
  VP8PredLuma4[4] = RD4;
  VP8PredLuma4[5] = VR4;
  VP8PredLuma4[6] = LD4;
  VP8PredLuma4[7] = VL4;
+
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
+
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
+
 #endif   // WEBP_USE_SSE2
 }