sse2 version of the complex filter

12-15% faster (only the inner-edge filters are implemented for now). Patch by Somnath Banerjee (somnath at google dot com)
2025-10-28 00:53:03 +01:00 · 2011-06-20 00:27:47 -07:00
parent 96ed9ce0fb
commit bd2f65f67c
1 changed files with 303 additions and 77 deletions
--- a/src/dec/dsp_sse2.c
+++ b/src/dec/dsp_sse2.c
@@ -245,8 +245,8 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
 // Compute abs(p - q) = subs(p - q) OR subs(q - p)
 #define MM_ABS(p, q)  _mm_or_si128(                                     \
-  _mm_subs_epu8(*(q), *(p)),                                            \
+    _mm_subs_epu8((q), (p)),                                            \
-  _mm_subs_epu8(*(p), *(q)))
+    _mm_subs_epu8((p), (q)))
 // Shift each byte of "a" by N bits while preserving the sign bit.
 //
@@ -264,23 +264,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
  a = _mm_or_si128(t, a);                                               \
 }
 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
                        const __m128i* q1, int thresh, __m128i *mask) {
-  __m128i t1, t2;
+  __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
  *mask = _mm_set1_epi8(0xFE);
  t1 = _mm_and_si128(t1, *mask);        // set lsb of each byte to zero
  t1 = _mm_srli_epi16(t1, 1);           // abs(p1 - q1) / 2
-  t1 = MM_ABS(p1, q1);               // abs(p1 - q1)
+  *mask = MM_ABS(*p0, *q0);             // abs(p0 - q0)
-  t2 = _mm_set1_epi8(0xFE);
+  *mask = _mm_adds_epu8(*mask, *mask);  // abs(p0 - q0) * 2
-  t1 = _mm_and_si128(t1, t2);        // set lsb of each byte to zero
+  *mask = _mm_adds_epu8(*mask, t1);     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
  t1 = _mm_srli_epi16(t1, 1);        // abs(p1 - q1) / 2
  t2 = MM_ABS(p0, q0);
  t2 = _mm_adds_epu8(t2, t2);        // abs(p0 - q0) * 2
  t2 = _mm_adds_epu8(t2, t1);        // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
  t1 = _mm_set1_epi8(thresh);
-  t2 = _mm_subs_epu8(t2, t1);  // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > thresh
+  *mask = _mm_subs_epu8(*mask, t1);     // mask <= thresh
-  *mask = _mm_cmpeq_epi8(t2, _mm_setzero_si128());
+  *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
 }
 //-----------------------------------------------------------------------------
@@ -308,21 +305,82 @@ static void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
  // Do +4 side
  t2 = _mm_set1_epi8(4);
-  t2 = _mm_adds_epi8(t2, t1);        // 3 * (q0 - p0) + (p1 - q1) + 4
+  t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 4
  SIGNED_SHIFT_N(t2, 3);             // t2 >> 3
-  *q0 = _mm_subs_epi8(*q0, t2);      // q0 -= a
+  *q0 = _mm_subs_epi8(*q0, t2);      // q0 -= t2
  // Now do +3 side
  t2 = _mm_set1_epi8(3);
-  t2 = _mm_adds_epi8(t2, t1);        // +3 instead of +4
+  t2 = _mm_adds_epi8(t1, t2);        // +3 instead of +4
  SIGNED_SHIFT_N(t2, 3);             // t2 >> 3
-  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += b
+  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
  // unoffset
  *p0 = _mm_xor_si128(*p0, sign_bit);
  *q0 = _mm_xor_si128(*q0, sign_bit);
 }
 // Applies the filter on p1, p0, q0 and q1, modifying all four in place.
 //
 //   p1, p0, q0, q1: the two pixels on each side of the edge (p* before it,
 //                   q* after it), one pixel per byte lane. Read and written.
 //   mask:           per-byte mask ANDed with the filter value; lanes where it
 //                   is zero receive a zero adjustment.
 //   hev_thresh:     threshold broadcast to every byte and used to build the
 //                   'hev' mask: lanes where abs(p1 - p0) or abs(q1 - q0)
 //                   exceeds it.
 //
 // Where hev is set, (p1 - q1) contributes to the filter value and only p0/q0
 // are adjusted; where it is not set, p1/q1 are adjusted as well.
 static void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1,
                       const __m128i* mask, int hev_thresh) {
  __m128i t1, t2, t3;
  __m128i hev = _mm_set1_epi8(hev_thresh);
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  // compute hev mask
  t1 = MM_ABS(*p1, *p0);
  t2 = MM_ABS(*q1, *q0);
  t1 = _mm_subs_epu8(t1, hev);       // abs(p1 - p0) - hev_thresh, 0 if <= thresh
  t2 = _mm_subs_epu8(t2, hev);       // abs(q1 - q0) - hev_thresh, 0 if <= thresh
  hev = _mm_or_si128(t1, t2);        // non-zero where either diff > hev_thresh
  t1 = _mm_setzero_si128();
  hev = _mm_cmpeq_epi8(hev, t1);     // 0xff where both diffs <= hev_thresh
  t1 = _mm_set1_epi16(0xffff);       // load 0xff on all bytes
  hev = _mm_xor_si128(hev, t1);      // invert: 0xff where either diff > hev_thresh
  // convert to signed values (flipping the top bit offsets each byte by 128)
  *p0 = _mm_xor_si128(*p0, sign_bit);
  *q0 = _mm_xor_si128(*q0, sign_bit);
  *p1 = _mm_xor_si128(*p1, sign_bit);
  *q1 = _mm_xor_si128(*q1, sign_bit);
  // All additions/subtractions below are the saturating signed variants.
  t1 = _mm_subs_epi8(*p1, *q1);      // p1 - q1
  t1 = _mm_and_si128(hev, t1);       // hev(p1 - q1)
  t2 = _mm_subs_epi8(*q0, *p0);      // q0 - p0
  t1 = _mm_adds_epi8(t1, t2);        // hev(p1 - q1) + 1 * (q0 - p0)
  t1 = _mm_adds_epi8(t1, t2);        // hev(p1 - q1) + 2 * (q0 - p0)
  t1 = _mm_adds_epi8(t1, t2);        // hev(p1 - q1) + 3 * (q0 - p0)
  t1 = _mm_and_si128(t1, *mask);     // mask filter values we don't care about
  // Do +4 side
  t2 = _mm_set1_epi8(4);
  t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + hev(p1 - q1) + 4
  SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  t3 = t2;                           // save t2
  *q0 = _mm_subs_epi8(*q0, t2);      // q0 -= t2
  // Now do +3 side
  t2 = _mm_set1_epi8(3);
  t2 = _mm_adds_epi8(t1, t2);        // +3 instead of +4
  SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
  // Outer pixels: half of the saved +4 side value, rounded, where !hev only.
  t2 = _mm_set1_epi8(1);
  t3 = _mm_adds_epi8(t3, t2);        // saved (... + 4) >> 3 value, plus 1
  SIGNED_SHIFT_N(t3, 1);             // (((... + 4) >> 3) + 1) >> 1
  hev = _mm_andnot_si128(hev, t3);   // (~hev) & t3: keep t3 only where !hev
  *q1 = _mm_subs_epi8(*q1, hev);     // q1 -= t3 (where !hev)
  *p1 = _mm_adds_epi8(*p1, hev);     // p1 += t3 (where !hev)
  // unoffset
  *p0 = _mm_xor_si128(*p0, sign_bit);
  *q0 = _mm_xor_si128(*q0, sign_bit);
  *p1 = _mm_xor_si128(*p1, sign_bit);
  *q1 = _mm_xor_si128(*q1, sign_bit);
 }
 // Reads 8 rows across a vertical edge.
 //
 // TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
@@ -366,6 +424,40 @@ static void Load8x4(const uint8_t* b, int stride, __m128i* p, __m128i* q) {
  *q = _mm_unpackhi_epi32(t1, t2);
 }
 // Loads a 16-row by 4-column group of pixels and returns it transposed, as
 // four registers holding one column each (16 pixels per register).
 //
 //   r0, r8: start of the first and of the second group of 8 rows; each is
 //           handed to Load8x4 with the same stride. Callers visible in this
 //           file pass either r0 + 8 * stride or a pointer into another plane.
 //   stride: forwarded to Load8x4 (presumably the distance in bytes between
 //           two consecutive rows; Load8x4's body is not shown here).
 //   p1, p0, q0, q1: outputs, receiving columns 0, 1, 2 and 3 respectively
 //           (see the layout comments below).
 static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride,
                             __m128i* p1, __m128i* p0,
                             __m128i* q0, __m128i* q1) {
  __m128i t1, t2;
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
  //                10 11 | 12 13
  //                 ...  |  ...
  //                e0 e1 | e2 e3
  //                f0 f1 | f2 f3
  //
  // r0 is pointing to the 0th row (00)
  // r8 is pointing to the 8th row (80)
  // Load
  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
  Load8x4(r0, stride, p1, q0);
  Load8x4(r8, stride, p0, q1);
  // Keep copies of *p1 and *q0: each is overwritten by the first unpack of
  // its pair below while still being needed as input to the second one.
  t1 = *p1;
  t2 = *q0;
  // Combine the 64-bit halves so that each register holds a single column:
  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
  *p1 = _mm_unpacklo_epi64(t1, *p0);
  *p0 = _mm_unpackhi_epi64(t1, *p0);
  *q0 = _mm_unpacklo_epi64(t2, *q1);
  *q1 = _mm_unpackhi_epi64(t2, *q1);
 }
 static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
@@ -374,6 +466,44 @@ static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
  }
 }
 // Transpose back and store
 //
 // Takes four registers holding one pixel column each (p1 = column 0,
 // p0 = column 1, q0 = column 2, q1 = column 3; 16 rows per register),
 // interleaves them back into row order and writes them out with Store4x4.
 //
 //   r0, r8: destination of the first and of the second group of 8 rows. Each
 //           group is written by two Store4x4 calls, 4 * stride apart.
 //   stride: forwarded to Store4x4 and used to step between the two calls.
 //   p1, p0, q0, q1: columns to store. NOTE: all four are used as scratch and
 //           are overwritten with the interleaved data.
 static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1,
                              __m128i* p0, __m128i* q0, __m128i* q1) {
  __m128i t1;
  // Interleave columns 0 and 1 byte-wise:
  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
  t1 = *p0;
  *p0 = _mm_unpacklo_epi8(*p1, t1);
  *p1 = _mm_unpackhi_epi8(*p1, t1);
  // Interleave columns 2 and 3 byte-wise:
  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
  t1 = *q0;
  *q0 = _mm_unpacklo_epi8(t1, *q1);
  *q1 = _mm_unpackhi_epi8(t1, *q1);
  // Interleave the 16-bit pairs to get four 4-pixel rows per register:
  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
  t1 = *p0;
  *p0 = _mm_unpacklo_epi16(t1, *q0);
  *q0 = _mm_unpackhi_epi16(t1, *q0);
  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
  t1 = *p1;
  *p1 = _mm_unpacklo_epi16(t1, *q1);
  *q1 = _mm_unpackhi_epi16(t1, *q1);
  // Rows 0-3 and 4-7 go to r0, rows 8-b and c-f go to r8.
  Store4x4(p0, r0, stride);
  r0 += 4 * stride;
  Store4x4(q0, r0, stride);
  Store4x4(p1, r8, stride);
  r8 += 4 * stride;
  Store4x4(q1, r8, stride);
 }
 //-----------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
@@ -392,69 +522,13 @@ static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
 }
 static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
  __m128i t1, t2;
  __m128i p1, p0, q0, q1;
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
  //                10 11 | 12 13
  //                 ...  |  ...
  //                e0 e1 | e2 e3
  //                f0 f1 | f2 f3
  p -= 2;  // beginning of the first segment
-  // Load
+  p -= 2;  // beginning of p1
  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
  Load8x4(p, stride, &p1, &q0);
  Load8x4(p + 8 * stride, stride, &p0, &q1);
-  t1 = p1;
+  Load16x4(p, p + 8 * stride,  stride, &p1, &p0, &q0, &q1);
  t2 = q0;
  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
  p1 = _mm_unpacklo_epi64(p1, p0);
  p0 = _mm_unpackhi_epi64(t1, p0);
  q0 = _mm_unpacklo_epi64(q0, q1);
  q1 = _mm_unpackhi_epi64(t2, q1);
  // Filter
  DoFilter2(&p1, &p0, &q0, &q1, thresh);
-  t1 = p0;
+  Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
  t2 = q0;
  // Transpose back to write out
  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
  p0 = _mm_unpacklo_epi8(p1, t1);
  p1 = _mm_unpackhi_epi8(p1, t1);
  q0 = _mm_unpacklo_epi8(t2, q1);
  q1 = _mm_unpackhi_epi8(t2, q1);
  t1 = p0;
  t2 = p1;
  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
  p0 = _mm_unpacklo_epi16(p0, q0);
  q0 = _mm_unpackhi_epi16(t1, q0);
  p1 = _mm_unpacklo_epi16(p1, q1);
  q1 = _mm_unpackhi_epi16(t2, q1);
  // Store
  Store4x4(&p0, p, stride);
  p += 4 * stride;
  Store4x4(&q0, p, stride);
  p += 4 * stride;
  Store4x4(&p1, p, stride);
  p += 4 * stride;
  Store4x4(&q1, p, stride);
 }
 static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
@@ -473,11 +547,163 @@ static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
  }
 }
 //-----------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 // Helper macros for the complex filters. Each expands to a braced block and
 // expands its arguments several times without extra parentheses, so pass
 // plain variables only (the one exception is noted on LOADUV / STOREUV).
 //
 // MAX_DIFF1: sets m to the per-byte maximum of abs(p3 - p2), abs(p2 - p1) and
 // abs(p1 - p0). Any previous content of m is discarded.
 #define MAX_DIFF1(p3, p2, p1, p0, m) {                                  \
  m = MM_ABS(p3, p2);                                                   \
  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                  \
  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                  \
 }
 // MAX_DIFF2: same as MAX_DIFF1, but accumulates into the existing value of m
 // (m must already be initialized, e.g. by MAX_DIFF1).
 #define MAX_DIFF2(p3, p2, p1, p0, m) {                                  \
  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                  \
  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                  \
  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                  \
 }
 // LOADUV: loads 8 bytes from &u[stride] into the low half of p and 8 bytes
 // from &v[stride] into its high half. Despite its name, 'stride' is used as
 // the complete byte offset into u and v (callers pass e.g. -2 * stride).
 #define LOADUV(p, u, v, stride) {                                       \
  p = _mm_loadl_epi64((__m128i*)&u[(stride)]);                          \
  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&v[(stride)]));   \
 }
 // STOREUV: stores the low 8 bytes of p to &u[stride] and the high 8 bytes to
 // &v[stride]. NOTE: p is modified (its high half is copied over its low half).
 #define STOREUV(p, u, v, stride) {                                      \
  _mm_storel_epi64((__m128i*)&u[(stride)], p);                          \
  p = _mm_unpackhi_epi64(p, p);                                         \
  _mm_storel_epi64((__m128i*)&v[(stride)], p);                          \
 }
 // COMPLEX_FL_MASK: turns 'mask' (per-byte maximum differences, as produced by
 // MAX_DIFF1 / MAX_DIFF2) into the final filtering mask:
 //   mask = (mask <= it) AND NeedsFilter(p1, p0, q0, q1, t)
 // with 0xff in the lanes that pass both tests. 'it' must be a __m128i
 // variable holding the limit in every byte; it is overwritten with the
 // result of NeedsFilter().
 #define COMPLEX_FL_MASK(p1, p0, q0, q1, t, it, mask) {                  \
  mask = _mm_subs_epu8(mask, it);                                       \
  mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128());                     \
  NeedsFilter(&p1, &p0, &q0, &q1, t, &it);                              \
  mask = _mm_and_si128(mask, it);                                       \
 }
 // Complex filter applied across three horizontal edges, 16 pixels wide.
 //
 // p is advanced by 4 rows before each pass, so the filtered edges lie 4, 8
 // and 12 rows below the incoming p (presumably the top-left of a 16x16
 // block; confirm against the caller). Each pass reads the 4 rows above and
 // the 4 rows below the edge, and rewrites the 2 rows on each side of it.
 //
 //   p:          pixel pointer, see above.
 //   stride:     distance in bytes between two consecutive rows.
 //   thresh:     threshold forwarded to NeedsFilter() via COMPLEX_FL_MASK.
 //   ithresh:    limit on the differences between neighbouring pixels on the
 //               same side of the edge (p3..p0 and q0..q3).
 //   hev_thresh: threshold forwarded to DoFilter4().
 static void VFilter16iSSE2(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    // Load
    // (t1 and t2 are scratch: they hold p2/p3 first, then q2/q3, and finally
    // t1 holds the broadcast ithresh.)
    t2 = _mm_loadu_si128((__m128i*)&p[-4 * stride]);    // p3
    t1 = _mm_loadu_si128((__m128i*)&p[-3 * stride]);    // p2
    p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);    // p1
    p0 = _mm_loadu_si128((__m128i*)&p[-1 * stride]);    // p0
    MAX_DIFF1(t2, t1, p1, p0, mask);
    q0 = _mm_loadu_si128((__m128i*)&p[0 * stride]);     // q0
    q1 = _mm_loadu_si128((__m128i*)&p[1 * stride]);     // q1
    t1 = _mm_loadu_si128((__m128i*)&p[2 * stride]);     // q2
    t2 = _mm_loadu_si128((__m128i*)&p[3 * stride]);     // q3
    MAX_DIFF2(t2, t1, q1, q0, mask);
    t1 = _mm_set1_epi8(ithresh);
    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
    // Store
    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
  }
 }
 // Complex filter applied across one horizontal edge of the u and v planes,
 // 8 pixels wide in each plane.
 //
 // Both planes are processed together: every register holds 8 pixels of u in
 // its low half and 8 pixels of v in its high half (see LOADUV / STOREUV).
 // The filtered edge lies 4 rows below the incoming u and v pointers; the 4
 // rows on each side are read and the 2 rows on each side are rewritten.
 //
 //   u, v:       pixel pointers into the two planes, see above.
 //   stride:     distance in bytes between two consecutive rows (the same
 //               value is used for both planes).
 //   thresh:     threshold forwarded to NeedsFilter() via COMPLEX_FL_MASK.
 //   ithresh:    limit on the differences between neighbouring pixels on the
 //               same side of the edge.
 //   hev_thresh: threshold forwarded to DoFilter4().
 static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
  u += 4 * stride;
  v += 4 * stride;
  // Load
  LOADUV(t2, u, v, -4 * stride);      // p3
  LOADUV(t1, u, v, -3 * stride);      // p2
  LOADUV(p1, u, v, -2 * stride);      // p1
  LOADUV(p0, u, v, -1 * stride);      // p0
  MAX_DIFF1(t2, t1, p1, p0, mask);
  LOADUV(q0, u, v, 0 * stride);       // q0
  LOADUV(q1, u, v, 1 * stride);       // q1
  LOADUV(t1, u, v, 2 * stride);       // q2
  LOADUV(t2, u, v, 3 * stride);       // q3
  MAX_DIFF2(t2, t1, q1, q0, mask);
  t1 = _mm_set1_epi8(ithresh);
  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
  // Store
  STOREUV(p1, u, v, -2 * stride);
  STOREUV(p0, u, v, -1 * stride);
  STOREUV(q0, u, v, 0 * stride);
  STOREUV(q1, u, v, 1 * stride);
 }
 // Complex filter applied across three vertical edges, 16 rows high.
 //
 // On each pass, the 4 columns starting at p are loaded as p3..p0 and the
 // next 4 columns as q0..q3, so the filtered edge lies 4 columns to the right
 // of p. p then advances by 4 columns: the edges are at columns 4, 8 and 12
 // relative to the incoming p. Only the 2 columns on each side of an edge
 // are written back.
 //
 //   p:          pixel pointer, see above.
 //   stride:     distance in bytes between two consecutive rows.
 //   thresh:     threshold forwarded to NeedsFilter() via COMPLEX_FL_MASK.
 //   ithresh:    limit on the differences between neighbouring pixels on the
 //               same side of the edge.
 //   hev_thresh: threshold forwarded to DoFilter4().
 static void HFilter16iSSE2(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  int k;
  uint8_t* b;
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
  for (k = 3; k > 0; --k) {
    b = p;
    Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0);  // p3, p2, p1, p0
    MAX_DIFF1(t2, t1, p1, p0, mask);
    b += 4;  // beginning of q0
    Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
    MAX_DIFF2(t2, t1, q1, q0, mask);
    t1 = _mm_set1_epi8(ithresh);
    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
    b -= 2;  // beginning of p1
    Store16x4(b, b + 8 * stride, stride, &p1, &p0, &q0, &q1);
    p += 4;
  }
 }
 // Complex filter applied across one vertical edge of the u and v planes,
 // 8 rows high in each plane.
 //
 // Both planes are processed together by passing u as the first and v as the
 // second 8-row group of Load16x4 / Store16x4. The 4 columns starting at u
 // and v are loaded as p3..p0 and the next 4 as q0..q3, so the filtered edge
 // lies 4 columns to the right of the incoming pointers. Only the 2 columns
 // on each side of the edge are written back.
 //
 //   u, v:       pixel pointers into the two planes, see above.
 //   stride:     distance in bytes between two consecutive rows (the same
 //               value is used for both planes).
 //   thresh:     threshold forwarded to NeedsFilter() via COMPLEX_FL_MASK.
 //   ithresh:    limit on the differences between neighbouring pixels on the
 //               same side of the edge.
 //   hev_thresh: threshold forwarded to DoFilter4().
 static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
  MAX_DIFF1(t2, t1, p1, p0, mask);
  u += 4;  // beginning of q0
  v += 4;
  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
  MAX_DIFF2(t2, t1, q1, q0, mask);
  t1 = _mm_set1_epi8(ithresh);
  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
  u -= 2;  // beginning of p1
  v -= 2;
  Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
 }
 extern void VP8DspInitSSE2(void);
 void VP8DspInitSSE2(void) {
  VP8Transform = TransformSSE2;
  VP8HFilter16i = HFilter16iSSE2;
  VP8VFilter16i = VFilter16iSSE2;
  VP8VFilter8i = VFilter8iSSE2;
  VP8HFilter8i = HFilter8iSSE2;
  VP8SimpleVFilter16 = SimpleVFilter16SSE2;
  VP8SimpleHFilter16 = SimpleHFilter16SSE2;
  VP8SimpleVFilter16i = SimpleVFilter16iSSE2;