SSE2 version of the complex loop filter.

12–15% faster.
(Only the inner edges are implemented for now.)

Patch by Somnath Banerjee (somnath at google dot com)
This commit is contained in:
Pascal Massimino 2011-06-20 00:27:47 -07:00
parent 96ed9ce0fb
commit bd2f65f67c

View File

@ -245,8 +245,8 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
// Compute abs(p - q) = subs(p - q) OR subs(q - p)
#define MM_ABS(p, q) _mm_or_si128( \
_mm_subs_epu8(*(q), *(p)), \
_mm_subs_epu8(*(p), *(q)))
_mm_subs_epu8((q), (p)), \
_mm_subs_epu8((p), (q)))
// Shift each byte of "a" by N bits while preserving by the sign bit.
//
@ -264,23 +264,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
a = _mm_or_si128(t, a); \
}
static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
const __m128i* q1, int thresh, __m128i *mask) {
__m128i t1, t2;
__m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
*mask = _mm_set1_epi8(0xFE);
t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero
t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2
t1 = MM_ABS(p1, q1); // abs(p1 - q1)
t2 = _mm_set1_epi8(0xFE);
t1 = _mm_and_si128(t1, t2); // set lsb of each byte to zero
t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2
t2 = MM_ABS(p0, q0);
t2 = _mm_adds_epu8(t2, t2); // abs(p0 - q0) * 2
t2 = _mm_adds_epu8(t2, t1); // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
*mask = MM_ABS(*p0, *q0); // abs(p0 - q0)
*mask = _mm_adds_epu8(*mask, *mask); // abs(p0 - q0) * 2
*mask = _mm_adds_epu8(*mask, t1); // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
t1 = _mm_set1_epi8(thresh);
t2 = _mm_subs_epu8(t2, t1); // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > thresh
*mask = _mm_cmpeq_epi8(t2, _mm_setzero_si128());
*mask = _mm_subs_epu8(*mask, t1); // mask <= thresh
*mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
}
//-----------------------------------------------------------------------------
@ -308,21 +305,82 @@ static void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
// Do +4 side
t2 = _mm_set1_epi8(4);
t2 = _mm_adds_epi8(t2, t1); // 3 * (q0 - p0) + (p1 - q1) + 4
t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + (p1 - q1) + 4
SIGNED_SHIFT_N(t2, 3); // t2 >> 3
*q0 = _mm_subs_epi8(*q0, t2); // q0 -= a
*q0 = _mm_subs_epi8(*q0, t2); // q0 -= t2
// Now do +3 side
t2 = _mm_set1_epi8(3);
t2 = _mm_adds_epi8(t2, t1); // +3 instead of +4
t2 = _mm_adds_epi8(t1, t2); // +3 instead of +4
SIGNED_SHIFT_N(t2, 3); // t2 >> 3
*p0 = _mm_adds_epi8(*p0, t2); // p0 += b
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
// unoffset
*p0 = _mm_xor_si128(*p0, sign_bit);
*q0 = _mm_xor_si128(*q0, sign_bit);
}
// Applies the "normal" (complex) loop filter on p1, p0, q0 and q1.
// Each register holds 16 unsigned pixel bytes; only the byte lanes selected
// by *mask are modified. The outer pixels p1/q1 are only adjusted on lanes
// where the high-edge-variance (hev) test is false.
static void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1,
const __m128i* mask, int hev_thresh) {
__m128i t1, t2, t3;
__m128i hev = _mm_set1_epi8(hev_thresh);
const __m128i sign_bit = _mm_set1_epi8(0x80);
// compute hev mask: 0xff where abs(p1-p0) > hev_thresh or abs(q1-q0) > hev_thresh
t1 = MM_ABS(*p1, *p0);
t2 = MM_ABS(*q1, *q0);
t1 = _mm_subs_epu8(t1, hev); // abs(p1 - p0) - hev_thresh, saturated at 0
t2 = _mm_subs_epu8(t2, hev); // abs(q1 - q0) - hev_thresh, saturated at 0
hev = _mm_or_si128(t1, t2); // non-zero where either difference exceeds the threshold
t1 = _mm_setzero_si128();
hev = _mm_cmpeq_epi8(hev, t1); // 0xff where BOTH differences are <= hev_thresh
t1 = _mm_set1_epi16(0xffff); // load 0xff on all bytes
hev = _mm_xor_si128(hev, t1); // invert: 0xff exactly where the hev condition holds
// convert to signed values (bias by -128) so saturating epi8 math can be used
*p0 = _mm_xor_si128(*p0, sign_bit);
*q0 = _mm_xor_si128(*q0, sign_bit);
*p1 = _mm_xor_si128(*p1, sign_bit);
*q1 = _mm_xor_si128(*q1, sign_bit);
t1 = _mm_subs_epi8(*p1, *q1); // p1 - q1
t1 = _mm_and_si128(hev, t1); // hev(p1 - q1): (p1 - q1) kept only on hev lanes
t2 = _mm_subs_epi8(*q0, *p0); // q0 - p0
t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0)
t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0)
t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0)
t1 = _mm_and_si128(t1, *mask); // mask filter values we don't care about
// Do +4 side
t2 = _mm_set1_epi8(4);
t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + hev(p1 - q1) + 4
SIGNED_SHIFT_N(t2, 3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
t3 = t2; // save t2 for the outer-tap (p1/q1) adjustment below
*q0 = _mm_subs_epi8(*q0, t2); // q0 -= t2
// Now do +3 side
t2 = _mm_set1_epi8(3);
t2 = _mm_adds_epi8(t1, t2); // +3 instead of +4
SIGNED_SHIFT_N(t2, 3); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
// outer taps: a = (f1 + 1) >> 1, applied only where !hev
t2 = _mm_set1_epi8(1);
t3 = _mm_adds_epi8(t3, t2); // t3 + 1
SIGNED_SHIFT_N(t3, 1); // total shift of 4: (... + 4) >> 3, then (+1) >> 1
hev = _mm_andnot_si128(hev, t3); // keep the adjustment only on non-hev lanes
*q1 = _mm_subs_epi8(*q1, hev); // q1 -= t3
*p1 = _mm_adds_epi8(*p1, hev); // p1 += t3
// unoffset: convert back to unsigned pixel values
*p0 = _mm_xor_si128(*p0, sign_bit);
*q0 = _mm_xor_si128(*q0, sign_bit);
*p1 = _mm_xor_si128(*p1, sign_bit);
*q1 = _mm_xor_si128(*q1, sign_bit);
}
// Reads 8 rows across a vertical edge.
//
// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
@ -366,6 +424,40 @@ static void Load8x4(const uint8_t* b, int stride, __m128i* p, __m128i* q) {
*q = _mm_unpackhi_epi32(t1, t2);
}
// Loads a 16-row x 4-column block of pixels straddling a vertical edge and
// transposes it into four column registers p1, p0, q0, q1.
// r0 points at row 0 and r8 at row 8 of the strip; each row contributes
// 4 consecutive bytes (two on each side of the edge).
static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride,
                            __m128i* p1, __m128i* p0,
                            __m128i* q0, __m128i* q1) {
  __m128i top, bot;
  // Pixel layout around the edge (|):
  //   00 01 | 02 03
  //   10 11 | 12 13
  //   ...   | ...
  //   e0 e1 | e2 e3
  //   f0 f1 | f2 f3
  //
  // After the two 8x4 transposing loads, each register holds two
  // interleaved half-columns:
  //   p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  //   q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  //   p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
  //   q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
  Load8x4(r0, stride, p1, q0);
  Load8x4(r8, stride, p0, q1);
  top = *p1;
  bot = *q0;
  // Recombine the 64-bit halves so each output register is one full column:
  //   p1 = column 0 (rows 0..f), p0 = column 1,
  //   q0 = column 2,             q1 = column 3
  *p1 = _mm_unpacklo_epi64(top, *p0);
  *p0 = _mm_unpackhi_epi64(top, *p0);
  *q0 = _mm_unpacklo_epi64(bot, *q1);
  *q1 = _mm_unpackhi_epi64(bot, *q1);
}
static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
int i;
for (i = 0; i < 4; ++i, dst += stride) {
@ -374,6 +466,44 @@ static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
}
}
// Transposes four column registers back into a 16-row x 4-column pixel
// block and stores it across the vertical edge: rows 0..7 starting at r0,
// rows 8..15 starting at r8 (inverse of Load16x4).
static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1,
                             __m128i* p0, __m128i* q0, __m128i* q1) {
  __m128i tmp;
  // Interleave the two left columns byte-wise:
  //   p0 = 71 70 61 60 ... 11 10 01 00
  //   p1 = f1 f0 e1 e0 ... b1 b0 91 90 81 80
  tmp = *p0;
  *p0 = _mm_unpacklo_epi8(*p1, tmp);
  *p1 = _mm_unpackhi_epi8(*p1, tmp);
  // Interleave the two right columns byte-wise:
  //   q0 = 73 72 63 62 ... 13 12 03 02
  //   q1 = f3 f2 e3 e2 ... 93 92 83 82
  tmp = *q0;
  *q0 = _mm_unpacklo_epi8(tmp, *q1);
  *q1 = _mm_unpackhi_epi8(tmp, *q1);
  // Interleave 16-bit pairs so each register holds four complete rows:
  //   p0 = rows 0..3,  q0 = rows 4..7
  tmp = *p0;
  *p0 = _mm_unpacklo_epi16(tmp, *q0);
  *q0 = _mm_unpackhi_epi16(tmp, *q0);
  //   p1 = rows 8..11, q1 = rows 12..15
  tmp = *p1;
  *p1 = _mm_unpacklo_epi16(tmp, *q1);
  *q1 = _mm_unpackhi_epi16(tmp, *q1);
  Store4x4(p0, r0, stride);
  Store4x4(q0, r0 + 4 * stride, stride);
  Store4x4(p1, r8, stride);
  Store4x4(q1, r8 + 4 * stride, stride);
}
//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
@ -392,69 +522,13 @@ static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
}
static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
__m128i t1, t2;
__m128i p1, p0, q0, q1;
// Assume the pixels around the edge (|) are numbered as follows
// 00 01 | 02 03
// 10 11 | 12 13
// ... | ...
// e0 e1 | e2 e3
// f0 f1 | f2 f3
p -= 2; // beginning of the first segment
// Load
// p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
// q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
// p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
// q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
Load8x4(p, stride, &p1, &q0);
Load8x4(p + 8 * stride, stride, &p0, &q1);
p -= 2; // beginning of p1
t1 = p1;
t2 = q0;
// p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
// p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
// q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
// q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
p1 = _mm_unpacklo_epi64(p1, p0);
p0 = _mm_unpackhi_epi64(t1, p0);
q0 = _mm_unpacklo_epi64(q0, q1);
q1 = _mm_unpackhi_epi64(t2, q1);
// Filter
Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
DoFilter2(&p1, &p0, &q0, &q1, thresh);
t1 = p0;
t2 = q0;
// Transpose back to write out
// p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
// p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
// q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
// q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
p0 = _mm_unpacklo_epi8(p1, t1);
p1 = _mm_unpackhi_epi8(p1, t1);
q0 = _mm_unpacklo_epi8(t2, q1);
q1 = _mm_unpackhi_epi8(t2, q1);
t1 = p0;
t2 = p1;
// p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
// q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
// p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
// q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
p0 = _mm_unpacklo_epi16(p0, q0);
q0 = _mm_unpackhi_epi16(t1, q0);
p1 = _mm_unpacklo_epi16(p1, q1);
q1 = _mm_unpackhi_epi16(t2, q1);
// Store
Store4x4(&p0, p, stride);
p += 4 * stride;
Store4x4(&q0, p, stride);
p += 4 * stride;
Store4x4(&p1, p, stride);
p += 4 * stride;
Store4x4(&q1, p, stride);
Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
}
static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
@ -473,11 +547,163 @@ static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
}
}
//-----------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
// m = max(abs(p3 - p2), abs(p2 - p1), abs(p1 - p0))
// All macros below use do { } while (0) so they expand as a single
// statement and remain safe inside an unbraced if/else.
#define MAX_DIFF1(p3, p2, p1, p0, m) do { \
  m = MM_ABS(p3, p2); \
  m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
  m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
} while (0)

// m = max(m, abs(p3 - p2), abs(p2 - p1), abs(p1 - p0))
#define MAX_DIFF2(p3, p2, p1, p0, m) do { \
  m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
  m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
  m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
} while (0)

// Loads 8 bytes of u and 8 bytes of v (at row offset 'stride') into the
// low/high 64-bit halves of p, so both chroma planes are filtered at once.
#define LOADUV(p, u, v, stride) do { \
  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)])); \
} while (0)

// Stores the low half of p to u and the high half to v (at row offset
// 'stride'). Note: clobbers p.
#define STOREUV(p, u, v, stride) do { \
  _mm_storel_epi64((__m128i*)&(u)[(stride)], p); \
  p = _mm_unpackhi_epi64(p, p); \
  _mm_storel_epi64((__m128i*)&(v)[(stride)], p); \
} while (0)

// Builds the final filter mask: 0xff where the accumulated interior
// difference ('mask' on entry) is <= it AND NeedsFilter() passes with
// threshold t. 'it' is clobbered (reused as NeedsFilter's output).
#define COMPLEX_FL_MASK(p1, p0, q0, q1, t, it, mask) do { \
  mask = _mm_subs_epu8(mask, it); \
  mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128()); \
  NeedsFilter(&(p1), &(p0), &(q0), &(q1), t, &(it)); \
  mask = _mm_and_si128(mask, it); \
} while (0)
// Applies the complex in-loop filter across the three inner horizontal
// edges (rows 4, 8 and 12) of a 16x16 luma macroblock.
static void VFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i mask;
  __m128i a, b, p1, p0, q0, q1;
  for (k = 0; k < 3; ++k) {
    p += 4 * stride;   // advance to the next inner edge
    // Load the 4 rows on each side of the edge, accumulating the maximum
    // neighboring-pixel difference in 'mask' as we go.
    b = _mm_loadu_si128((__m128i*)&p[-4 * stride]);   // p3
    a = _mm_loadu_si128((__m128i*)&p[-3 * stride]);   // p2
    p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);  // p1
    p0 = _mm_loadu_si128((__m128i*)&p[-1 * stride]);  // p0
    MAX_DIFF1(b, a, p1, p0, mask);
    q0 = _mm_loadu_si128((__m128i*)&p[0 * stride]);   // q0
    q1 = _mm_loadu_si128((__m128i*)&p[1 * stride]);   // q1
    a = _mm_loadu_si128((__m128i*)&p[2 * stride]);    // q2
    b = _mm_loadu_si128((__m128i*)&p[3 * stride]);    // q3
    MAX_DIFF2(b, a, q1, q0, mask);
    a = _mm_set1_epi8(ithresh);
    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, a, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
    // Write the four filtered rows back.
    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
  }
}
// Applies the complex in-loop filter across the single inner horizontal
// edge (row 4) of the 8x8 chroma blocks. The u and v planes are filtered
// together: LOADUV packs 8 bytes of each plane into one 16-byte register.
static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
u += 4 * stride;  // move both planes to the inner edge
v += 4 * stride;
// Load the 4 rows on each side of the edge, accumulating the maximum
// neighboring-pixel difference in 'mask'.
LOADUV(t2, u, v, -4 * stride); // p3
LOADUV(t1, u, v, -3 * stride); // p2
LOADUV(p1, u, v, -2 * stride); // p1
LOADUV(p0, u, v, -1 * stride); // p0
MAX_DIFF1(t2, t1, p1, p0, mask);
LOADUV(q0, u, v, 0 * stride); // q0
LOADUV(q1, u, v, 1 * stride); // q1
LOADUV(t1, u, v, 2 * stride); // q2
LOADUV(t2, u, v, 3 * stride); // q3
MAX_DIFF2(t2, t1, q1, q0, mask);
t1 = _mm_set1_epi8(ithresh);
COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
// Store the filtered rows back to both planes.
STOREUV(p1, u, v, -2 * stride);
STOREUV(p0, u, v, -1 * stride);
STOREUV(q0, u, v, 0 * stride);
STOREUV(q1, u, v, 1 * stride);
}
// Applies the complex in-loop filter across the three inner vertical
// edges (columns 4, 8 and 12) of a 16x16 luma macroblock. Columns are
// gathered/scattered via the transposing Load16x4/Store16x4 helpers.
static void HFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  int k;
  uint8_t* ptr;
  __m128i mask;
  __m128i a, b, p1, p0, q0, q1;
  for (k = 0; k < 3; ++k) {
    ptr = p;
    // Columns left of the edge: p3, p2, p1, p0.
    Load16x4(ptr, ptr + 8 * stride, stride, &b, &a, &p1, &p0);
    MAX_DIFF1(b, a, p1, p0, mask);
    ptr += 4;                    // beginning of q0
    // Columns right of the edge: q0, q1, q2, q3.
    Load16x4(ptr, ptr + 8 * stride, stride, &q0, &q1, &a, &b);
    MAX_DIFF2(b, a, q1, q0, mask);
    a = _mm_set1_epi8(ithresh);
    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, a, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
    ptr -= 2;                    // beginning of p1
    Store16x4(ptr, ptr + 8 * stride, stride, &p1, &p0, &q0, &q1);
    p += 4;                      // advance to the next inner edge
  }
}
// Applies the complex in-loop filter across the single inner vertical
// edge (column 4) of the 8x8 chroma blocks. Load16x4 transposes 8 rows
// of u plus 8 rows of v into shared column registers so both planes are
// filtered at once.
static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
MAX_DIFF1(t2, t1, p1, p0, mask);
u += 4; // beginning of q0
v += 4;
Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
MAX_DIFF2(t2, t1, q1, q0, mask);
t1 = _mm_set1_epi8(ithresh);
COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
u -= 2; // beginning of p1
v -= 2;
Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
}
extern void VP8DspInitSSE2(void);
void VP8DspInitSSE2(void) {
VP8Transform = TransformSSE2;
VP8HFilter16i = HFilter16iSSE2;
VP8VFilter16i = VFilter16iSSE2;
VP8VFilter8i = VFilter8iSSE2;
VP8HFilter8i = HFilter8iSSE2;
VP8SimpleVFilter16 = SimpleVFilter16SSE2;
VP8SimpleHFilter16 = SimpleHFilter16SSE2;
VP8SimpleVFilter16i = SimpleVFilter16iSSE2;