diff --git a/src/dec/dsp_sse2.c b/src/dec/dsp_sse2.c
index b7427fdf..82278bc3 100644
--- a/src/dec/dsp_sse2.c
+++ b/src/dec/dsp_sse2.c
@@ -245,8 +245,8 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
 
 // Compute abs(p - q) = subs(p - q) OR subs(q - p)
 #define MM_ABS(p, q)  _mm_or_si128(                    \
-    _mm_subs_epu8(*(q), *(p)),                         \
-    _mm_subs_epu8(*(p), *(q)))
+    _mm_subs_epu8((q), (p)),                           \
+    _mm_subs_epu8((p), (q)))
 
 // Shift each byte of "a" by N bits while preserving the sign bit.
 //
@@ -264,23 +264,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
   a = _mm_or_si128(t, a);                              \
 }
 
-
 static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
                         const __m128i* q1, int thresh, __m128i *mask) {
-  __m128i t1, t2;
+  __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
+  *mask = _mm_set1_epi8(0xFE);
+  t1 = _mm_and_si128(t1, *mask);        // set lsb of each byte to zero
+  t1 = _mm_srli_epi16(t1, 1);           // abs(p1 - q1) / 2
 
-  t1 = MM_ABS(p1, q1);          // abs(p1 - q1)
-  t2 = _mm_set1_epi8(0xFE);
-  t1 = _mm_and_si128(t1, t2);   // set lsb of each byte to zero
-  t1 = _mm_srli_epi16(t1, 1);   // abs(p1 - q1) / 2
-
-  t2 = MM_ABS(p0, q0);
-  t2 = _mm_adds_epu8(t2, t2);   // abs(p0 - q0) * 2
-  t2 = _mm_adds_epu8(t2, t1);   // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+  *mask = MM_ABS(*p0, *q0);             // abs(p0 - q0)
+  *mask = _mm_adds_epu8(*mask, *mask);  // abs(p0 - q0) * 2
+  *mask = _mm_adds_epu8(*mask, t1);     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
 
   t1 = _mm_set1_epi8(thresh);
-  t2 = _mm_subs_epu8(t2, t1);   // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > thresh
-  *mask = _mm_cmpeq_epi8(t2, _mm_setzero_si128());
+  *mask = _mm_subs_epu8(*mask, t1);     // saturates to zero iff mask <= thresh
+  *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
 }
 
 //-----------------------------------------------------------------------------
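The MM_ABS change (value parameters instead of pointers) lets the new macros below pass __m128i registers directly. NeedsFilter itself is the branchless form of the simple-filter test from Paragraph 15.2: _mm_subs_epu8 saturates to zero exactly when the accumulated score stays within thresh, and _mm_cmpeq_epi8 against zero turns that into a 0xFF/0x00 per-byte mask. The 0xFE mask is needed because _mm_srli_epi16 shifts whole 16-bit lanes, so without first clearing each byte's lsb, the low bit of each odd byte would leak into the top bit of its even neighbor. For reference, a scalar model of the same predicate (helper name is illustrative, not part of the patch):

    #include <stdint.h>
    #include <stdlib.h>

    // Scalar equivalent of the SSE2 mask: 0xFF means "filter this pixel".
    static uint8_t NeedsFilterScalar(uint8_t p1, uint8_t p0,
                                     uint8_t q0, uint8_t q1, int thresh) {
      const int score = 2 * abs(p0 - q0) + abs(p1 - q1) / 2;
      return (score <= thresh) ? 0xFF : 0x00;
    }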
@@ -308,21 +305,82 @@ static void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
 
   // Do +4 side
   t2 = _mm_set1_epi8(4);
-  t2 = _mm_adds_epi8(t2, t1);      // 3 * (q0 - p0) + (p1 - q1) + 4
+  t2 = _mm_adds_epi8(t1, t2);      // 3 * (q0 - p0) + (p1 - q1) + 4
   SIGNED_SHIFT_N(t2, 3);           // t2 >> 3
-  *q0 = _mm_subs_epi8(*q0, t2);    // q0 -= a
+  *q0 = _mm_subs_epi8(*q0, t2);    // q0 -= t2
 
   // Now do +3 side
   t2 = _mm_set1_epi8(3);
-  t2 = _mm_adds_epi8(t2, t1);      // +3 instead of +4
+  t2 = _mm_adds_epi8(t1, t2);      // +3 instead of +4
   SIGNED_SHIFT_N(t2, 3);           // t2 >> 3
-  *p0 = _mm_adds_epi8(*p0, t2);    // p0 += b
+  *p0 = _mm_adds_epi8(*p0, t2);    // p0 += t2
 
   // unoffset
   *p0 = _mm_xor_si128(*p0, sign_bit);
   *q0 = _mm_xor_si128(*q0, sign_bit);
 }
 
+// Applies filter on p1, p0, q0 and q1
+static void DoFilter4(__m128i* p1, __m128i* p0, __m128i* q0, __m128i* q1,
+                      const __m128i* mask, int hev_thresh) {
+  __m128i t1, t2, t3;
+  __m128i hev = _mm_set1_epi8(hev_thresh);
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+
+  // compute hev mask
+  t1 = MM_ABS(*p1, *p0);
+  t2 = MM_ABS(*q1, *q0);
+  t1 = _mm_subs_epu8(t1, hev);     // abs(p1 - p0) - hev_thresh
+  t2 = _mm_subs_epu8(t2, hev);     // abs(q1 - q0) - hev_thresh
+
+  hev = _mm_or_si128(t1, t2);      // nonzero iff either diff exceeds hev_thresh
+  t1 = _mm_setzero_si128();
+  hev = _mm_cmpeq_epi8(hev, t1);
+  t1 = _mm_set1_epi16(0xffff);     // load 0xff on all bytes
+  hev = _mm_xor_si128(hev, t1);    // 0xff iff abs(p1-p0) > hev_thresh || abs(q1-q0) > hev_thresh
+
+  // convert to signed values
+  *p0 = _mm_xor_si128(*p0, sign_bit);
+  *q0 = _mm_xor_si128(*q0, sign_bit);
+  *p1 = _mm_xor_si128(*p1, sign_bit);
+  *q1 = _mm_xor_si128(*q1, sign_bit);
+
+  t1 = _mm_subs_epi8(*p1, *q1);    // p1 - q1
+  t1 = _mm_and_si128(hev, t1);     // hev(p1 - q1)
+  t2 = _mm_subs_epi8(*q0, *p0);    // q0 - p0
+  t1 = _mm_adds_epi8(t1, t2);      // hev(p1 - q1) + 1 * (q0 - p0)
+  t1 = _mm_adds_epi8(t1, t2);      // hev(p1 - q1) + 2 * (q0 - p0)
+  t1 = _mm_adds_epi8(t1, t2);      // hev(p1 - q1) + 3 * (q0 - p0)
+  t1 = _mm_and_si128(t1, *mask);   // mask filter values we don't care about
+
+  // Do +4 side
+  t2 = _mm_set1_epi8(4);
+  t2 = _mm_adds_epi8(t1, t2);      // 3 * (q0 - p0) + hev(p1 - q1) + 4
+  SIGNED_SHIFT_N(t2, 3);           // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
+  t3 = t2;                         // save t2
+  *q0 = _mm_subs_epi8(*q0, t2);    // q0 -= t2
+
+  // Now do +3 side
+  t2 = _mm_set1_epi8(3);
+  t2 = _mm_adds_epi8(t1, t2);      // +3 instead of +4
+  SIGNED_SHIFT_N(t2, 3);           // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+  *p0 = _mm_adds_epi8(*p0, t2);    // p0 += t2
+
+  t2 = _mm_set1_epi8(1);
+  t3 = _mm_adds_epi8(t3, t2);
+  SIGNED_SHIFT_N(t3, 1);           // (((3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3) + 1) >> 1
+
+  hev = _mm_andnot_si128(hev, t3); // t3 where hev is off, 0 elsewhere
+  *q1 = _mm_subs_epi8(*q1, hev);   // q1 -= t3 (only if !hev)
+  *p1 = _mm_adds_epi8(*p1, hev);   // p1 += t3 (only if !hev)
+
+  // unoffset
+  *p0 = _mm_xor_si128(*p0, sign_bit);
+  *q0 = _mm_xor_si128(*q0, sign_bit);
+  *p1 = _mm_xor_si128(*p1, sign_bit);
+  *q1 = _mm_xor_si128(*q1, sign_bit);
+}
+
 // Reads 8 rows across a vertical edge.
 //
 // TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
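DoFilter4 is the 4-tap inner-edge filter: the hev ("high edge variance") mask decides per column whether the raw p1 - q1 term feeds the delta and only p0/q0 move (hev on), or whether p1/q1 also get nudged by the halved delta (hev off). The saturating subtract, cmpeq-with-zero, and xor-with-0xFF above compute that mask without branches; in scalar form it reduces to (helper name is illustrative, not from the patch):

    #include <stdint.h>
    #include <stdlib.h>

    // Scalar equivalent of the hev mask built with subs/cmpeq/xor above.
    static uint8_t HevMaskScalar(uint8_t p1, uint8_t p0,
                                 uint8_t q0, uint8_t q1, int hev_thresh) {
      const int hev = abs(p1 - p0) > hev_thresh || abs(q1 - q0) > hev_thresh;
      return hev ? 0xFF : 0x00;
    }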
@@ -366,6 +424,40 @@ static void Load8x4(const uint8_t* b, int stride, __m128i* p, __m128i* q) {
   *q = _mm_unpackhi_epi32(t1, t2);
 }
 
+static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride,
+                            __m128i* p1, __m128i* p0,
+                            __m128i* q0, __m128i* q1) {
+  __m128i t1, t2;
+  // Assume the pixels around the edge (|) are numbered as follows
+  //                00 01 | 02 03
+  //                10 11 | 12 13
+  //                 ...  |  ...
+  //                e0 e1 | e2 e3
+  //                f0 f1 | f2 f3
+  //
+  // r0 is pointing to the 0th row (00)
+  // r8 is pointing to the 8th row (80)
+
+  // Load
+  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+  Load8x4(r0, stride, p1, q0);
+  Load8x4(r8, stride, p0, q1);
+
+  t1 = *p1;
+  t2 = *q0;
+  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+  *p1 = _mm_unpacklo_epi64(t1, *p0);
+  *p0 = _mm_unpackhi_epi64(t1, *p0);
+  *q0 = _mm_unpacklo_epi64(t2, *q1);
+  *q1 = _mm_unpackhi_epi64(t2, *q1);
+}
+
 static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
@@ -374,6 +466,44 @@
   }
 }
 
+// Transpose back and store
+static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1,
+                             __m128i* p0, __m128i* q0, __m128i* q1) {
+  __m128i t1;
+
+  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+  t1 = *p0;
+  *p0 = _mm_unpacklo_epi8(*p1, t1);
+  *p1 = _mm_unpackhi_epi8(*p1, t1);
+
+  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+  t1 = *q0;
+  *q0 = _mm_unpacklo_epi8(t1, *q1);
+  *q1 = _mm_unpackhi_epi8(t1, *q1);
+
+  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+  t1 = *p0;
+  *p0 = _mm_unpacklo_epi16(t1, *q0);
+  *q0 = _mm_unpackhi_epi16(t1, *q0);
+
+  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+  t1 = *p1;
+  *p1 = _mm_unpacklo_epi16(t1, *q1);
+  *q1 = _mm_unpackhi_epi16(t1, *q1);
+
+  Store4x4(p0, r0, stride);
+  r0 += 4 * stride;
+  Store4x4(q0, r0, stride);
+
+  Store4x4(p1, r8, stride);
+  r8 += 4 * stride;
+  Store4x4(q1, r8, stride);
+}
+
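Load16x4 and Store16x4 are a 16x4 byte transpose and its inverse: the four pixel columns straddling a vertical edge end up as the four 16-lane registers p1, p0, q0, q1, get filtered by exactly the same code as the horizontal case, and are scattered back. A scalar model of the gather half (helper name is illustrative, not part of the patch):

    #include <stdint.h>

    // Scalar equivalent of Load16x4: lane i of each output array holds one
    // column of row i, matching the register layout shown in the comments.
    static void Load16x4Scalar(const uint8_t* r0, const uint8_t* r8, int stride,
                               uint8_t p1[16], uint8_t p0[16],
                               uint8_t q0[16], uint8_t q1[16]) {
      for (int i = 0; i < 8; ++i) {
        p1[i]     = r0[i * stride + 0];  p0[i]     = r0[i * stride + 1];
        q0[i]     = r0[i * stride + 2];  q1[i]     = r0[i * stride + 3];
        p1[8 + i] = r8[i * stride + 0];  p0[8 + i] = r8[i * stride + 1];
        q0[8 + i] = r8[i * stride + 2];  q1[8 + i] = r8[i * stride + 3];
      }
    }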
 //-----------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
@@ -392,69 +522,13 @@ static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
 }
 
 static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
-  __m128i t1, t2;
   __m128i p1, p0, q0, q1;
 
-  // Assume the pixels around the edge (|) are numbered as follows
-  //                00 01 | 02 03
-  //                10 11 | 12 13
-  //                 ...  |  ...
-  //                e0 e1 | e2 e3
-  //                f0 f1 | f2 f3
-  p -= 2;  // beginning of the first segment
-
-  // Load
-  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-  Load8x4(p, stride, &p1, &q0);
-  Load8x4(p + 8 * stride, stride, &p0, &q1);
+  p -= 2;  // beginning of p1
 
-  t1 = p1;
-  t2 = q0;
-  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-  p1 = _mm_unpacklo_epi64(p1, p0);
-  p0 = _mm_unpackhi_epi64(t1, p0);
-  q0 = _mm_unpacklo_epi64(q0, q1);
-  q1 = _mm_unpackhi_epi64(t2, q1);
-
-  // Filter
+  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
   DoFilter2(&p1, &p0, &q0, &q1, thresh);
-
-  t1 = p0;
-  t2 = q0;
-
-  // Transpose back to write out
-  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
-  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-  p0 = _mm_unpacklo_epi8(p1, t1);
-  p1 = _mm_unpackhi_epi8(p1, t1);
-  q0 = _mm_unpacklo_epi8(t2, q1);
-  q1 = _mm_unpackhi_epi8(t2, q1);
-
-  t1 = p0;
-  t2 = p1;
-  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
-  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-  p0 = _mm_unpacklo_epi16(p0, q0);
-  q0 = _mm_unpackhi_epi16(t1, q0);
-  p1 = _mm_unpacklo_epi16(p1, q1);
-  q1 = _mm_unpackhi_epi16(t2, q1);
-
-  // Store
-  Store4x4(&p0, p, stride);
-  p += 4 * stride;
-  Store4x4(&q0, p, stride);
-  p += 4 * stride;
-  Store4x4(&p1, p, stride);
-  p += 4 * stride;
-  Store4x4(&q1, p, stride);
+  Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
 }
 
 static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
@@ -473,11 +547,163 @@ static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
   }
 }
 
+//-----------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+#define MAX_DIFF1(p3, p2, p1, p0, m) {                 \
+  m = MM_ABS(p3, p2);                                  \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                 \
+  m = _mm_max_epu8(m, MM_ABS(p1, p0));                 \
+}
+
+#define MAX_DIFF2(p3, p2, p1, p0, m) {                 \
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));                 \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                 \
+  m = _mm_max_epu8(m, MM_ABS(p1, p0));                 \
+}
+
+#define LOADUV(p, u, v, stride) {                                        \
+  p = _mm_loadl_epi64((__m128i*)&u[(stride)]);                           \
+  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&v[(stride)]));    \
+}
+
+#define STOREUV(p, u, v, stride) {                     \
+  _mm_storel_epi64((__m128i*)&u[(stride)], p);         \
+  p = _mm_unpackhi_epi64(p, p);                        \
+  _mm_storel_epi64((__m128i*)&v[(stride)], p);         \
+}
+
+#define COMPLEX_FL_MASK(p1, p0, q0, q1, t, it, mask) { \
+  mask = _mm_subs_epu8(mask, it);                      \
+  mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128());    \
+  NeedsFilter(&p1, &p0, &q0, &q1, t, &it);             \
+  mask = _mm_and_si128(mask, it);                      \
+}
+
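A column is filtered only when every difference between neighboring pixels within four of the edge stays within ithresh and the NeedsFilter edge test against thresh also passes: MAX_DIFF1/MAX_DIFF2 accumulate the per-lane maximum difference on each side, and COMPLEX_FL_MASK folds the two tests into one byte mask. A scalar model of the combined predicate (array layout and helper name are illustrative, not from the patch):

    #include <stdint.h>
    #include <stdlib.h>

    // p[0..3] = p0..p3, q[0..3] = q0..q3: returns 1 iff the column passes
    // both the interior (ithresh) and the edge (thresh) tests.
    static int ComplexFilterMaskScalar(const uint8_t p[4], const uint8_t q[4],
                                       int thresh, int ithresh) {
      const int interior_ok =
          abs(p[3] - p[2]) <= ithresh && abs(p[2] - p[1]) <= ithresh &&
          abs(p[1] - p[0]) <= ithresh && abs(q[3] - q[2]) <= ithresh &&
          abs(q[2] - q[1]) <= ithresh && abs(q[1] - q[0]) <= ithresh;
      const int edge_ok =
          2 * abs(p[0] - q[0]) + abs(p[1] - q[1]) / 2 <= thresh;
      return interior_ok && edge_ok;
    }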
+static void VFilter16iSSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
+  int k;
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+
+    // Load
+    t2 = _mm_loadu_si128((__m128i*)&p[-4 * stride]);  // p3
+    t1 = _mm_loadu_si128((__m128i*)&p[-3 * stride]);  // p2
+    p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);  // p1
+    p0 = _mm_loadu_si128((__m128i*)&p[-1 * stride]);  // p0
+    MAX_DIFF1(t2, t1, p1, p0, mask);
+
+    q0 = _mm_loadu_si128((__m128i*)&p[0 * stride]);   // q0
+    q1 = _mm_loadu_si128((__m128i*)&p[1 * stride]);   // q1
+    t1 = _mm_loadu_si128((__m128i*)&p[2 * stride]);   // q2
+    t2 = _mm_loadu_si128((__m128i*)&p[3 * stride]);   // q3
+    MAX_DIFF2(t2, t1, q1, q0, mask);
+
+    t1 = _mm_set1_epi8(ithresh);
+    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
+    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+    // Store
+    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
+    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
+    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
+    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+  }
+}
+
+static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+
+  u += 4 * stride;
+  v += 4 * stride;
+
+  // Load
+  LOADUV(t2, u, v, -4 * stride);  // p3
+  LOADUV(t1, u, v, -3 * stride);  // p2
+  LOADUV(p1, u, v, -2 * stride);  // p1
+  LOADUV(p0, u, v, -1 * stride);  // p0
+  MAX_DIFF1(t2, t1, p1, p0, mask);
+
+  LOADUV(q0, u, v, 0 * stride);   // q0
+  LOADUV(q1, u, v, 1 * stride);   // q1
+  LOADUV(t1, u, v, 2 * stride);   // q2
+  LOADUV(t2, u, v, 3 * stride);   // q3
+  MAX_DIFF2(t2, t1, q1, q0, mask);
+
+  t1 = _mm_set1_epi8(ithresh);
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
+  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+  // Store
+  STOREUV(p1, u, v, -2 * stride);
+  STOREUV(p0, u, v, -1 * stride);
+  STOREUV(q0, u, v, 0 * stride);
+  STOREUV(q1, u, v, 1 * stride);
+}
+
+static void HFilter16iSSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
+  int k;
+  uint8_t* b;
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+
+  for (k = 3; k > 0; --k) {
+    b = p;
+    Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0);  // p3, p2, p1, p0
+    MAX_DIFF1(t2, t1, p1, p0, mask);
+
+    b += 4;  // beginning of q0
+    Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
+    MAX_DIFF2(t2, t1, q1, q0, mask);
+
+    t1 = _mm_set1_epi8(ithresh);
+    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
+    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+    b -= 2;  // beginning of p1
+    Store16x4(b, b + 8 * stride, stride, &p1, &p0, &q0, &q1);
+
+    p += 4;
+  }
+}
+
+static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
+  MAX_DIFF1(t2, t1, p1, p0, mask);
+
+  u += 4;  // beginning of q0
+  v += 4;
+  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);   // q0, q1, q2, q3
+  MAX_DIFF2(t2, t1, q1, q0, mask);
+
+  t1 = _mm_set1_epi8(ithresh);
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
+  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+  u -= 2;  // beginning of p1
+  v -= 2;
+  Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
+}
+
 extern void VP8DspInitSSE2(void);
 
 void VP8DspInitSSE2(void) {
   VP8Transform = TransformSSE2;
+  VP8HFilter16i = HFilter16iSSE2;
+  VP8VFilter16i = VFilter16iSSE2;
+  VP8VFilter8i = VFilter8iSSE2;
+  VP8HFilter8i = HFilter8iSSE2;
+
   VP8SimpleVFilter16 = SimpleVFilter16SSE2;
   VP8SimpleHFilter16 = SimpleHFilter16SSE2;
   VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
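VP8DspInitSSE2 only swaps function pointers, so the caller is expected to install the portable C defaults first and invoke it only when SSE2 is actually available. A sketch of such caller-side gating (the init order and the compiler builtin are assumptions for illustration, not part of the patch):

    extern void VP8DspInitSSE2(void);

    // Hypothetical hookup: portable C pointers first, then the SSE2
    // overrides when the CPU supports them.
    void DspInitExample(void) {
      /* ... install portable C function pointers here ... */
    #if defined(__GNUC__) || defined(__clang__)
      if (__builtin_cpu_supports("sse2")) {
        VP8DspInitSSE2();
      }
    #endif
    }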