SSE2 version of the complex loop filter.

12–15% faster.
(Only the inner edges are implemented for now.)

Patch by Somnath Banerjee (somnath at google dot com)
This commit is contained in:
Pascal Massimino 2011-06-20 00:27:47 -07:00
parent 96ed9ce0fb
commit bd2f65f67c

View File

@ -245,8 +245,8 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
// Compute abs(p - q) = subs(p - q) OR subs(q - p)
#define MM_ABS(p, q) _mm_or_si128( \
_mm_subs_epu8(*(q), *(p)), \
_mm_subs_epu8(*(p), *(q)))
_mm_subs_epu8((q), (p)), \
_mm_subs_epu8((p), (q)))
// Shift each byte of "a" by N bits while preserving by the sign bit.
//
@ -264,23 +264,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
a = _mm_or_si128(t, a); \
}
static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
const __m128i* q1, int thresh, __m128i *mask) {
__m128i t1, t2;
__m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
*mask = _mm_set1_epi8(0xFE);
t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero
t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2
t1 = MM_ABS(p1, q1); // abs(p1 - q1)
t2 = _mm_set1_epi8(0xFE);
t1 = _mm_and_si128(t1, t2); // set lsb of each byte to zero
t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2
t2 = MM_ABS(p0, q0);
t2 = _mm_adds_epu8(t2, t2); // abs(p0 - q0) * 2
t2 = _mm_adds_epu8(t2, t1); // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
*mask = MM_ABS(*p0, *q0); // abs(p0 - q0)
*mask = _mm_adds_epu8(*mask, *mask); // abs(p0 - q0) * 2
*mask = _mm_adds_epu8(*mask, t1); // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
t1 = _mm_set1_epi8(thresh);
t2 = _mm_subs_epu8(t2, t1); // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > thresh
*mask = _mm_cmpeq_epi8(t2, _mm_setzero_si128());
*mask = _mm_subs_epu8(*mask, t1); // mask <= thresh
*mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
}
//-----------------------------------------------------------------------------
@ -308,21 +305,82 @@ static void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
// Do +4 side
t2 = _mm_set1_epi8(4);
t2 = _mm_adds_epi8(t2, t1); // 3 * (q0 - p0) + (p1 - q1) + 4
t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + (p1 - q1) + 4
SIGNED_SHIFT_N(t2, 3); // t2 >> 3
*q0 = _mm_subs_epi8(*q0, t2); // q0 -= a
*q0 = _mm_subs_epi8(*q0, t2); // q0 -= t2
// Now do +3 side
t2 = _mm_set1_epi8(3);
t2 = _mm_adds_epi8(t2, t1); // +3 instead of +4
t2 = _mm_adds_epi8(t1, t2); // +3 instead of +4
SIGNED_SHIFT_N(t2, 3); // t2 >> 3
*p0 = _mm_adds_epi8(*p0, t2); // p0 += b
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
// unoffset
*p0 = _mm_xor_si128(*p0, sign_bit);
*q0 = _mm_xor_si128(*q0, sign_bit);
}
// Applies the "normal" (complex) loop filter on p1, p0, q0 and q1.
// Each register holds 16 unsigned pixel bytes; only the byte lanes selected
// by *mask are modified. The outer pixels p1/q1 are only adjusted on lanes
// where the high-edge-variance (hev) test is false.
static void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1,
const __m128i* mask, int hev_thresh) {
__m128i t1, t2, t3;
__m128i hev = _mm_set1_epi8(hev_thresh);
const __m128i sign_bit = _mm_set1_epi8(0x80);
// compute hev mask: 0xff where abs(p1-p0) > hev_thresh or abs(q1-q0) > hev_thresh
t1 = MM_ABS(*p1, *p0);
t2 = MM_ABS(*q1, *q0);
t1 = _mm_subs_epu8(t1, hev); // abs(p1 - p0) - hev_thresh, saturated at 0
t2 = _mm_subs_epu8(t2, hev); // abs(q1 - q0) - hev_thresh, saturated at 0
hev = _mm_or_si128(t1, t2); // non-zero where either difference exceeds the threshold
t1 = _mm_setzero_si128();
hev = _mm_cmpeq_epi8(hev, t1); // 0xff where BOTH differences are <= hev_thresh
t1 = _mm_set1_epi16(0xffff); // load 0xff on all bytes
hev = _mm_xor_si128(hev, t1); // invert: 0xff exactly where the hev condition holds
// convert to signed values (bias by -128) so saturating epi8 math can be used
*p0 = _mm_xor_si128(*p0, sign_bit);
*q0 = _mm_xor_si128(*q0, sign_bit);
*p1 = _mm_xor_si128(*p1, sign_bit);
*q1 = _mm_xor_si128(*q1, sign_bit);
t1 = _mm_subs_epi8(*p1, *q1); // p1 - q1
t1 = _mm_and_si128(hev, t1); // hev(p1 - q1): (p1 - q1) kept only on hev lanes
t2 = _mm_subs_epi8(*q0, *p0); // q0 - p0
t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0)
t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0)
t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0)
t1 = _mm_and_si128(t1, *mask); // mask filter values we don't care about
// Do +4 side
t2 = _mm_set1_epi8(4);
t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + hev(p1 - q1) + 4
SIGNED_SHIFT_N(t2, 3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
t3 = t2; // save t2 for the outer-tap (p1/q1) adjustment below
*q0 = _mm_subs_epi8(*q0, t2); // q0 -= t2
// Now do +3 side
t2 = _mm_set1_epi8(3);
t2 = _mm_adds_epi8(t1, t2); // +3 instead of +4
SIGNED_SHIFT_N(t2, 3); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
// outer taps: a = (f1 + 1) >> 1, applied only where !hev
t2 = _mm_set1_epi8(1);
t3 = _mm_adds_epi8(t3, t2); // t3 + 1
SIGNED_SHIFT_N(t3, 1); // total shift of 4: (... + 4) >> 3, then (+1) >> 1
hev = _mm_andnot_si128(hev, t3); // keep the adjustment only on non-hev lanes
*q1 = _mm_subs_epi8(*q1, hev); // q1 -= t3
*p1 = _mm_adds_epi8(*p1, hev); // p1 += t3
// unoffset: convert back to unsigned pixel values
*p0 = _mm_xor_si128(*p0, sign_bit);
*q0 = _mm_xor_si128(*q0, sign_bit);
*p1 = _mm_xor_si128(*p1, sign_bit);
*q1 = _mm_xor_si128(*q1, sign_bit);
}
// Reads 8 rows across a vertical edge.
//
// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
@ -366,6 +424,40 @@ static void Load8x4(const uint8_t* b, int stride, __m128i* p, __m128i* q) {
*q = _mm_unpackhi_epi32(t1, t2);
}
// Loads a 16-row x 4-column block of pixels straddling a vertical edge and
// transposes it into four column registers p1, p0, q0, q1.
// r0 points at row 0 and r8 at row 8 of the strip; each row contributes
// 4 consecutive bytes (two on each side of the edge).
static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride,
                            __m128i* p1, __m128i* p0,
                            __m128i* q0, __m128i* q1) {
  __m128i top, bot;
  // Pixel layout around the edge (|):
  //   00 01 | 02 03
  //   10 11 | 12 13
  //   ...   | ...
  //   e0 e1 | e2 e3
  //   f0 f1 | f2 f3
  //
  // After the two 8x4 transposing loads, each register holds two
  // interleaved half-columns:
  //   p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  //   q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  //   p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
  //   q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
  Load8x4(r0, stride, p1, q0);
  Load8x4(r8, stride, p0, q1);
  top = *p1;
  bot = *q0;
  // Recombine the 64-bit halves so each output register is one full column:
  //   p1 = column 0 (rows 0..f), p0 = column 1,
  //   q0 = column 2,             q1 = column 3
  *p1 = _mm_unpacklo_epi64(top, *p0);
  *p0 = _mm_unpackhi_epi64(top, *p0);
  *q0 = _mm_unpacklo_epi64(bot, *q1);
  *q1 = _mm_unpackhi_epi64(bot, *q1);
}
static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
int i;
for (i = 0; i < 4; ++i, dst += stride) {
@ -374,6 +466,44 @@ static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
}
}
// Transposes four column registers back into a 16-row x 4-column pixel
// block and stores it across the vertical edge: rows 0..7 starting at r0,
// rows 8..15 starting at r8 (inverse of Load16x4).
static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1,
                             __m128i* p0, __m128i* q0, __m128i* q1) {
  __m128i tmp;
  // Interleave the two left columns byte-wise:
  //   p0 = 71 70 61 60 ... 11 10 01 00
  //   p1 = f1 f0 e1 e0 ... b1 b0 91 90 81 80
  tmp = *p0;
  *p0 = _mm_unpacklo_epi8(*p1, tmp);
  *p1 = _mm_unpackhi_epi8(*p1, tmp);
  // Interleave the two right columns byte-wise:
  //   q0 = 73 72 63 62 ... 13 12 03 02
  //   q1 = f3 f2 e3 e2 ... 93 92 83 82
  tmp = *q0;
  *q0 = _mm_unpacklo_epi8(tmp, *q1);
  *q1 = _mm_unpackhi_epi8(tmp, *q1);
  // Interleave 16-bit pairs so each register holds four complete rows:
  //   p0 = rows 0..3,  q0 = rows 4..7
  tmp = *p0;
  *p0 = _mm_unpacklo_epi16(tmp, *q0);
  *q0 = _mm_unpackhi_epi16(tmp, *q0);
  //   p1 = rows 8..11, q1 = rows 12..15
  tmp = *p1;
  *p1 = _mm_unpacklo_epi16(tmp, *q1);
  *q1 = _mm_unpackhi_epi16(tmp, *q1);
  Store4x4(p0, r0, stride);
  Store4x4(q0, r0 + 4 * stride, stride);
  Store4x4(p1, r8, stride);
  Store4x4(q1, r8 + 4 * stride, stride);
}
//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
@ -392,69 +522,13 @@ static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
}
static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
__m128i t1, t2;
__m128i p1, p0, q0, q1;
// Assume the pixels around the edge (|) are numbered as follows
// 00 01 | 02 03
// 10 11 | 12 13
// ... | ...
// e0 e1 | e2 e3
// f0 f1 | f2 f3
p -= 2; // beginning of the first segment
// Load
// p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
// q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
// p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
// q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
Load8x4(p, stride, &p1, &q0);
Load8x4(p + 8 * stride, stride, &p0, &q1);
p -= 2; // beginning of p1
t1 = p1;
t2 = q0;
// p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
// p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
// q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
// q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
p1 = _mm_unpacklo_epi64(p1, p0);
p0 = _mm_unpackhi_epi64(t1, p0);
q0 = _mm_unpacklo_epi64(q0, q1);
q1 = _mm_unpackhi_epi64(t2, q1);
// Filter
Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
DoFilter2(&p1, &p0, &q0, &q1, thresh);
t1 = p0;
t2 = q0;
// Transpose back to write out
// p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
// p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
// q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
// q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
p0 = _mm_unpacklo_epi8(p1, t1);
p1 = _mm_unpackhi_epi8(p1, t1);
q0 = _mm_unpacklo_epi8(t2, q1);
q1 = _mm_unpackhi_epi8(t2, q1);
t1 = p0;
t2 = p1;
// p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
// q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
// p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
// q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
p0 = _mm_unpacklo_epi16(p0, q0);
q0 = _mm_unpackhi_epi16(t1, q0);
p1 = _mm_unpacklo_epi16(p1, q1);
q1 = _mm_unpackhi_epi16(t2, q1);
// Store
Store4x4(&p0, p, stride);
p += 4 * stride;
Store4x4(&q0, p, stride);
p += 4 * stride;
Store4x4(&p1, p, stride);
p += 4 * stride;
Store4x4(&q1, p, stride);
Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
}
static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
@ -473,11 +547,163 @@ static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
}
}
//-----------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
// m = max(abs(p3 - p2), abs(p2 - p1), abs(p1 - p0))
// All macros below use do { } while (0) so they expand as a single
// statement and remain safe inside an unbraced if/else.
#define MAX_DIFF1(p3, p2, p1, p0, m) do { \
  m = MM_ABS(p3, p2); \
  m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
  m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
} while (0)

// m = max(m, abs(p3 - p2), abs(p2 - p1), abs(p1 - p0))
#define MAX_DIFF2(p3, p2, p1, p0, m) do { \
  m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
  m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
  m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
} while (0)

// Loads 8 bytes of u and 8 bytes of v (at row offset 'stride') into the
// low/high 64-bit halves of p, so both chroma planes are filtered at once.
#define LOADUV(p, u, v, stride) do { \
  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)])); \
} while (0)

// Stores the low half of p to u and the high half to v (at row offset
// 'stride'). Note: clobbers p.
#define STOREUV(p, u, v, stride) do { \
  _mm_storel_epi64((__m128i*)&(u)[(stride)], p); \
  p = _mm_unpackhi_epi64(p, p); \
  _mm_storel_epi64((__m128i*)&(v)[(stride)], p); \
} while (0)

// Builds the final filter mask: 0xff where the accumulated interior
// difference ('mask' on entry) is <= it AND NeedsFilter() passes with
// threshold t. 'it' is clobbered (reused as NeedsFilter's output).
#define COMPLEX_FL_MASK(p1, p0, q0, q1, t, it, mask) do { \
  mask = _mm_subs_epu8(mask, it); \
  mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128()); \
  NeedsFilter(&(p1), &(p0), &(q0), &(q1), t, &(it)); \
  mask = _mm_and_si128(mask, it); \
} while (0)
// Applies the complex in-loop filter across the three inner horizontal
// edges (rows 4, 8 and 12) of a 16x16 luma macroblock.
static void VFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i mask;
  __m128i a, b, p1, p0, q0, q1;
  for (k = 0; k < 3; ++k) {
    p += 4 * stride;   // advance to the next inner edge
    // Load the 4 rows on each side of the edge, accumulating the maximum
    // neighboring-pixel difference in 'mask' as we go.
    b = _mm_loadu_si128((__m128i*)&p[-4 * stride]);   // p3
    a = _mm_loadu_si128((__m128i*)&p[-3 * stride]);   // p2
    p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);  // p1
    p0 = _mm_loadu_si128((__m128i*)&p[-1 * stride]);  // p0
    MAX_DIFF1(b, a, p1, p0, mask);
    q0 = _mm_loadu_si128((__m128i*)&p[0 * stride]);   // q0
    q1 = _mm_loadu_si128((__m128i*)&p[1 * stride]);   // q1
    a = _mm_loadu_si128((__m128i*)&p[2 * stride]);    // q2
    b = _mm_loadu_si128((__m128i*)&p[3 * stride]);    // q3
    MAX_DIFF2(b, a, q1, q0, mask);
    a = _mm_set1_epi8(ithresh);
    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, a, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
    // Write the four filtered rows back.
    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
  }
}
// Applies the complex in-loop filter across the single inner horizontal
// edge (row 4) of the 8x8 chroma blocks. The u and v planes are filtered
// together: LOADUV packs 8 bytes of each plane into one 16-byte register.
static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
u += 4 * stride;  // move both planes to the inner edge
v += 4 * stride;
// Load the 4 rows on each side of the edge, accumulating the maximum
// neighboring-pixel difference in 'mask'.
LOADUV(t2, u, v, -4 * stride); // p3
LOADUV(t1, u, v, -3 * stride); // p2
LOADUV(p1, u, v, -2 * stride); // p1
LOADUV(p0, u, v, -1 * stride); // p0
MAX_DIFF1(t2, t1, p1, p0, mask);
LOADUV(q0, u, v, 0 * stride); // q0
LOADUV(q1, u, v, 1 * stride); // q1
LOADUV(t1, u, v, 2 * stride); // q2
LOADUV(t2, u, v, 3 * stride); // q3
MAX_DIFF2(t2, t1, q1, q0, mask);
t1 = _mm_set1_epi8(ithresh);
COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
// Store the filtered rows back to both planes.
STOREUV(p1, u, v, -2 * stride);
STOREUV(p0, u, v, -1 * stride);
STOREUV(q0, u, v, 0 * stride);
STOREUV(q1, u, v, 1 * stride);
}
// Applies the complex in-loop filter across the three inner vertical
// edges (columns 4, 8 and 12) of a 16x16 luma macroblock. Columns are
// gathered/scattered via the transposing Load16x4/Store16x4 helpers.
static void HFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  int k;
  uint8_t* ptr;
  __m128i mask;
  __m128i a, b, p1, p0, q0, q1;
  for (k = 0; k < 3; ++k) {
    ptr = p;
    // Columns left of the edge: p3, p2, p1, p0.
    Load16x4(ptr, ptr + 8 * stride, stride, &b, &a, &p1, &p0);
    MAX_DIFF1(b, a, p1, p0, mask);
    ptr += 4;                    // beginning of q0
    // Columns right of the edge: q0, q1, q2, q3.
    Load16x4(ptr, ptr + 8 * stride, stride, &q0, &q1, &a, &b);
    MAX_DIFF2(b, a, q1, q0, mask);
    a = _mm_set1_epi8(ithresh);
    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, a, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
    ptr -= 2;                    // beginning of p1
    Store16x4(ptr, ptr + 8 * stride, stride, &p1, &p0, &q0, &q1);
    p += 4;                      // advance to the next inner edge
  }
}
// Applies the complex in-loop filter across the single inner vertical
// edge (column 4) of the 8x8 chroma blocks. Load16x4 transposes 8 rows
// of u plus 8 rows of v into shared column registers so both planes are
// filtered at once.
static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
MAX_DIFF1(t2, t1, p1, p0, mask);
u += 4; // beginning of q0
v += 4;
Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
MAX_DIFF2(t2, t1, q1, q0, mask);
t1 = _mm_set1_epi8(ithresh);
COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, t1, mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
u -= 2; // beginning of p1
v -= 2;
Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
}
extern void VP8DspInitSSE2(void);
void VP8DspInitSSE2(void) {
VP8Transform = TransformSSE2;
VP8HFilter16i = HFilter16iSSE2;
VP8VFilter16i = VFilter16iSSE2;
VP8VFilter8i = VFilter8iSSE2;
VP8HFilter8i = HFilter8iSSE2;
VP8SimpleVFilter16 = SimpleVFilter16SSE2;
VP8SimpleHFilter16 = SimpleHFilter16SSE2;
VP8SimpleVFilter16i = SimpleVFilter16iSSE2;