Merge "strong filtering speed-up (~2-3% x86, ~1-2% for NEON)"

James Zern 2014-06-02 23:06:18 -07:00 committed by Gerrit Code Review
commit 9754d39a4e
2 changed files with 94 additions and 76 deletions

View File

@@ -620,16 +620,16 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
 #endif  // USE_INTRINSICS
 
 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
+  uint32_t k;
+  for (k = 3; k != 0; --k) {
     p += 4 * stride;
     SimpleVFilter16(p, stride, thresh);
   }
 }
 
 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
+  uint32_t k;
+  for (k = 3; k != 0; --k) {
     p += 4;
     SimpleHFilter16(p, stride, thresh);
   }
@@ -845,18 +845,23 @@ static void HFilter16(uint8_t* p, int stride,
 // on three inner edges
 static void VFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t k;
+  uint8x16_t p3, p2, p1, p0;
+  Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
+  for (k = 3; k != 0; --k) {
+    uint8x16_t q0, q1, q2, q3;
     p += 4 * stride;
-    Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+    Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      uint8x16_t op1, op0, oq0, oq1;
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-      Store16x4(op1, op0, oq0, oq1, p, stride);
+      // p3 and p2 are not just temporary variables here: they will be
+      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
+      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store16x4(p1, p0, p3, p2, p, stride);
+      p1 = q2;
+      p0 = q3;
     }
   }
 }
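The rewritten NEON VFilter16i above hoists the first Load16x4 out of the loop and, at the end of each span, keeps the already-loaded q2/q3 around as the next span's p1/p0, so each iteration reads only four new rows instead of the eight that Load16x8 used to fetch. A rough scalar sketch of that reuse pattern follows; FilterEdge() is a hypothetical stand-in for DoFilter4(), and the real code also carries p3/p2 plus a threshold mask, but the rotation idea is the same:

#include <stdint.h>

// Toy stand-in for DoFilter4(): smooths the samples around the edge
// between p0 and q0. (Hypothetical helper, illustration only.)
static void FilterEdge(uint8_t* const p1, uint8_t* const p0,
                       uint8_t* const q0, uint8_t* const q1) {
  const int avg = (*p1 + 3 * *p0 + 3 * *q0 + *q1 + 4) >> 3;
  *p1 = (uint8_t)((3 * *p1 + avg) >> 2);
  *p0 = (uint8_t)((*p0 + avg) >> 1);
  *q0 = (uint8_t)((*q0 + avg) >> 1);
  *q1 = (uint8_t)((3 * *q1 + avg) >> 2);
}

// Filters one column of a 16-row block on its three inner edges (rows 4, 8
// and 12). Only the four new rows q0..q3 are read per iteration; q2/q3 are
// then kept in locals and become the next span's p1/p0 instead of being
// re-read from memory.
static void FilterColumnInnerEdges(uint8_t* p, int stride) {
  uint8_t p1 = p[2 * stride];           // prologue: rows 2 and 3
  uint8_t p0 = p[3 * stride];
  uint32_t k;
  for (k = 3; k != 0; --k) {
    uint8_t q0, q1, q2, q3;
    p += 4 * stride;
    q0 = p[0 * stride];                 // the only loads in the loop body
    q1 = p[1 * stride];
    q2 = p[2 * stride];
    q3 = p[3 * stride];
    FilterEdge(&p1, &p0, &q0, &q1);
    p[-2 * stride] = p1;                // write back the filtered samples
    p[-1 * stride] = p0;
    p[0 * stride] = q0;
    p[1 * stride] = q1;
    p1 = q2;                            // rotate: no reload next iteration
    p0 = q3;
  }
}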
@@ -864,18 +869,21 @@ static void VFilter16i(uint8_t* p, int stride,
 #if !defined(WORK_AROUND_GCC)
 static void HFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t k;
+  uint8x16_t p3, p2, p1, p0;
+  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
+  for (k = 3; k != 0; --k) {
+    uint8x16_t q0, q1, q2, q3;
     p += 4;
-    Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      uint8x16_t op1, op0, oq0, oq1;
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-      Store4x16(op1, op0, oq0, oq1, p, stride);
+      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store4x16(p1, p0, p3, p2, p, stride);
+      p1 = q2;
+      p0 = q3;
     }
   }
 }

View File

@@ -608,43 +608,45 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
 }
 
 // Transpose back and store
-static WEBP_INLINE void Store16x4(__m128i* const p1, __m128i* const p0,
-                                  __m128i* const q0, __m128i* const q1,
+static WEBP_INLINE void Store16x4(const __m128i* const p1,
+                                  const __m128i* const p0,
+                                  const __m128i* const q0,
+                                  const __m128i* const q1,
                                   uint8_t* r0, uint8_t* r8,
                                   int stride) {
-  __m128i t1;
+  __m128i t1, p1_s, p0_s, q0_s, q1_s;
 
   // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
   // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
   t1 = *p0;
-  *p0 = _mm_unpacklo_epi8(*p1, t1);
-  *p1 = _mm_unpackhi_epi8(*p1, t1);
+  p0_s = _mm_unpacklo_epi8(*p1, t1);
+  p1_s = _mm_unpackhi_epi8(*p1, t1);
 
   // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
   // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
   t1 = *q0;
-  *q0 = _mm_unpacklo_epi8(t1, *q1);
-  *q1 = _mm_unpackhi_epi8(t1, *q1);
+  q0_s = _mm_unpacklo_epi8(t1, *q1);
+  q1_s = _mm_unpackhi_epi8(t1, *q1);
 
   // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
   // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-  t1 = *p0;
-  *p0 = _mm_unpacklo_epi16(t1, *q0);
-  *q0 = _mm_unpackhi_epi16(t1, *q0);
+  t1 = p0_s;
+  p0_s = _mm_unpacklo_epi16(t1, q0_s);
+  q0_s = _mm_unpackhi_epi16(t1, q0_s);
 
   // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
   // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-  t1 = *p1;
-  *p1 = _mm_unpacklo_epi16(t1, *q1);
-  *q1 = _mm_unpackhi_epi16(t1, *q1);
+  t1 = p1_s;
+  p1_s = _mm_unpacklo_epi16(t1, q1_s);
+  q1_s = _mm_unpackhi_epi16(t1, q1_s);
 
-  Store4x4(p0, r0, stride);
+  Store4x4(&p0_s, r0, stride);
   r0 += 4 * stride;
-  Store4x4(q0, r0, stride);
+  Store4x4(&q0_s, r0, stride);
 
-  Store4x4(p1, r8, stride);
+  Store4x4(&p1_s, r8, stride);
   r8 += 4 * stride;
-  Store4x4(q1, r8, stride);
+  Store4x4(&q1_s, r8, stride);
 }
 
 //------------------------------------------------------------------------------
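The interesting part of this hunk is not just the renaming: the old Store16x4() transposed through its pointer arguments and so clobbered the caller's registers, whereas the new version takes const pointers and does all of the shuffling in local copies (p1_s .. q1_s). That is what lets the reworked HFilter16i() further down keep p3/p2 alive across the store and reuse them for the next span. A minimal sketch of the difference, with hypothetical helper names rather than the libwebp ones:

#include <stdint.h>

// Old shape: the helper uses its argument as scratch space, so the
// caller's value is destroyed by the call.
static void StoreBytesInPlace(uint16_t* const a, uint8_t dst[2]) {
  *a = (uint16_t)((*a << 8) | (*a >> 8));   // scratch work on the input
  dst[0] = (uint8_t)(*a >> 8);
  dst[1] = (uint8_t)(*a & 0xff);
}

// New shape: const input, scratch in a local; the caller may keep using
// *a after the call (as HFilter16i now does with p3/p2).
static void StoreBytes(const uint16_t* const a, uint8_t dst[2]) {
  const uint16_t a_s = (uint16_t)((*a << 8) | (*a >> 8));
  dst[0] = (uint8_t)(a_s >> 8);
  dst[1] = (uint8_t)(a_s & 0xff);
}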
@@ -693,17 +695,17 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
-#define MAX_DIFF1(p3, p2, p1, p0, m) {   \
-  m = MM_ABS(p3, p2);                    \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));   \
-  m = _mm_max_epu8(m, MM_ABS(p1, p0));   \
-}
+#define MAX_DIFF1(p3, p2, p1, p0, m) do {  \
+  m = MM_ABS(p1, p0);                      \
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));     \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));     \
+} while (0)
 
-#define MAX_DIFF2(p3, p2, p1, p0, m) {   \
-  m = _mm_max_epu8(m, MM_ABS(p3, p2));   \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));   \
-  m = _mm_max_epu8(m, MM_ABS(p1, p0));   \
-}
+#define MAX_DIFF2(p3, p2, p1, p0, m) do {  \
+  m = _mm_max_epu8(m, MM_ABS(p1, p0));     \
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));     \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));     \
+} while (0)
 
 #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                \
   e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);               \
@@ -712,10 +714,11 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
   e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);               \
 }
 
-#define LOADUV_H_EDGE(p, u, v, stride) {                                \
-  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                        \
-  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)])); \
-}
+#define LOADUV_H_EDGE(p, u, v, stride) do {                             \
+  const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);          \
+  const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]);          \
+  p = _mm_unpacklo_epi64(U, V);                                         \
+} while (0)
 
 #define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \
   LOADUV_H_EDGE(e1, u, v, 0 * stride); \
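Both macro hunks above also switch the statement-style macros from a bare { ... } block to the conventional do { ... } while (0) wrapper. The difference only shows up when the macro is invoked like a normal statement in front of an else: with a bare block, the caller's trailing semicolon terminates the if and orphans the else. A small stand-alone illustration (the CLAMP macros here are hypothetical, not from this file):

#include <stdio.h>

#define BAD_CLAMP(x)  { if ((x) > 255) (x) = 255; }               // bare block
#define GOOD_CLAMP(x) do { if ((x) > 255) (x) = 255; } while (0)  // statement-like

int main(void) {
  int v = 300;
  // "if (v > 0) BAD_CLAMP(v); else v = 0;" would not compile: the ';' after
  // the expanded '}' ends the if, leaving the else dangling.
  if (v > 0) GOOD_CLAMP(v); else v = 0;   // expands to a single statement
  printf("%d\n", v);                      // prints 255
  return 0;
}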
@@ -794,54 +797,61 @@ static void HFilter16(uint8_t* p, int stride,
 static void VFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
   int k;
-  __m128i mask;
-  __m128i t1, t2, p1, p0, q0, q1;
+  __m128i p3, p2, p1, p0;   // loop invariants
+
+  LOAD_H_EDGES4(p, stride, p3, p2, p1, p0);  // prologue
 
   for (k = 3; k > 0; --k) {
-    // Load p3, p2, p1, p0
-    LOAD_H_EDGES4(p, stride, t2, t1, p1, p0);
-    MAX_DIFF1(t2, t1, p1, p0, mask);
-
+    __m128i mask, tmp1, tmp2;
+    uint8_t* const b = p + 2 * stride;   // beginning of p1
     p += 4 * stride;
 
-    // Load q0, q1, q2, q3
-    LOAD_H_EDGES4(p, stride, q0, q1, t1, t2);
-    MAX_DIFF2(t2, t1, q1, q0, mask);
+    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
+    LOAD_H_EDGES4(p, stride, p3, p2, tmp1, tmp2);
+    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
 
-    ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+    // p3 and p2 are not just temporary variables here: they will be
+    // re-used for next span. And q2/q3 will become p1/p0 accordingly.
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
 
     // Store
-    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
-    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
-    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
-    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+    _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
+    _mm_storeu_si128((__m128i*)&b[1 * stride], p0);
+    _mm_storeu_si128((__m128i*)&b[2 * stride], p3);
+    _mm_storeu_si128((__m128i*)&b[3 * stride], p2);
+
+    // rotate samples
+    p1 = tmp1;
+    p0 = tmp2;
   }
 }
 
 static void HFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
   int k;
-  uint8_t* b;
-  __m128i mask;
-  __m128i t1, t2, p1, p0, q0, q1;
+  __m128i p3, p2, p1, p0;   // loop invariants
+
+  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
 
   for (k = 3; k > 0; --k) {
-    b = p;
-    Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0);  // p3, p2, p1, p0
-    MAX_DIFF1(t2, t1, p1, p0, mask);
+    __m128i mask, tmp1, tmp2;
+    uint8_t* const b = p + 2;   // beginning of p1
 
-    b += 4;  // beginning of q0
-    Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
-    MAX_DIFF2(t2, t1, q1, q0, mask);
+    p += 4;  // beginning of q0 (and next span)
+
+    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
+    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
 
-    ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
 
-    b -= 2;  // beginning of p1
-    Store16x4(&p1, &p0, &q0, &q1, b, b + 8 * stride, stride);
+    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
 
-    p += 4;
+    // rotate samples
+    p1 = tmp1;
+    p0 = tmp2;
   }
 }
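A quick tally of what the rewrite saves: the vertical x86 path now issues one LOAD_H_EDGES4 in the prologue plus one per span, i.e. 4 + 3*4 = 16 row loads for the three inner edges, against 3*8 = 24 in the old loop; the horizontal path likewise drops from six Load16x4 transposes to four, and the NEON functions get the same one-third cut in loads. The stores are unchanged, so the saving is confined to the load/mask half of the filter, which is roughly in line with the few-percent figures quoted in the commit title.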