strong filtering speed-up (~2-3% on x86, ~1-2% on NEON)

Extract loop invariants and avoid storing/re-loading samples
when they can be kept in registers across iterations. This is
particularly beneficial when a transpose is involved (HFilter16i).

Change-Id: I93274620f6da220a35025ff8708ff0c9ee8c4139

Author: skal
Date:   2014-06-02 14:41:46 +02:00
Parent: ac591cf22e
Commit: ea8b0a171d

2 changed files with 94 additions and 76 deletions

@@ -620,16 +620,16 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
 #endif  // USE_INTRINSICS
 
 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
+  uint32_t k;
+  for (k = 3; k != 0; --k) {
     p += 4 * stride;
     SimpleVFilter16(p, stride, thresh);
   }
 }
 
 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
+  uint32_t k;
+  for (k = 3; k != 0; --k) {
     p += 4;
     SimpleHFilter16(p, stride, thresh);
   }
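
The hunk above also turns the signed down-count into an unsigned counter
tested against zero. The commit message gives no rationale; a plausible
reading is that counting an unsigned value down to zero maps directly onto
a decrement-and-branch-if-non-zero loop. A minimal sketch of the idiom,
with hypothetical names (RunThreeTimes is not a libwebp function):

    #include <stdint.h>

    /* Run a callback three times using the same count-down shape as
     * SimpleVFilter16i/SimpleHFilter16i above. */
    static void RunThreeTimes(void (*fn)(void* ctx), void* ctx) {
      uint32_t k;
      for (k = 3; k != 0; --k) {
        fn(ctx);
      }
    }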
@@ -845,18 +845,23 @@ static void HFilter16(uint8_t* p, int stride,
 // on three inner edges
 static void VFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t k;
+  uint8x16_t p3, p2, p1, p0;
+  Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
+  for (k = 3; k != 0; --k) {
+    uint8x16_t q0, q1, q2, q3;
     p += 4 * stride;
-    Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+    Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      uint8x16_t op1, op0, oq0, oq1;
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-      Store16x4(op1, op0, oq0, oq1, p, stride);
+      // p3 and p2 are not just temporary variables here: they will be
+      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
+      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store16x4(p1, p0, p3, p2, p, stride);
+      p1 = q2;
+      p0 = q3;
     }
   }
 }
@@ -864,18 +869,21 @@ static void VFilter16i(uint8_t* p, int stride,
 #if !defined(WORK_AROUND_GCC)
 static void HFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t k;
+  uint8x16_t p3, p2, p1, p0;
+  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
+  for (k = 3; k != 0; --k) {
+    uint8x16_t q0, q1, q2, q3;
     p += 4;
-    Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      uint8x16_t op1, op0, oq0, oq1;
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-      Store4x16(op1, op0, oq0, oq1, p, stride);
+      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store4x16(p1, p0, p3, p2, p, stride);
+      p1 = q2;
+      p0 = q3;
    }
  }
}
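
The data-flow change in VFilter16i/HFilter16i distills to the sketch below,
a minimal scalar rendition with hypothetical helper names (Span4, LoadSpan,
FilterEdge are illustrative stand-ins, not libwebp APIs): the four rows just
filtered below one inner edge are exactly the four rows above the next edge,
so one hoisted pre-loop load plus a register rotation replaces a full
eight-row reload per edge. The win is largest in HFilter16i, where every
load also implies a transpose (Load4x16 transposes half as much data as
Load8x16 did).

    #include <stdint.h>

    /* Stand-in for a 4-row, 16-pixel-wide group of NEON registers. */
    typedef struct { uint8_t row[4][16]; } Span4;

    static void LoadSpan(const uint8_t* p, int stride, Span4* s) {
      int r, c;
      for (r = 0; r < 4; ++r) {
        for (c = 0; c < 16; ++c) s->row[r][c] = p[r * stride + c];
      }
    }

    /* Filtering elided: only the load/reuse data flow matters here. */
    static void FilterEdge(Span4* top, Span4* bottom) {
      (void)top;
      (void)bottom;
    }

    /* Old shape: every inner edge reloads all eight surrounding rows. */
    static void VFilter16i_old(uint8_t* p, int stride) {
      uint32_t k;
      for (k = 3; k != 0; --k) {
        Span4 top, bottom;
        p += 4 * stride;
        LoadSpan(p - 4 * stride, stride, &top);  /* rows loaded last time */
        LoadSpan(p, stride, &bottom);
        FilterEdge(&top, &bottom);
      }
    }

    /* New shape: hoist the first load out of the loop, then let the
     * just-filtered bottom span become the next iteration's top span. */
    static void VFilter16i_new(uint8_t* p, int stride) {
      uint32_t k;
      Span4 top;
      LoadSpan(p, stride, &top);         /* loop-invariant initial load */
      for (k = 3; k != 0; --k) {
        Span4 bottom;
        p += 4 * stride;
        LoadSpan(p, stride, &bottom);    /* only four new rows per edge */
        FilterEdge(&top, &bottom);
        top = bottom;                    /* reuse instead of reload */
      }
    }

    int main(void) {
      uint8_t buf[16 * 16] = { 0 };      /* a 16x16 macroblock, stride 16 */
      VFilter16i_old(buf, 16);
      VFilter16i_new(buf, 16);
      return 0;
    }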