strong filtering speed-up (~2-3% x86, ~1-2% for NEON)
Extract the loop-invariant load and avoid storing/loading samples when
they can be re-used. This is particularly interesting when a transpose
is involved (HFilter16i).

Change-Id: I93274620f6da220a35025ff8708ff0c9ee8c4139
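The pattern behind the change, in scalar form: the four rows above the
first inner edge are loaded once before the loop, and each iteration
loads only the four new rows, rotating the previous iteration's samples
into place instead of re-reading them. A minimal sketch under assumed
names (LoadRows4 and FilterEdge are illustrative stand-ins, not libwebp
functions):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-ins for the real load/filter primitives. */
    static void LoadRows4(const uint8_t* src, int stride,
                          uint8_t rows[4][16]) {
      int i;
      for (i = 0; i < 4; ++i) memcpy(rows[i], src + i * stride, 16);
    }
    static void FilterEdge(uint8_t above[4][16], uint8_t below[4][16]) {
      (void)above; (void)below;   /* actual filtering elided */
    }

    static void FilterInnerEdges(uint8_t* p, int stride) {
      uint8_t above[4][16], below[4][16];
      uint32_t k;
      LoadRows4(p, stride, above);            /* loop invariant, hoisted */
      for (k = 3; k != 0; --k) {
        p += 4 * stride;
        LoadRows4(p, stride, below);          /* only the new samples */
        FilterEdge(above, below);
        memcpy(above, below, sizeof(above));  /* re-use, don't re-load */
      }
    }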
@@ -620,16 +620,16 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
 #endif  // USE_INTRINSICS

 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
+  uint32_t k;
+  for (k = 3; k != 0; --k) {
     p += 4 * stride;
     SimpleVFilter16(p, stride, thresh);
   }
 }

 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
+  uint32_t k;
+  for (k = 3; k != 0; --k) {
     p += 4;
     SimpleHFilter16(p, stride, thresh);
   }
@@ -845,18 +845,23 @@ static void HFilter16(uint8_t* p, int stride,
 // on three inner edges
 static void VFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t k;
+  uint8x16_t p3, p2, p1, p0;
+  Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
+  for (k = 3; k != 0; --k) {
+    uint8x16_t q0, q1, q2, q3;
     p += 4 * stride;
-    Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+    Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      uint8x16_t op1, op0, oq0, oq1;
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-      Store16x4(op1, op0, oq0, oq1, p, stride);
+      // p3 and p2 are not just temporary variables here: they will be
+      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
+      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store16x4(p1, p0, p3, p2, p, stride);
+      p1 = q2;
+      p0 = q3;
     }
   }
 }
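Concretely, in the vertical hunk above: DoFilter4 now writes its outputs
straight into p1/p0/p3/p2, so after the store the filtered q0/q1 sit in
p3/p2 and the untouched q2/q3 are copied into p1/p0. Those four
registers hold exactly the four rows above the next edge, so nothing has
to be re-read. A rough per-macroblock load count, read off the diff (a
sketch of the accounting, not a measurement):

    /* before: 3 iterations x Load16x8  -> 3 * 8     = 24 rows loaded
     * after : Load16x4 + 3 x Load16x4  -> 4 + 3 * 4 = 16 rows loaded
     * One third of the loads disappear; HFilter16i below saves the
     * corresponding transposes as well.
     */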
@@ -864,18 +869,21 @@ static void VFilter16i(uint8_t* p, int stride,
 #if !defined(WORK_AROUND_GCC)
 static void HFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t k;
+  uint8x16_t p3, p2, p1, p0;
+  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
+  for (k = 3; k != 0; --k) {
+    uint8x16_t q0, q1, q2, q3;
     p += 4;
-    Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      uint8x16_t op1, op0, oq0, oq1;
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-      Store4x16(op1, op0, oq0, oq1, p, stride);
+      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store4x16(p1, p0, p3, p2, p, stride);
+      p1 = q2;
+      p0 = q3;
     }
   }
 }
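The transpose remark in the commit message applies to this last hunk:
HFilter16i filters vertical edges, so each of p3..q3 lives in an image
column and Load4x16/Load8x16 must gather strided bytes into registers,
effectively a 4x16 (or 8x16) transpose. A minimal scalar sketch of what
one such gather does; GatherColumns4x16 is a hypothetical illustration,
not the library's Load4x16:

    #include <stdint.h>

    /* Gathering 4 columns x 16 rows is a strided, per-byte gather --
     * effectively a 4x16 transpose -- so halving the number of loads
     * in HFilter16i also halves this work. */
    static void GatherColumns4x16(const uint8_t* src, int stride,
                                  uint8_t out[4][16]) {
      int row, col;
      for (row = 0; row < 16; ++row) {
        for (col = 0; col < 4; ++col) {
          out[col][row] = src[row * stride + col];
        }
      }
    }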