Speed up the fancy upscaler

by processing two rows at a time.
The [9 3 3 1] weights are decomposed as
[1 1 1 1] + [0 2 2 0] + [8 0 0 0] for better
reuse of sub-expressions, too.

Change-Id: I87ab549048ed249d38add73bb3241dfa0c583328
This commit is contained in:
Pascal Massimino 2010-12-14 13:52:07 -08:00
parent 9145f3bc93
commit d72180a489

View File

@ -76,87 +76,98 @@ typedef enum { MODE_RGB = 0, MODE_RGBA = 1,
// we interpolate u/v as:
// ([9*a + 3*b + 3*c + d,  3*a + 9*b + 3*c + d] + [8 8]) / 16
// ([3*a + b + 9*c + 3*d,  a + 3*b + 3*c + 9*d] + [8 8]) / 16
#define MIX_ODD(a, b, c, d) \
((9 * (a) + 3 * ((b) + (c)) + (d) + 0x00080008u) >> 4)
#define MIX_EVEN(a, b, c, d) \
((9 * (c) + 3 * ((d) + (a)) + (b) + 0x00080008u) >> 4)
// We process u and v together stashed into 32bit (16bit each).
// Note that we could store the pair (3*t_uv + uv, t_uv + 3*uv)
// instead of (t_uv, uv), into a 64bit variable. Doing so, we could
// simplify the MIXing a bit and save two multiplies. TODO(skal).
#define LOAD_UV(u,v) ((u) | ((v) << 16))
// Macro festival, so we can define all of rgb/bgr/rgba/bgra cases
// for odd and even lines
#define UPSCALE_FUNC(FUNC_NAME, MIX, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* cur_y, \
const uint8_t* cur_u, const uint8_t* cur_v, \
const uint8_t* top_u, const uint8_t* top_v, \
int len, uint8_t* dst) { \
int x; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \
uint32_t uv0 = MIX(tl_uv, tl_uv, l_uv, l_uv); \
FUNC(cur_y[0], uv0 & 0xff, (uv0 >> 16), dst); \
len -= 1; /* first pixel is done. */ \
for (x = 1; x <= (len >> 1); ++x) { \
const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \
const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \
const uint32_t uv0 = MIX(tl_uv, t_uv, l_uv, uv); \
const uint32_t uv1 = MIX(t_uv, tl_uv, uv, l_uv); \
FUNC(cur_y[2*x-1], uv0 & 0xff, (uv0 >> 16), dst + (2*x-1) * XSTEP); \
FUNC(cur_y[2*x ], uv1 & 0xff, (uv1 >> 16), dst + (2*x ) * XSTEP); \
tl_uv = t_uv; \
l_uv = uv; \
} \
if (len & 1) { \
uv0 = MIX(tl_uv, tl_uv, l_uv, l_uv); \
FUNC(cur_y[len], uv0 & 0xff, (uv0 >> 16), dst + len * XSTEP); \
} \
} \
// Generates FUNC_NAME(), which upsamples chroma for up to two output rows at
// once and converts each pixel with FUNC(y, u, v, dst).
//   top_y, bottom_y     : luma rows; either may be NULL, in which case that
//                         output row is skipped and its dst is never touched.
//   top_u/v, cur_u/v    : the two chroma rows being blended. The top output
//                         row weights the top_* samples most heavily, the
//                         bottom output row weights the cur_* samples most.
//   top_dst, bottom_dst : output rows; pixel i is written at dst + i * XSTEP,
//                         so XSTEP is the number of bytes per output pixel.
//   len                 : number of luma samples (output pixels) per row.
// u and v are processed together, packed by LOAD_UV into the low and high
// 16 bits of one 32-bit word; FUNC receives them as (uv & 0xff, uv >> 16).
// The first pixel, and the last pixel when len is even, only have a single
// chroma column available and use the vertical blend (3 * near + far + 2) >> 2.
// Every other pixel pair uses the 9/3/3/1 (over 16) weights: the sum of the
// four neighbouring samples (plus rounding) is shared, twice one diagonal is
// added to it, and the result is then averaged with the nearest sample.
#define UPSCALE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static inline void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int x; \
const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \
if (top_y) { \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
} \
if (bottom_y) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \
} \
for (x = 1; x <= last_pixel_pair; ++x) { \
const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \
const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \
/* precompute invariant values associated with first and second diagonals*/\
const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \
const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \
const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \
if (top_y) { \
const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (2 * x - 1) * XSTEP); \
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
top_dst + (2 * x - 0) * XSTEP); \
} \
if (bottom_y) { \
const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
const uint32_t uv1 = (diag_12 + uv) >> 1; \
FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (2 * x - 1) * XSTEP); \
FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
bottom_dst + (2 * x + 0) * XSTEP); \
} \
tl_uv = t_uv; \
l_uv = uv; \
} \
if (!(len & 1)) { \
if (top_y) { \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (len - 1) * XSTEP); \
} \
if (bottom_y) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (len - 1) * XSTEP); \
} \
} \
}
// All variants implemented.
UPSCALE_FUNC(UpscaleEvenRgb, MIX_EVEN, VP8YuvToRgb, 3)
UPSCALE_FUNC(UpscaleOddRgb, MIX_ODD, VP8YuvToRgb, 3)
UPSCALE_FUNC(UpscaleEvenBgr, MIX_EVEN, VP8YuvToBgr, 3)
UPSCALE_FUNC(UpscaleOddBgr, MIX_ODD, VP8YuvToBgr, 3)
UPSCALE_FUNC(UpscaleEvenRgba, MIX_EVEN, VP8YuvToRgba, 4)
UPSCALE_FUNC(UpscaleOddRgba, MIX_ODD, VP8YuvToRgba, 4)
UPSCALE_FUNC(UpscaleEvenBgra, MIX_EVEN, VP8YuvToBgra, 4)
UPSCALE_FUNC(UpscaleOddBgra, MIX_ODD, VP8YuvToBgra, 4)
// One line-pair upscaler per output colorspace. The last argument is the
// number of bytes written per output pixel (3 for RGB/BGR, 4 for RGBA/BGRA).
UPSCALE_FUNC(UpscaleRgbLinePair, VP8YuvToRgb, 3)
UPSCALE_FUNC(UpscaleBgrLinePair, VP8YuvToBgr, 3)
UPSCALE_FUNC(UpscaleRgbaLinePair, VP8YuvToRgba, 4)
UPSCALE_FUNC(UpscaleBgraLinePair, VP8YuvToBgra, 4)
// Main driver function.
static inline void UpscaleLine(const uint8_t* cur_y,
const uint8_t* cur_u, const uint8_t* cur_v,
const uint8_t* top_u, const uint8_t* top_v,
int len, uint8_t* dst, int odd, CSP_MODE mode) {
if (odd) {
if (mode == MODE_RGB) {
UpscaleOddRgb(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
} else if (mode == MODE_BGR) {
UpscaleOddBgr(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
} else if (mode == MODE_RGBA) {
UpscaleOddRgba(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
} else {
UpscaleOddBgra(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
}
static inline
void UpscaleLinePair(const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* top_u, const uint8_t* top_v,
const uint8_t* cur_u, const uint8_t* cur_v,
uint8_t* top_dst, uint8_t* bottom_dst, int len,
CSP_MODE mode) {
if (mode == MODE_RGB) {
UpscaleRgbLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
top_dst, bottom_dst, len);
} else if (mode == MODE_BGR) {
UpscaleBgrLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
top_dst, bottom_dst, len);
} else if (mode == MODE_RGBA) {
UpscaleRgbaLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
top_dst, bottom_dst, len);
} else {
if (mode == MODE_RGB) {
UpscaleEvenRgb(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
} else if (mode == MODE_BGR) {
UpscaleEvenBgr(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
} else if (mode == MODE_RGBA) {
UpscaleEvenRgba(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
} else {
UpscaleEvenBgra(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
}
assert(mode == MODE_BGRA);
UpscaleBgraLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
top_dst, bottom_dst, len);
}
}
#undef LOAD_UV
#undef UPSCALE_FUNC
#undef MIX_ODD
#undef MIX_EVEN
#endif // FANCY_UPSCALING
@ -196,65 +207,53 @@ static void CustomPut(const VP8Io* io) {
uint8_t* dst = p->output + io->mb_y * p->stride;
if (io->fancy_upscaling) {
#ifdef FANCY_UPSCALING
const uint8_t* cur_y;
const uint8_t* cur_y = io->y;
const uint8_t* cur_u = io->u;
const uint8_t* cur_v = io->v;
const uint8_t* top_u = p->top_u;
const uint8_t* top_v = p->top_v;
int y = io->mb_y;
int y_end = io->mb_y + io->mb_h - 1;
if (y > 0) {
// If mid-fly, we need to finish the previous line.
cur_y = p->top_y;
dst -= p->stride;
y -= 1;
int y_end = io->mb_y + io->mb_h;
if (y == 0) {
// First line is special cased. We mirror the u/v samples at boundary.
UpscaleLinePair(NULL, cur_y, cur_u, cur_v, cur_u, cur_v,
NULL, dst, w, p->mode);
} else {
// else we "replicate" the u/v sample of the first line
// We can finish the left-over line from previous call
UpscaleLinePair(p->top_y, cur_y, top_u, top_v, cur_u, cur_v,
dst - p->stride, dst, w, p->mode);
}
// Loop over each pair of output rows.
for (; y + 2 < y_end; y += 2) {
top_u = cur_u;
top_v = cur_v;
// and start with the top line
cur_y = io->y;
cur_u += io->uv_stride;
cur_v += io->uv_stride;
dst += 2 * p->stride;
cur_y += 2 * io->y_stride;
UpscaleLinePair(cur_y - io->y_stride, cur_y,
top_u, top_v, cur_u, cur_v,
dst - p->stride, dst, w, p->mode);
}
if (y_end >= io->height - 1) {
// for the very last rows, we can process them right now
y_end = io->height;
} else {
// we won't process the very last line this time,
// waiting for the next call instead.
}
// Loop over each output row.
for (; y < y_end; ++y) {
if (y & 1) { // odd lines
UpscaleLine(cur_y, cur_u, cur_v, top_u, top_v, w, dst, 1, p->mode);
} else { // even lines
UpscaleLine(cur_y, cur_u, cur_v, top_u, top_v, w, dst, 0, p->mode);
top_u = cur_u;
top_v = cur_v;
if (y < io->height - 2) {
cur_u += io->uv_stride;
cur_v += io->uv_stride;
}
}
dst += p->stride;
if (cur_y == p->top_y) {
cur_y = io->y;
} else {
cur_y += io->y_stride;
}
}
// Save the unfinished samples for next call (if we're not done yet).
if (y < io->height - 1) {
// move to last row
cur_y += io->y_stride;
if (y_end != io->height) {
// Save the unfinished samples for next call (as we're not done yet).
memcpy(p->top_y, cur_y, w * sizeof(*p->top_y));
memcpy(p->top_u, top_u, uv_w * sizeof(*p->top_u));
memcpy(p->top_v, top_v, uv_w * sizeof(*p->top_v));
memcpy(p->top_u, cur_u, uv_w * sizeof(*p->top_u));
memcpy(p->top_v, cur_v, uv_w * sizeof(*p->top_v));
} else {
// Process the very last row of even-sized picture
if (!(y_end & 1)) {
UpscaleLinePair(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
dst + p->stride, NULL, w, p->mode);
}
}
#else
assert(0); // shouldn't happen.
#endif
} else {
// Point-sampling U/V upscaler.
// Could be implemented with special MIX functions, too.
int j;
for (j = 0; j < mb_h; ++j) {
const uint8_t* y_src = io->y + j * io->y_stride;