speed-up fancy upscaler

by processing two rows at a time.
The [9 3 3 1] weights are decomposed as
[1 1 1 1] + [0 2 2 0] + [8 0 0 0] for better
reuse of sub-expressions, too.

Change-Id: I87ab549048ed249d38add73bb3241dfa0c583328
This commit is contained in:
Pascal Massimino 2010-12-14 13:52:07 -08:00
parent 9145f3bc93
commit d72180a489

View File

@ -76,87 +76,98 @@ typedef enum { MODE_RGB = 0, MODE_RGBA = 1,
// we interpolate u/v as: // we interpolate u/v as:
// ([9*a + 3*b + 3*c + d 3*a + 9*b + 3*c + d] + [8 8]) / 16 // ([9*a + 3*b + 3*c + d 3*a + 9*b + 3*c + d] + [8 8]) / 16
// ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16 // ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16
#define MIX_ODD(a, b, c, d) \
((9 * (a) + 3 * ((b) + (c)) + (d) + 0x00080008u) >> 4)
#define MIX_EVEN(a, b, c, d) \
((9 * (c) + 3 * ((d) + (a)) + (b) + 0x00080008u) >> 4)
// We process u and v together stashed into 32bit (16bit each). // We process u and v together stashed into 32bit (16bit each).
// Note that we could store the pair (3*t_uv + uv, t_uv + 3*uv)
// instead of (t_uv, uv), into a 64bit variable. Doing so, we could
// simplify the MIXing a bit and save two multiplies. TODO(skal).
#define LOAD_UV(u,v) ((u) | ((v) << 16)) #define LOAD_UV(u,v) ((u) | ((v) << 16))
// Macro festival, so we can define all of rgb/bgr/rgba/bgra cases #define UPSCALE_FUNC(FUNC_NAME, FUNC, XSTEP) \
// for odd and even lines static inline void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
#define UPSCALE_FUNC(FUNC_NAME, MIX, FUNC, XSTEP) \ const uint8_t* top_u, const uint8_t* top_v, \
static void FUNC_NAME(const uint8_t* cur_y, \ const uint8_t* cur_u, const uint8_t* cur_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \ uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
const uint8_t* top_u, const uint8_t* top_v, \ int x; \
int len, uint8_t* dst) { \ const int last_pixel_pair = (len - 1) >> 1; \
int x; \ uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \
uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \ if (top_y) { \
uint32_t uv0 = MIX(tl_uv, tl_uv, l_uv, l_uv); \ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(cur_y[0], uv0 & 0xff, (uv0 >> 16), dst); \ FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
len -= 1; /* first pixel is done. */ \ } \
for (x = 1; x <= (len >> 1); ++x) { \ if (bottom_y) { \
const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \ const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \ FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \
const uint32_t uv0 = MIX(tl_uv, t_uv, l_uv, uv); \ } \
const uint32_t uv1 = MIX(t_uv, tl_uv, uv, l_uv); \ for (x = 1; x <= last_pixel_pair; ++x) { \
FUNC(cur_y[2*x-1], uv0 & 0xff, (uv0 >> 16), dst + (2*x-1) * XSTEP); \ const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \
FUNC(cur_y[2*x ], uv1 & 0xff, (uv1 >> 16), dst + (2*x ) * XSTEP); \ const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \
tl_uv = t_uv; \ /* precompute invariant values associated with first and second diagonals*/\
l_uv = uv; \ const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \
} \ const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \
if (len & 1) { \ const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \
uv0 = MIX(tl_uv, tl_uv, l_uv, l_uv); \ if (top_y) { \
FUNC(cur_y[len], uv0 & 0xff, (uv0 >> 16), dst + len * XSTEP); \ const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
} \ const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
} \ FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (2 * x - 1) * XSTEP); \
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
top_dst + (2 * x - 0) * XSTEP); \
} \
if (bottom_y) { \
const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
const uint32_t uv1 = (diag_12 + uv) >> 1; \
FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (2 * x - 1) * XSTEP); \
FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
bottom_dst + (2 * x + 0) * XSTEP); \
} \
tl_uv = t_uv; \
l_uv = uv; \
} \
if (!(len & 1)) { \
if (top_y) { \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (len - 1) * XSTEP); \
} \
if (bottom_y) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (len - 1) * XSTEP); \
} \
} \
}
// All variants implemented. // All variants implemented.
UPSCALE_FUNC(UpscaleEvenRgb, MIX_EVEN, VP8YuvToRgb, 3) UPSCALE_FUNC(UpscaleRgbLinePair, VP8YuvToRgb, 3)
UPSCALE_FUNC(UpscaleOddRgb, MIX_ODD, VP8YuvToRgb, 3) UPSCALE_FUNC(UpscaleBgrLinePair, VP8YuvToBgr, 3)
UPSCALE_FUNC(UpscaleEvenBgr, MIX_EVEN, VP8YuvToBgr, 3) UPSCALE_FUNC(UpscaleRgbaLinePair, VP8YuvToRgba, 4)
UPSCALE_FUNC(UpscaleOddBgr, MIX_ODD, VP8YuvToBgr, 3) UPSCALE_FUNC(UpscaleBgraLinePair, VP8YuvToBgra, 4)
UPSCALE_FUNC(UpscaleEvenRgba, MIX_EVEN, VP8YuvToRgba, 4)
UPSCALE_FUNC(UpscaleOddRgba, MIX_ODD, VP8YuvToRgba, 4)
UPSCALE_FUNC(UpscaleEvenBgra, MIX_EVEN, VP8YuvToBgra, 4)
UPSCALE_FUNC(UpscaleOddBgra, MIX_ODD, VP8YuvToBgra, 4)
// Main driver function. // Main driver function.
static inline void UpscaleLine(const uint8_t* cur_y, static inline
const uint8_t* cur_u, const uint8_t* cur_v, void UpscaleLinePair(const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* top_u, const uint8_t* top_v, const uint8_t* top_u, const uint8_t* top_v,
int len, uint8_t* dst, int odd, CSP_MODE mode) { const uint8_t* cur_u, const uint8_t* cur_v,
if (odd) { uint8_t* top_dst, uint8_t* bottom_dst, int len,
if (mode == MODE_RGB) { CSP_MODE mode) {
UpscaleOddRgb(cur_y, cur_u, cur_v, top_u, top_v, len, dst); if (mode == MODE_RGB) {
} else if (mode == MODE_BGR) { UpscaleRgbLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
UpscaleOddBgr(cur_y, cur_u, cur_v, top_u, top_v, len, dst); top_dst, bottom_dst, len);
} else if (mode == MODE_RGBA) { } else if (mode == MODE_BGR) {
UpscaleOddRgba(cur_y, cur_u, cur_v, top_u, top_v, len, dst); UpscaleBgrLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
} else { top_dst, bottom_dst, len);
UpscaleOddBgra(cur_y, cur_u, cur_v, top_u, top_v, len, dst); } else if (mode == MODE_RGBA) {
} UpscaleRgbaLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
top_dst, bottom_dst, len);
} else { } else {
if (mode == MODE_RGB) { assert(mode == MODE_BGRA);
UpscaleEvenRgb(cur_y, cur_u, cur_v, top_u, top_v, len, dst); UpscaleBgraLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
} else if (mode == MODE_BGR) { top_dst, bottom_dst, len);
UpscaleEvenBgr(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
} else if (mode == MODE_RGBA) {
UpscaleEvenRgba(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
} else {
UpscaleEvenBgra(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
}
} }
} }
#undef LOAD_UV #undef LOAD_UV
#undef UPSCALE_FUNC #undef UPSCALE_FUNC
#undef MIX_ODD
#undef MIX_EVEN
#endif // FANCY_UPSCALING #endif // FANCY_UPSCALING
@ -196,65 +207,53 @@ static void CustomPut(const VP8Io* io) {
uint8_t* dst = p->output + io->mb_y * p->stride; uint8_t* dst = p->output + io->mb_y * p->stride;
if (io->fancy_upscaling) { if (io->fancy_upscaling) {
#ifdef FANCY_UPSCALING #ifdef FANCY_UPSCALING
const uint8_t* cur_y; const uint8_t* cur_y = io->y;
const uint8_t* cur_u = io->u; const uint8_t* cur_u = io->u;
const uint8_t* cur_v = io->v; const uint8_t* cur_v = io->v;
const uint8_t* top_u = p->top_u; const uint8_t* top_u = p->top_u;
const uint8_t* top_v = p->top_v; const uint8_t* top_v = p->top_v;
int y = io->mb_y; int y = io->mb_y;
int y_end = io->mb_y + io->mb_h - 1; int y_end = io->mb_y + io->mb_h;
if (y > 0) { if (y == 0) {
// If mid-fly, we need to finish the previous line. // First line is special cased. We mirror the u/v samples at boundary.
cur_y = p->top_y; UpscaleLinePair(NULL, cur_y, cur_u, cur_v, cur_u, cur_v,
dst -= p->stride; NULL, dst, w, p->mode);
y -= 1;
} else { } else {
// else we "replicate" the u/v sample of the first line // We can finish the left-over line from previous call
UpscaleLinePair(p->top_y, cur_y, top_u, top_v, cur_u, cur_v,
dst - p->stride, dst, w, p->mode);
}
// Loop over each output pairs of row.
for (; y + 2 < y_end; y += 2) {
top_u = cur_u; top_u = cur_u;
top_v = cur_v; top_v = cur_v;
// and start with the top line cur_u += io->uv_stride;
cur_y = io->y; cur_v += io->uv_stride;
dst += 2 * p->stride;
cur_y += 2 * io->y_stride;
UpscaleLinePair(cur_y - io->y_stride, cur_y,
top_u, top_v, cur_u, cur_v,
dst - p->stride, dst, w, p->mode);
} }
if (y_end >= io->height - 1) { // move to last row
// for the very last rows, we can process them right now cur_y += io->y_stride;
y_end = io->height; if (y_end != io->height) {
} else { // Save the unfinished samples for next call (as we're not done yet).
// we won't process the very last line this time,
// waiting for the next call instead.
}
// Loop over each output row.
for (; y < y_end; ++y) {
if (y & 1) { // odd lines
UpscaleLine(cur_y, cur_u, cur_v, top_u, top_v, w, dst, 1, p->mode);
} else { // even lines
UpscaleLine(cur_y, cur_u, cur_v, top_u, top_v, w, dst, 0, p->mode);
top_u = cur_u;
top_v = cur_v;
if (y < io->height - 2) {
cur_u += io->uv_stride;
cur_v += io->uv_stride;
}
}
dst += p->stride;
if (cur_y == p->top_y) {
cur_y = io->y;
} else {
cur_y += io->y_stride;
}
}
// Save the unfinished samples for next call (if we're not done yet).
if (y < io->height - 1) {
memcpy(p->top_y, cur_y, w * sizeof(*p->top_y)); memcpy(p->top_y, cur_y, w * sizeof(*p->top_y));
memcpy(p->top_u, top_u, uv_w * sizeof(*p->top_u)); memcpy(p->top_u, cur_u, uv_w * sizeof(*p->top_u));
memcpy(p->top_v, top_v, uv_w * sizeof(*p->top_v)); memcpy(p->top_v, cur_v, uv_w * sizeof(*p->top_v));
} else {
// Process the very last row of even-sized picture
if (!(y_end & 1)) {
UpscaleLinePair(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
dst + p->stride, NULL, w, p->mode);
}
} }
#else #else
assert(0); // shouldn't happen. assert(0); // shouldn't happen.
#endif #endif
} else { } else {
// Point-sampling U/V upscaler. // Point-sampling U/V upscaler.
// Could be implemented with special MIX functions, too.
int j; int j;
for (j = 0; j < mb_h; ++j) { for (j = 0; j < mb_h; ++j) {
const uint8_t* y_src = io->y + j * io->y_stride; const uint8_t* y_src = io->y + j * io->y_stride;