From 5ff0079ece626f122bfb8e33a5f92b5a68484176 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Fri, 18 Sep 2015 10:45:03 +0200 Subject: [PATCH] fix rescaler vertical interpolation * vertical expansion now uses bilinear interpolation * heavily assumes that the alpha plane is decoded in full, not row-by-row * split the RescalerExportRow and RescalerImportRow methods into Shrink and Expand variants. * MIPS implementation of ExportRowExpand is missing. There's room for extra speed optim and code re-org, but let's keep that for later patches. addresses https://code.google.com/p/webp/issues/detail?id=254 Change-Id: I8f12b855342bf07dd467fe85e4fde5fd814effdb --- src/dec/io.c | 104 ++++++----- src/dec/vp8l.c | 8 +- src/dec/webpi.h | 7 +- src/dsp/dsp.h | 30 +++- src/dsp/rescaler.c | 199 ++++++++++++++------- src/dsp/rescaler_mips32.c | 309 +++++++++++++++++---------------- src/dsp/rescaler_mips_dsp_r2.c | 200 +++++++++++---------- src/enc/picture_rescale.c | 8 +- src/utils/rescaler.c | 41 +++-- src/utils/rescaler.h | 35 ++-- 10 files changed, 553 insertions(+), 388 deletions(-) diff --git a/src/dec/io.c b/src/dec/io.c index 099aa0b3..e73cd54a 100644 --- a/src/dec/io.c +++ b/src/dec/io.c @@ -119,14 +119,16 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) { //------------------------------------------------------------------------------ -static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) { +static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p, + int expected_num_lines_out) { const uint8_t* alpha = io->a; const WebPYUVABuffer* const buf = &p->output->u.YUVA; const int mb_w = io->mb_w; const int mb_h = io->mb_h; uint8_t* dst = buf->a + io->mb_y * buf->a_stride; int j; - + (void)expected_num_lines_out; + assert(expected_num_lines_out == mb_h); if (alpha != NULL) { for (j = 0; j < mb_h; ++j) { memcpy(dst, alpha, mb_w * sizeof(*dst)); @@ -169,7 +171,8 @@ static int GetAlphaSourceRow(const VP8Io* const io, return 
start_y; } -static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { +static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p, + int expected_num_lines_out) { const uint8_t* alpha = io->a; if (alpha != NULL) { const int mb_w = io->mb_w; @@ -183,7 +186,8 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3); const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w, num_rows, dst, buf->stride); - + (void)expected_num_lines_out; + assert(expected_num_lines_out == num_rows); // has_alpha is true if there's non-trivial alpha to premultiply with. if (has_alpha && WebPIsPremultipliedMode(colorspace)) { WebPApplyAlphaMultiply(base_rgba, alpha_first, @@ -193,7 +197,8 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { return 0; } -static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) { +static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p, + int expected_num_lines_out) { const uint8_t* alpha = io->a; if (alpha != NULL) { const int mb_w = io->mb_w; @@ -209,7 +214,6 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) { #endif uint32_t alpha_mask = 0x0f; int i, j; - for (j = 0; j < num_rows; ++j) { for (i = 0; i < mb_w; ++i) { // Fill in the alpha value (converted to 4 bits). 
@@ -220,6 +224,8 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) { alpha += io->width; alpha_dst += buf->stride; } + (void)expected_num_lines_out; + assert(expected_num_lines_out == num_rows); if (alpha_mask != 0x0f && WebPIsPremultipliedMode(colorspace)) { WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride); } @@ -261,12 +267,15 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) { return num_lines_out; } -static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) { +static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p, + int expected_num_lines_out) { if (io->a != NULL) { const WebPYUVABuffer* const buf = &p->output->u.YUVA; uint8_t* dst_y = buf->y + p->last_y * buf->y_stride; const uint8_t* src_a = buf->a + p->last_y * buf->a_stride; const int num_lines_out = Rescale(io->a, io->width, io->mb_h, &p->scaler_a); + (void)expected_num_lines_out; + assert(expected_num_lines_out == num_lines_out); if (num_lines_out > 0) { // unmultiply the Y WebPMultRows(dst_y, buf->y_stride, src_a, buf->a_stride, p->scaler_a.dst_width, num_lines_out, 1); @@ -287,7 +296,7 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) { const size_t work_size = 2 * out_width; // scratch memory for luma rescaler const size_t uv_work_size = 2 * uv_out_width; // and for each u/v ones size_t tmp_size; - int32_t* work; + rescaler_t* work; tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work); if (has_alpha) { @@ -297,7 +306,7 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) { if (p->memory == NULL) { return 0; // memory error } - work = (int32_t*)p->memory; + work = (rescaler_t*)p->memory; WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h, buf->y, out_width, out_height, buf->y_stride, 1, work); @@ -326,17 +335,17 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) { const WebPYUV444Converter convert = 
WebPYUV444Converters[p->output->colorspace]; const WebPRGBABuffer* const buf = &p->output->u.RGBA; - uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride; + uint8_t* dst = buf->rgba + y_pos * buf->stride; int num_lines_out = 0; // For RGB rescaling, because of the YUV420, current scan position // U/V can be +1/-1 line from the Y one. Hence the double test. while (WebPRescalerHasPendingOutput(&p->scaler_y) && WebPRescalerHasPendingOutput(&p->scaler_u)) { - assert(p->last_y + y_pos + num_lines_out < p->output->height); + assert(y_pos + num_lines_out < p->output->height); assert(p->scaler_u.y_accum == p->scaler_v.y_accum); - WebPRescalerExportRow(&p->scaler_y, 0); - WebPRescalerExportRow(&p->scaler_u, 0); - WebPRescalerExportRow(&p->scaler_v, 0); + WebPRescalerExportRow(&p->scaler_y); + WebPRescalerExportRow(&p->scaler_u); + WebPRescalerExportRow(&p->scaler_v); convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst, dst, p->scaler_y.dst_width); dst += buf->stride; @@ -354,24 +363,26 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) { const int y_lines_in = WebPRescalerImport(&p->scaler_y, mb_h - j, io->y + j * io->y_stride, io->y_stride); - const int u_lines_in = - WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j, - io->u + uv_j * io->uv_stride, io->uv_stride); - const int v_lines_in = - WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j, - io->v + uv_j * io->uv_stride, io->uv_stride); - (void)v_lines_in; // remove a gcc warning - assert(u_lines_in == v_lines_in); j += y_lines_in; - uv_j += u_lines_in; - num_lines_out += ExportRGB(p, num_lines_out); + if (WebPRescaleNeededLines(&p->scaler_u, uv_mb_h - uv_j)) { + const int u_lines_in = + WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j, + io->u + uv_j * io->uv_stride, io->uv_stride); + const int v_lines_in = + WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j, + io->v + uv_j * io->uv_stride, io->uv_stride); + (void)v_lines_in; // remove a gcc warning + assert(u_lines_in == v_lines_in); + 
uv_j += u_lines_in; + } + num_lines_out += ExportRGB(p, p->last_y + num_lines_out); } return num_lines_out; } -static int ExportAlpha(WebPDecParams* const p, int y_pos) { +static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) { const WebPRGBABuffer* const buf = &p->output->u.RGBA; - uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride; + uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride; const WEBP_CSP_MODE colorspace = p->output->colorspace; const int alpha_first = (colorspace == MODE_ARGB || colorspace == MODE_Argb); @@ -381,9 +392,10 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) { uint32_t non_opaque = 0; const int width = p->scaler_a.dst_width; - while (WebPRescalerHasPendingOutput(&p->scaler_a)) { - assert(p->last_y + y_pos + num_lines_out < p->output->height); - WebPRescalerExportRow(&p->scaler_a, 0); + while (WebPRescalerHasPendingOutput(&p->scaler_a) && + num_lines_out < max_lines_out) { + assert(y_pos + num_lines_out < p->output->height); + WebPRescalerExportRow(&p->scaler_a); non_opaque |= WebPDispatchAlpha(p->scaler_a.dst, 0, width, 1, dst, 0); dst += buf->stride; ++num_lines_out; @@ -395,9 +407,10 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) { return num_lines_out; } -static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) { +static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos, + int max_lines_out) { const WebPRGBABuffer* const buf = &p->output->u.RGBA; - uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride; + uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride; #ifdef WEBP_SWAP_16BIT_CSP uint8_t* alpha_dst = base_rgba; #else @@ -409,10 +422,11 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) { const int is_premult_alpha = WebPIsPremultipliedMode(colorspace); uint32_t alpha_mask = 0x0f; - while (WebPRescalerHasPendingOutput(&p->scaler_a)) { + while (WebPRescalerHasPendingOutput(&p->scaler_a) && + 
num_lines_out < max_lines_out) { int i; - assert(p->last_y + y_pos + num_lines_out < p->output->height); - WebPRescalerExportRow(&p->scaler_a, 0); + assert(y_pos + num_lines_out < p->output->height); + WebPRescalerExportRow(&p->scaler_a); for (i = 0; i < width; ++i) { // Fill in the alpha value (converted to 4 bits). const uint32_t alpha_value = p->scaler_a.dst[i] >> 4; @@ -428,15 +442,17 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) { return num_lines_out; } -static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { +static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p, + int expected_num_out_lines) { if (io->a != NULL) { WebPRescaler* const scaler = &p->scaler_a; - int j = 0; - int pos = 0; - while (j < io->mb_h) { - j += WebPRescalerImport(scaler, io->mb_h - j, - io->a + j * io->width, io->width); - pos += p->emit_alpha_row(p, pos); + int lines_left = expected_num_out_lines; + const int y_end = p->last_y + lines_left; + while (lines_left > 0) { + const int row_offset = scaler->src_y - io->mb_y; + WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y, + io->a + row_offset * io->width, io->width); + lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left); } } return 0; @@ -449,7 +465,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) { const int uv_in_width = (io->mb_w + 1) >> 1; const int uv_in_height = (io->mb_h + 1) >> 1; const size_t work_size = 2 * out_width; // scratch memory for one rescaler - int32_t* work; // rescalers work area + rescaler_t* work; // rescalers work area uint8_t* tmp; // tmp storage for scaled YUV444 samples before RGB conversion size_t tmp_size1, tmp_size2, total_size; @@ -464,7 +480,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) { if (p->memory == NULL) { return 0; // memory error } - work = (int32_t*)p->memory; + work = (rescaler_t*)p->memory; tmp = (uint8_t*)(work + tmp_size1); 
WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h, tmp + 0 * out_width, out_width, out_height, 0, 1, @@ -572,7 +588,7 @@ static int CustomPut(const VP8Io* io) { } num_lines_out = p->emit(io, p); if (p->emit_alpha != NULL) { - p->emit_alpha(io, p); + p->emit_alpha(io, p, num_lines_out); } p->last_y += num_lines_out; return 1; diff --git a/src/dec/vp8l.c b/src/dec/vp8l.c index a80661d8..7294255d 100644 --- a/src/dec/vp8l.c +++ b/src/dec/vp8l.c @@ -420,7 +420,7 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) { const int in_height = io->mb_h; const int out_height = io->scaled_height; const uint64_t work_size = 2 * num_channels * (uint64_t)out_width; - int32_t* work; // Rescaler work area. + rescaler_t* work; // Rescaler work area. const uint64_t scaled_data_size = (uint64_t)out_width; uint32_t* scaled_data; // Temporary storage for scaled BGRA data. const uint64_t memory_size = sizeof(*dec->rescaler) + @@ -436,7 +436,7 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) { dec->rescaler = (WebPRescaler*)memory; memory += sizeof(*dec->rescaler); - work = (int32_t*)memory; + work = (rescaler_t*)memory; memory += work_size * sizeof(*work); scaled_data = (uint32_t*)memory; @@ -456,7 +456,7 @@ static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace, int num_lines_out = 0; while (WebPRescalerHasPendingOutput(rescaler)) { uint8_t* const dst = rgba + num_lines_out * rgba_stride; - WebPRescalerExportRow(rescaler, 0); + WebPRescalerExportRow(rescaler); WebPMultARGBRow(src, dst_width, 1); VP8LConvertFromBGRA(src, dst_width, colorspace, dst); ++num_lines_out; @@ -574,7 +574,7 @@ static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) { const int dst_width = rescaler->dst_width; int num_lines_out = 0; while (WebPRescalerHasPendingOutput(rescaler)) { - WebPRescalerExportRow(rescaler, 0); + WebPRescalerExportRow(rescaler); WebPMultARGBRow(src, dst_width, 1); ConvertToYUVA(src, dst_width, y_pos, 
dec->output_); ++y_pos; diff --git a/src/dec/webpi.h b/src/dec/webpi.h index 457c72ed..c75a2e4a 100644 --- a/src/dec/webpi.h +++ b/src/dec/webpi.h @@ -26,7 +26,10 @@ extern "C" { typedef struct WebPDecParams WebPDecParams; typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p); -typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos); +typedef int (*OutputAlphaFunc)(const VP8Io* const io, WebPDecParams* const p, + int expected_num_out_lines); +typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos, + int max_out_lines); struct WebPDecParams { WebPDecBuffer* output; // output buffer. @@ -40,7 +43,7 @@ struct WebPDecParams { void* memory; // overall scratch memory for the output work. OutputFunc emit; // output RGB or YUV samples - OutputFunc emit_alpha; // output alpha channel + OutputAlphaFunc emit_alpha; // output alpha channel OutputRowFunc emit_alpha_row; // output one line of rescaled alpha values }; diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 84423b17..02e74903 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -335,15 +335,35 @@ void WebPInitYUV444Converters(void); struct WebPRescaler; // Import a row of data and save its contribution in the rescaler. -// 'channel' denotes the channel number to be imported. -extern void (*WebPRescalerImportRow)(struct WebPRescaler* const wrk, - const uint8_t* const src, int channel); +// 'channel' denotes the channel number to be imported. 'Expand' corresponds to +// the wrk->x_expand case. Otherwise, 'Shrink' is to be used. +typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk, + const uint8_t* const src, + int channel); + +extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand; +extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink; // Export one row (starting at x_out position) from rescaler. -extern void (*WebPRescalerExportRow)(struct WebPRescaler* const wrk, int x_out); +// 'Expand' corresponds to the wrk->y_expand case. 
+// Otherwise 'Shrink' is to be used +typedef void (*WebPRescalerExportRowFunc)(struct WebPRescaler* const wrk); +extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand; +extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink; // Plain-C implementation, as fall-back. -extern void WebPRescalerExportRowC(struct WebPRescaler* const wrk, int x_out); +extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk, + const uint8_t* const src, int channel); +extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk, + const uint8_t* const src, int channel); +extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk); +extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk); + +// Main entry calls: +extern void WebPRescalerImportRow(struct WebPRescaler* const wrk, + const uint8_t* const src, int channel); +// Export one row from rescaler. +extern void WebPRescalerExportRow(struct WebPRescaler* const wrk); // Must be called first before using the above. 
void WebPRescalerDspInit(void); diff --git a/src/dsp/rescaler.c b/src/dsp/rescaler.c index d987db4c..f9ecaecd 100644 --- a/src/dsp/rescaler.c +++ b/src/dsp/rescaler.c @@ -17,78 +17,125 @@ //------------------------------------------------------------------------------ // Implementations of critical functions ImportRow / ExportRow -#define ROUNDER (1 << (WEBP_RESCALER_RFIX - 1)) -#define MULT_FIX(x, y) (((int64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define ROUNDER (WEBP_RESCALER_ONE >> 1) +#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) -static void RescalerImportRowC(WebPRescaler* const wrk, - const uint8_t* const src, int channel) { +//------------------------------------------------------------------------------ +// Row import + +void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, + const uint8_t* const src, int channel) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; int x_in = channel; int x_out; - if (!wrk->x_expand) { - uint32_t sum = 0; - int accum = 0; - for (x_out = channel; x_out < x_out_max; x_out += x_stride) { - uint32_t base = 0; + // simple bilinear interpolation + int accum = wrk->x_add; + int left = src[x_in]; + int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left; + x_in += x_stride; + x_out = channel; + + assert(!WebPRescalerInputDone(wrk)); + assert(wrk->x_expand); + while (1) { + wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum; + x_out += x_stride; + if (x_out >= x_out_max) break; + accum -= wrk->x_sub; + if (accum < 0) { + left = right; + x_in += x_stride; + assert(x_in < wrk->src_width * x_stride); + right = src[x_in]; accum += wrk->x_add; - while (accum > 0) { - accum -= wrk->x_sub; - assert(x_in < wrk->src_width * x_stride); - base = src[x_in]; - sum += base; - x_in += x_stride; - } - { // Emit next horizontal pixel. 
- const int32_t frac = base * (-accum); - wrk->frow[x_out] = sum * wrk->x_sub - frac; - // fresh fractional start for next pixel - sum = (int)MULT_FIX(frac, wrk->fx_scale); - } } - assert(accum == 0); - } else { // simple bilinear interpolation - int accum = wrk->x_add; - int left = src[x_in]; - int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left; - x_in += x_stride; - x_out = channel; - while (1) { - wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum; - x_out += x_stride; - if (x_out >= x_out_max) break; - accum -= wrk->x_sub; - if (accum < 0) { - left = right; - x_in += x_stride; - assert(x_in < wrk->src_width * x_stride); - right = src[x_in]; - accum += wrk->x_add; - } - } - assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0); } - // Accumulate the contribution of the new row. + assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0); +} + +void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, + const uint8_t* const src, int channel) { + const int x_stride = wrk->num_channels; + const int x_out_max = wrk->dst_width * wrk->num_channels; + int x_in = channel; + int x_out; + uint32_t sum = 0; + int accum = 0; + + assert(!WebPRescalerInputDone(wrk)); + assert(!wrk->x_expand); for (x_out = channel; x_out < x_out_max; x_out += x_stride) { - wrk->irow[x_out] += wrk->frow[x_out]; + uint32_t base = 0; + accum += wrk->x_add; + while (accum > 0) { + accum -= wrk->x_sub; + assert(x_in < wrk->src_width * x_stride); + base = src[x_in]; + sum += base; + x_in += x_stride; + } + { // Emit next horizontal pixel. 
+ const rescaler_t frac = base * (-accum); + wrk->frow[x_out] = sum * wrk->x_sub - frac; + // fresh fractional start for next pixel + sum = (int)MULT_FIX(frac, wrk->fx_scale); + } + } + assert(accum == 0); +} + +//------------------------------------------------------------------------------ +// Row export + +void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) { + int x_out; + uint8_t* const dst = wrk->dst; + rescaler_t* const irow = wrk->irow; + const int x_out_max = wrk->dst_width * wrk->num_channels; + const rescaler_t* const frow = wrk->frow; + assert(!WebPRescalerOutputDone(wrk)); + assert(wrk->y_accum <= 0); + assert(wrk->y_expand); + if (wrk->y_accum == 0) { + for (x_out = 0; x_out < x_out_max; ++x_out) { + const int v = (int)MULT_FIX(frow[x_out], wrk->fy_scale); + dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255; + } + } else { + const int64_t A = wrk->y_sub + wrk->y_accum; + const int64_t B = -wrk->y_accum; + for (x_out = 0; x_out < x_out_max; ++x_out) { + const int64_t I = A * frow[x_out] + B * irow[x_out]; + const int v = (int)MULT_FIX(I, wrk->fxy_scale); + dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 
0 : 255; + } } } -void WebPRescalerExportRowC(WebPRescaler* const wrk, int x_out) { - if (wrk->y_accum <= 0) { - uint8_t* const dst = wrk->dst; - int32_t* const irow = wrk->irow; - const int32_t* const frow = wrk->frow; - const int yscale = wrk->fy_scale * (-wrk->y_accum); - const int x_out_max = wrk->dst_width * wrk->num_channels; - for (; x_out < x_out_max; ++x_out) { +void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) { + int x_out; + uint8_t* const dst = wrk->dst; + rescaler_t* const irow = wrk->irow; + const int x_out_max = wrk->dst_width * wrk->num_channels; + const rescaler_t* const frow = wrk->frow; + const int yscale = wrk->fy_scale * (-wrk->y_accum); + assert(!WebPRescalerOutputDone(wrk)); + assert(wrk->y_accum <= 0); + assert(!wrk->y_expand); + if (yscale) { + for (x_out = 0; x_out < x_out_max; ++x_out) { const int frac = (int)MULT_FIX(frow[x_out], yscale); const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255; irow[x_out] = frac; // new fractional start } - wrk->y_accum += wrk->y_add; - wrk->dst += wrk->dst_stride; + } else { + for (x_out = 0; x_out < x_out_max; ++x_out) { + const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale); + dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 
0 : 255; + irow[x_out] = 0; + } } } @@ -96,10 +143,39 @@ void WebPRescalerExportRowC(WebPRescaler* const wrk, int x_out) { #undef ROUNDER //------------------------------------------------------------------------------ +// Main entry calls -void (*WebPRescalerImportRow)(struct WebPRescaler* const wrk, - const uint8_t* const src, int channel); -void (*WebPRescalerExportRow)(struct WebPRescaler* const wrk, int x_out); +void WebPRescalerImportRow(WebPRescaler* const wrk, + const uint8_t* const src, int channel) { + assert(!WebPRescalerInputDone(wrk)); + if (!wrk->x_expand) { + WebPRescalerImportRowShrink(wrk, src, channel); + } else { + WebPRescalerImportRowExpand(wrk, src, channel); + } +} + +void WebPRescalerExportRow(WebPRescaler* const wrk) { + if (wrk->y_accum <= 0) { + assert(!WebPRescalerOutputDone(wrk)); + if (wrk->y_expand) { + WebPRescalerExportRowExpand(wrk); + } else { + WebPRescalerExportRowShrink(wrk); + } + wrk->y_accum += wrk->y_add; + wrk->dst += wrk->dst_stride; + ++wrk->dst_y; + } +} + +//------------------------------------------------------------------------------ + +WebPRescalerImportRowFunc WebPRescalerImportRowExpand; +WebPRescalerImportRowFunc WebPRescalerImportRowShrink; + +WebPRescalerExportRowFunc WebPRescalerExportRowExpand; +WebPRescalerExportRowFunc WebPRescalerExportRowShrink; extern void WebPRescalerDspInitMIPS32(void); extern void WebPRescalerDspInitMIPSdspR2(void); @@ -110,8 +186,11 @@ static volatile VP8CPUInfo rescaler_last_cpuinfo_used = WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) { if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return; - WebPRescalerImportRow = RescalerImportRowC; - WebPRescalerExportRow = WebPRescalerExportRowC; + WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC; + WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC; + WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC; + WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC; + if (VP8GetCPUInfo != NULL) { #if 
defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { diff --git a/src/dsp/rescaler_mips32.c b/src/dsp/rescaler_mips32.c index cb544046..4da2c1da 100644 --- a/src/dsp/rescaler_mips32.c +++ b/src/dsp/rescaler_mips32.c @@ -15,177 +15,183 @@ #if defined(WEBP_USE_MIPS32) +#include <assert.h> #include "../utils/rescaler.h" -static void ImportRow(WebPRescaler* const wrk, +static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* const src, int channel) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; const int fx_scale = wrk->fx_scale; const int x_add = wrk->x_add; const int x_sub = wrk->x_sub; - const int src_width = wrk->src_width; int* frow = wrk->frow + channel; - int* irow = wrk->irow + channel; const uint8_t* src1 = src + channel; - int temp1, temp2, temp3, temp4; + int temp1, temp2, temp3; int base, frac, sum; int accum, accum1; const int x_stride1 = x_stride << 2; int loop_c = x_out_max - channel; + + assert(!wrk->x_expand); + assert(!WebPRescalerInputDone(wrk)); + __asm__ volatile ( + "li %[temp1], 0x8000 \n\t" + "li %[temp2], 0x10000 \n\t" + "li %[sum], 0 \n\t" + "li %[accum], 0 \n\t" + "1: \n\t" + "addu %[accum], %[accum], %[x_add] \n\t" + "li %[base], 0 \n\t" + "blez %[accum], 3f \n\t" + "2: \n\t" + "lbu %[base], 0(%[src1]) \n\t" + "subu %[accum], %[accum], %[x_sub] \n\t" + "addu %[src1], %[src1], %[x_stride] \n\t" + "addu %[sum], %[sum], %[base] \n\t" + "bgtz %[accum], 2b \n\t" + "3: \n\t" + "negu %[accum1], %[accum] \n\t" + "mul %[frac], %[base], %[accum1] \n\t" + "mul %[temp3], %[sum], %[x_sub] \n\t" + "subu %[loop_c], %[loop_c], %[x_stride] \n\t" + "sll %[accum1], %[frac], 2 \n\t" + "mult %[temp1], %[temp2] \n\t" + "madd %[accum1], %[fx_scale] \n\t" + "mfhi %[sum] \n\t" + "subu %[temp3], %[temp3], %[frac] \n\t" + "sw %[temp3], 0(%[frow]) \n\t" + "addu %[frow], %[frow], %[x_stride1] \n\t" + "bgtz %[loop_c], 1b \n\t" + : [accum] "=&r" (accum), [src1] "+r" (src1), [temp3] "=&r" (temp3), + [sum] "=&r" (sum), 
[base] "=&r" (base), [frac] "=&r" (frac), + [frow] "+r" (frow), [accum1] "=&r" (accum1), + [temp2] "=&r" (temp2), [temp1] "=&r" (temp1) + : [x_stride] "r" (x_stride), [fx_scale] "r" (fx_scale), + [x_sub] "r" (x_sub), [x_add] "r" (x_add), + [loop_c] "r" (loop_c), [x_stride1] "r" (x_stride1) + : "memory", "hi", "lo" + ); +} + +static void ImportRowExpand(WebPRescaler* const wrk, + const uint8_t* const src, int channel) { + const int x_stride = wrk->num_channels; + const int x_out_max = wrk->dst_width * wrk->num_channels; + const int x_add = wrk->x_add; + const int x_sub = wrk->x_sub; + const int src_width = wrk->src_width; + int* frow = wrk->frow + channel; + const uint8_t* src1 = src + channel; + int temp1, temp2, temp3, temp4; + int frac; + int accum; + const int x_stride1 = x_stride << 2; int x_out = channel; - if (!wrk->x_expand) { - __asm__ volatile ( - "li %[temp1], 0x8000 \n\t" - "li %[temp2], 0x10000 \n\t" - "li %[sum], 0 \n\t" - "li %[accum], 0 \n\t" - "1: \n\t" - "addu %[accum], %[accum], %[x_add] \n\t" - "li %[base], 0 \n\t" - "blez %[accum], 3f \n\t" - "2: \n\t" - "lbu %[base], 0(%[src1]) \n\t" - "subu %[accum], %[accum], %[x_sub] \n\t" - "addu %[src1], %[src1], %[x_stride] \n\t" - "addu %[sum], %[sum], %[base] \n\t" - "bgtz %[accum], 2b \n\t" - "3: \n\t" - "negu %[accum1], %[accum] \n\t" - "mul %[frac], %[base], %[accum1] \n\t" - "mul %[temp3], %[sum], %[x_sub] \n\t" - "lw %[base], 0(%[irow]) \n\t" - "subu %[loop_c], %[loop_c], %[x_stride] \n\t" - "sll %[accum1], %[frac], 2 \n\t" - "mult %[temp1], %[temp2] \n\t" - "madd %[accum1], %[fx_scale] \n\t" - "mfhi %[sum] \n\t" - "subu %[temp3], %[temp3], %[frac] \n\t" - "sw %[temp3], 0(%[frow]) \n\t" - "addu %[base], %[base], %[temp3] \n\t" - "sw %[base], 0(%[irow]) \n\t" - "addu %[irow], %[irow], %[x_stride1] \n\t" - "addu %[frow], %[frow], %[x_stride1] \n\t" - "bgtz %[loop_c], 1b \n\t" - : [accum] "=&r" (accum), [src1] "+r" (src1), [temp3] "=&r" (temp3), - [sum] "=&r" (sum), [base] "=&r" (base), [frac] "=&r" 
(frac), - [frow] "+r" (frow), [irow] "+r" (irow), [accum1] "=&r" (accum1), - [temp2] "=&r" (temp2), [temp1] "=&r" (temp1) - : [x_stride] "r" (x_stride), [fx_scale] "r" (fx_scale), - [x_sub] "r" (x_sub), [x_add] "r" (x_add), - [loop_c] "r" (loop_c), [x_stride1] "r" (x_stride1) + assert(wrk->x_expand); + assert(!WebPRescalerInputDone(wrk)); + __asm__ volatile ( + "addiu %[temp3], %[src_width], -1 \n\t" + "lbu %[temp2], 0(%[src1]) \n\t" + "addu %[src1], %[src1], %[x_stride] \n\t" + "bgtz %[temp3], 0f \n\t" + "addiu %[temp1], %[temp2], 0 \n\t" + "b 3f \n\t" + "0: \n\t" + "lbu %[temp1], 0(%[src1]) \n\t" + "3: \n\t" + "addiu %[accum], %[x_add], 0 \n\t" + "1: \n\t" + "subu %[temp3], %[temp2], %[temp1] \n\t" + "mul %[temp3], %[temp3], %[accum] \n\t" + "mul %[temp4], %[temp1], %[x_add] \n\t" + "addu %[temp3], %[temp4], %[temp3] \n\t" + "sw %[temp3], 0(%[frow]) \n\t" + "addu %[frow], %[frow], %[x_stride1] \n\t" + "addu %[x_out], %[x_out], %[x_stride] \n\t" + "subu %[temp3], %[x_out], %[x_out_max] \n\t" + "bgez %[temp3], 2f \n\t" + "subu %[accum], %[accum], %[x_sub] \n\t" + "bgez %[accum], 4f \n\t" + "addiu %[temp2], %[temp1], 0 \n\t" + "addu %[src1], %[src1], %[x_stride] \n\t" + "lbu %[temp1], 0(%[src1]) \n\t" + "addu %[accum], %[accum], %[x_add] \n\t" + "4: \n\t" + "b 1b \n\t" + "2: \n\t" + : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp4] "=&r" (temp4), + [x_out] "+r" (x_out), [frac] "=&r" (frac), [frow] "+r" (frow) + : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub), + [x_stride1] "r" (x_stride1), [src_width] "r" (src_width), + [x_out_max] "r" (x_out_max) + : "memory", "hi", "lo" + ); +} + +static void ExportRowShrink(WebPRescaler* const wrk) { + assert(!WebPRescalerOutputDone(wrk)); + assert(wrk->y_accum <= 0); + assert(!wrk->y_expand); + // if wrk->fxy_scale can fit into 32 bits use optimized code, + // otherwise use C code + if ((wrk->fxy_scale >> 32) == 0) { + uint8_t* dst = 
wrk->dst; + rescaler_t* irow = wrk->irow; + const rescaler_t* frow = wrk->frow; + const int yscale = wrk->fy_scale * (-wrk->y_accum); + const int x_out_max = wrk->dst_width * wrk->num_channels; + + int temp0, temp1, temp3, temp4, temp5, temp6, temp7, loop_end; + const int temp2 = (int)(wrk->fxy_scale); + const int temp8 = x_out_max << 2; + + __asm__ volatile( + "addiu %[temp6], $zero, -256 \n\t" + "addiu %[temp7], $zero, 255 \n\t" + "li %[temp3], 0x10000 \n\t" + "li %[temp4], 0x8000 \n\t" + "addu %[loop_end], %[frow], %[temp8] \n\t" + "1: \n\t" + "lw %[temp0], 0(%[frow]) \n\t" + "mult %[temp3], %[temp4] \n\t" + "addiu %[frow], %[frow], 4 \n\t" + "sll %[temp0], %[temp0], 2 \n\t" + "madd %[temp0], %[yscale] \n\t" + "mfhi %[temp1] \n\t" + "lw %[temp0], 0(%[irow]) \n\t" + "addiu %[dst], %[dst], 1 \n\t" + "addiu %[irow], %[irow], 4 \n\t" + "subu %[temp0], %[temp0], %[temp1] \n\t" + "mult %[temp3], %[temp4] \n\t" + "sll %[temp0], %[temp0], 2 \n\t" + "madd %[temp0], %[temp2] \n\t" + "mfhi %[temp5] \n\t" + "sw %[temp1], -4(%[irow]) \n\t" + "and %[temp0], %[temp5], %[temp6] \n\t" + "slti %[temp1], %[temp5], 0 \n\t" + "beqz %[temp0], 2f \n\t" + "xor %[temp5], %[temp5], %[temp5] \n\t" + "movz %[temp5], %[temp7], %[temp1] \n\t" + "2: \n\t" + "sb %[temp5], -1(%[dst]) \n\t" + "bne %[frow], %[loop_end], 1b \n\t" + + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), + [temp7]"=&r"(temp7), [frow]"+r"(frow), [irow]"+r"(irow), + [dst]"+r"(dst), [loop_end]"=&r"(loop_end) + : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp8]"r"(temp8) : "memory", "hi", "lo" ); } else { - __asm__ volatile ( - "addiu %[temp3], %[src_width], -1 \n\t" - "lbu %[temp2], 0(%[src1]) \n\t" - "addu %[src1], %[src1], %[x_stride] \n\t" - "bgtz %[temp3], 0f \n\t" - "addiu %[temp1], %[temp2], 0 \n\t" - "b 3f \n\t" - "0: \n\t" - "lbu %[temp1], 0(%[src1]) \n\t" - "3: \n\t" - "addiu %[accum], %[x_add], 0 \n\t" - "1: \n\t" - "subu %[temp3], 
%[temp2], %[temp1] \n\t" - "mul %[temp3], %[temp3], %[accum] \n\t" - "mul %[temp4], %[temp1], %[x_add] \n\t" - "lw %[frac], 0(%[irow]) \n\t" - "addu %[temp3], %[temp4], %[temp3] \n\t" - "sw %[temp3], 0(%[frow]) \n\t" - "addu %[frow], %[frow], %[x_stride1] \n\t" - "addu %[frac], %[frac], %[temp3] \n\t" - "addu %[x_out], %[x_out], %[x_stride] \n\t" - "sw %[frac], 0(%[irow]) \n\t" - "subu %[temp3], %[x_out], %[x_out_max] \n\t" - "addu %[irow], %[irow], %[x_stride1] \n\t" - "bgez %[temp3], 2f \n\t" - "subu %[accum], %[accum], %[x_sub] \n\t" - "bgez %[accum], 4f \n\t" - "addiu %[temp2], %[temp1], 0 \n\t" - "addu %[src1], %[src1], %[x_stride] \n\t" - "lbu %[temp1], 0(%[src1]) \n\t" - "addu %[accum], %[accum], %[x_add] \n\t" - "4: \n\t" - "b 1b \n\t" - "2: \n\t" - : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp4] "=&r" (temp4), - [x_out] "+r" (x_out), [frac] "=&r" (frac), [frow] "+r" (frow), - [irow] "+r" (irow) - : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub), - [x_stride1] "r" (x_stride1), [src_width] "r" (src_width), - [x_out_max] "r" (x_out_max) - : "memory", "hi", "lo" - ); + WebPRescalerExportRowShrinkC(wrk); } } -static void ExportRow(WebPRescaler* const wrk, int x_out) { - if (wrk->y_accum <= 0) { - uint8_t* const dst = wrk->dst; - int32_t* const irow = wrk->irow; - const int32_t* const frow = wrk->frow; - const int yscale = wrk->fy_scale * (-wrk->y_accum); - const int x_out_max = wrk->dst_width * wrk->num_channels; - // if wrk->fxy_scale can fit into 32 bits use optimized code, - // otherwise use C code - if ((wrk->fxy_scale >> 32) == 0) { - int temp0, temp1, temp3, temp4, temp5, temp6, temp7, loop_end; - const int temp2 = (int)(wrk->fxy_scale); - const int temp8 = x_out_max << 2; - uint8_t* dst_t = (uint8_t*)dst; - int32_t* irow_t = (int32_t*)irow; - const int32_t* frow_t = (const int32_t*)frow; - - __asm__ volatile( - "addiu %[temp6], $zero, -256 \n\t" - "addiu 
%[temp7], $zero, 255 \n\t" - "li %[temp3], 0x10000 \n\t" - "li %[temp4], 0x8000 \n\t" - "addu %[loop_end], %[frow_t], %[temp8] \n\t" - "1: \n\t" - "lw %[temp0], 0(%[frow_t]) \n\t" - "mult %[temp3], %[temp4] \n\t" - "addiu %[frow_t], %[frow_t], 4 \n\t" - "sll %[temp0], %[temp0], 2 \n\t" - "madd %[temp0], %[yscale] \n\t" - "mfhi %[temp1] \n\t" - "lw %[temp0], 0(%[irow_t]) \n\t" - "addiu %[dst_t], %[dst_t], 1 \n\t" - "addiu %[irow_t], %[irow_t], 4 \n\t" - "subu %[temp0], %[temp0], %[temp1] \n\t" - "mult %[temp3], %[temp4] \n\t" - "sll %[temp0], %[temp0], 2 \n\t" - "madd %[temp0], %[temp2] \n\t" - "mfhi %[temp5] \n\t" - "sw %[temp1], -4(%[irow_t]) \n\t" - "and %[temp0], %[temp5], %[temp6] \n\t" - "slti %[temp1], %[temp5], 0 \n\t" - "beqz %[temp0], 2f \n\t" - "xor %[temp5], %[temp5], %[temp5] \n\t" - "movz %[temp5], %[temp7], %[temp1] \n\t" - "2: \n\t" - "sb %[temp5], -1(%[dst_t]) \n\t" - "bne %[frow_t], %[loop_end], 1b \n\t" - - : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3), - [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), - [temp7]"=&r"(temp7), [frow_t]"+r"(frow_t), [irow_t]"+r"(irow_t), - [dst_t]"+r"(dst_t), [loop_end]"=&r"(loop_end) - : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp8]"r"(temp8) - : "memory", "hi", "lo" - ); - wrk->y_accum += wrk->y_add; - wrk->dst += wrk->dst_stride; - } else { - WebPRescalerExportRowC(wrk, x_out); - } - } -} +// no ExportRowExpand yet. 
//------------------------------------------------------------------------------ // Entry point @@ -193,8 +199,9 @@ static void ExportRow(WebPRescaler* const wrk, int x_out) { extern void WebPRescalerDspInitMIPS32(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) { - WebPRescalerImportRow = ImportRow; - WebPRescalerExportRow = ExportRow; + WebPRescalerImportRowExpand = ImportRowExpand; + WebPRescalerImportRowShrink = ImportRowShrink; + WebPRescalerExportRowShrink = ExportRowShrink; } #else // !WEBP_USE_MIPS32 diff --git a/src/dsp/rescaler_mips_dsp_r2.c b/src/dsp/rescaler_mips_dsp_r2.c index ed2fd496..f433ee43 100644 --- a/src/dsp/rescaler_mips_dsp_r2.c +++ b/src/dsp/rescaler_mips_dsp_r2.c @@ -15,28 +15,27 @@ #if defined(WEBP_USE_MIPS_DSP_R2) +#include #include "../utils/rescaler.h" -static void ImportRow(WebPRescaler* const wrk, - const uint8_t* const src, int channel) { +static void ImportRowShrink(WebPRescaler* const wrk, + const uint8_t* const src, int channel) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; const int fx_scale = wrk->fx_scale; const int x_add = wrk->x_add; const int x_sub = wrk->x_sub; - const int src_width = wrk->src_width; int* frow = wrk->frow + channel; - int* irow = wrk->irow + channel; const uint8_t* src1 = src + channel; - int temp1, temp2, temp3, temp4; + int temp3; int base, frac, sum; int accum, accum1; const int x_stride1 = x_stride << 2; int loop_c = x_out_max - channel; - int x_out = channel; - if (!wrk->x_expand) { - __asm__ volatile ( + assert(!wrk->x_expand); + assert(!WebPRescalerInputDone(wrk)); + __asm__ volatile ( "li %[sum], 0 \n\t" "li %[accum], 0 \n\t" "1: \n\t" @@ -53,88 +52,99 @@ static void ImportRow(WebPRescaler* const wrk, "negu %[accum1], %[accum] \n\t" "mul %[frac], %[base], %[accum1] \n\t" "mul %[temp3], %[sum], %[x_sub] \n\t" - "lw %[base], 0(%[irow]) \n\t" "sll %[accum1], %[frac], 1 \n\t" "subu %[loop_c], %[loop_c], %[x_stride] \n\t" 
"mulq_rs.w %[sum], %[accum1], %[fx_scale] \n\t" "subu %[temp3], %[temp3], %[frac] \n\t" "sw %[temp3], 0(%[frow]) \n\t" - "addu %[base], %[base], %[temp3] \n\t" - "sw %[base], 0(%[irow]) \n\t" - "addu %[irow], %[irow], %[x_stride1] \n\t" "addu %[frow], %[frow], %[x_stride1] \n\t" "bgtz %[loop_c], 1b \n\t" - : [accum]"=&r"(accum), [src1]"+&r"(src1), [temp3]"=&r"(temp3), - [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac), - [frow]"+&r"(frow), [irow]"+&r"(irow), [accum1]"=&r"(accum1), - [loop_c]"+&r"(loop_c) - : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale), [x_sub]"r"(x_sub), - [x_add] "r" (x_add), [x_stride1] "r" (x_stride1) - : "memory", "hi", "lo" - ); - } else { - __asm__ volatile ( - "addiu %[temp3], %[src_width], -1 \n\t" - "lbu %[temp2], 0(%[src1]) \n\t" - "addu %[src1], %[src1], %[x_stride] \n\t" - "bgtz %[temp3], 0f \n\t" - "addiu %[temp1], %[temp2], 0 \n\t" - "b 3f \n\t" - "0: \n\t" - "lbu %[temp1], 0(%[src1]) \n\t" - "3: \n\t" - "addiu %[accum], %[x_add], 0 \n\t" - "1: \n\t" - "subu %[temp3], %[temp2], %[temp1] \n\t" - "mul %[temp3], %[temp3], %[accum] \n\t" - "mul %[temp4], %[temp1], %[x_add] \n\t" - "lw %[frac], 0(%[irow]) \n\t" - "addu %[temp3], %[temp4], %[temp3] \n\t" - "sw %[temp3], 0(%[frow]) \n\t" - "addu %[frow], %[frow], %[x_stride1] \n\t" - "addu %[frac], %[frac], %[temp3] \n\t" - "addu %[x_out], %[x_out], %[x_stride] \n\t" - "sw %[frac], 0(%[irow]) \n\t" - "subu %[temp3], %[x_out], %[x_out_max] \n\t" - "addu %[irow], %[irow], %[x_stride1] \n\t" - "bgez %[temp3], 2f \n\t" - "subu %[accum], %[accum], %[x_sub] \n\t" - "bgez %[accum], 4f \n\t" - "addiu %[temp2], %[temp1], 0 \n\t" - "addu %[src1], %[src1], %[x_stride] \n\t" - "lbu %[temp1], 0(%[src1]) \n\t" - "addu %[accum], %[accum], %[x_add] \n\t" - "4: \n\t" - "b 1b \n\t" - "2: \n\t" - : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp4] "=&r" (temp4), - [x_out] "+r" (x_out), [frac] "=&r" (frac), [frow] "+r" (frow), - 
[irow] "+r" (irow) - : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub), - [x_stride1] "r" (x_stride1), [src_width] "r" (src_width), - [x_out_max] "r" (x_out_max) - : "memory", "hi", "lo" - ); - } + : [accum]"=&r"(accum), [src1]"+&r"(src1), [temp3]"=&r"(temp3), + [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac), + [frow]"+&r"(frow), [accum1]"=&r"(accum1), + [loop_c]"+&r"(loop_c) + : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale), [x_sub]"r"(x_sub), + [x_add] "r" (x_add), [x_stride1] "r" (x_stride1) + : "memory", "hi", "lo" + ); } -static void ExportRow(WebPRescaler* const wrk, int x_out) { - if (wrk->y_accum <= 0) { - // if wrk->fxy_scale can fit into 32 bits use optimized code, - // otherwise use C code - if ((wrk->fxy_scale >> 32) == 0) { - uint8_t* dst = wrk->dst; - int32_t* irow = wrk->irow; - const int32_t* frow = wrk->frow; - const int yscale = wrk->fy_scale * (-wrk->y_accum); - const int x_out_max = wrk->dst_width * wrk->num_channels; - int temp0, temp1, temp3, temp4, temp5, temp6, temp7; - const int temp2 = (int)wrk->fxy_scale; - const int rest = (x_out_max - x_out) & 1; - const int32_t* const loop_end = frow + (x_out_max - x_out) - rest; +static void ImportRowExpand(WebPRescaler* const wrk, + const uint8_t* const src, int channel) { + const int x_stride = wrk->num_channels; + const int x_out_max = wrk->dst_width * wrk->num_channels; + const int x_add = wrk->x_add; + const int x_sub = wrk->x_sub; + const int src_width = wrk->src_width; + int* frow = wrk->frow + channel; + const uint8_t* src1 = src + channel; + int temp1, temp2, temp3, temp4; + int frac; + int accum; + const int x_stride1 = x_stride << 2; + int x_out = channel; - __asm__ volatile ( + assert(wrk->x_expand); + assert(!WebPRescalerInputDone(wrk)); + __asm__ volatile ( + "addiu %[temp3], %[src_width], -1 \n\t" + "lbu %[temp2], 0(%[src1]) \n\t" + "addu %[src1], %[src1], %[x_stride] \n\t" + "bgtz %[temp3], 0f \n\t" + "addiu %[temp1], %[temp2], 0 \n\t" + "b 3f \n\t" + "0: 
\n\t" + "lbu %[temp1], 0(%[src1]) \n\t" + "3: \n\t" + "addiu %[accum], %[x_add], 0 \n\t" + "1: \n\t" + "subu %[temp3], %[temp2], %[temp1] \n\t" + "mul %[temp3], %[temp3], %[accum] \n\t" + "mul %[temp4], %[temp1], %[x_add] \n\t" + "addu %[temp3], %[temp4], %[temp3] \n\t" + "sw %[temp3], 0(%[frow]) \n\t" + "addu %[frow], %[frow], %[x_stride1] \n\t" + "addu %[x_out], %[x_out], %[x_stride] \n\t" + "subu %[temp3], %[x_out], %[x_out_max] \n\t" + "bgez %[temp3], 2f \n\t" + "subu %[accum], %[accum], %[x_sub] \n\t" + "bgez %[accum], 4f \n\t" + "addiu %[temp2], %[temp1], 0 \n\t" + "addu %[src1], %[src1], %[x_stride] \n\t" + "lbu %[temp1], 0(%[src1]) \n\t" + "addu %[accum], %[accum], %[x_add] \n\t" + "4: \n\t" + "b 1b \n\t" + "2: \n\t" + : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp4] "=&r" (temp4), + [x_out] "+r" (x_out), [frac] "=&r" (frac), [frow] "+r" (frow) + : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub), + [x_stride1] "r" (x_stride1), [src_width] "r" (src_width), + [x_out_max] "r" (x_out_max) + : "memory", "hi", "lo" + ); +} + +static void ExportRowShrink(WebPRescaler* const wrk) { + assert(!WebPRescalerOutputDone(wrk)); + assert(wrk->y_accum <= 0); + assert(!wrk->y_expand); + // if wrk->fxy_scale can fit into 32 bits use optimized code, + // otherwise use C code + if ((wrk->fxy_scale >> 32) == 0) { + uint8_t* dst = wrk->dst; + rescaler_t* irow = wrk->irow; + const rescaler_t* frow = wrk->frow; + const int yscale = wrk->fy_scale * (-wrk->y_accum); + const int x_out_max = wrk->dst_width * wrk->num_channels; + + int temp0, temp1, temp3, temp4, temp5, temp6, temp7; + const int temp2 = (int)wrk->fxy_scale; + const int rest = x_out_max & 1; + const rescaler_t* const loop_end = frow + x_out_max - rest; + + __asm__ volatile ( ".set push \n\t" ".set noreorder \n\t" "beq %[frow], %[loop_end], 1f \n\t" @@ -188,30 +198,30 @@ static void ExportRow(WebPRescaler* const wrk, int x_out) 
{ "sb %[temp5], 0(%[dst]) \n\t" "3: \n\t" ".set pop \n\t" - : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3), - [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), - [temp7]"=&r"(temp7), [frow]"+&r"(frow), [irow]"+&r"(irow), - [dst]"+&r"(dst) - : [temp2]"r"(temp2), [yscale]"r"(yscale), [loop_end]"r"(loop_end), - [rest]"r"(rest) - : "memory", "hi", "lo" - ); - wrk->y_accum += wrk->y_add; - wrk->dst += wrk->dst_stride; - } else { - WebPRescalerExportRowC(wrk, x_out); - } + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3), + [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), + [temp7]"=&r"(temp7), [frow]"+&r"(frow), [irow]"+&r"(irow), + [dst]"+&r"(dst) + : [temp2]"r"(temp2), [yscale]"r"(yscale), [loop_end]"r"(loop_end), + [rest]"r"(rest) + : "memory", "hi", "lo" + ); + } else { + WebPRescalerExportRowShrinkC(wrk); } } +// no ExportRowExpand yet. + //------------------------------------------------------------------------------ // Entry point extern void WebPRescalerDspInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) { - WebPRescalerImportRow = ImportRow; - WebPRescalerExportRow = ExportRow; + WebPRescalerImportRowExpand = ImportRowExpand; + WebPRescalerImportRowShrink = ImportRowShrink; + WebPRescalerExportRowShrink = ExportRowShrink; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/enc/picture_rescale.c b/src/enc/picture_rescale.c index 794808bf..023e5998 100644 --- a/src/enc/picture_rescale.c +++ b/src/enc/picture_rescale.c @@ -175,7 +175,7 @@ static void RescalePlane(const uint8_t* src, int src_width, int src_height, int src_stride, uint8_t* dst, int dst_width, int dst_height, int dst_stride, - int32_t* const work, + rescaler_t* const work, int num_channels) { WebPRescaler rescaler; int y = 0; @@ -205,7 +205,7 @@ static void AlphaMultiplyY(WebPPicture* const pic, int inverse) { int WebPPictureRescale(WebPPicture* pic, int width, int height) { WebPPicture tmp; int prev_width, 
prev_height; - int32_t* work; + rescaler_t* work; if (pic == NULL) return 0; prev_width = pic->width; @@ -221,7 +221,7 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) { if (!WebPPictureAlloc(&tmp)) return 0; if (!pic->use_argb) { - work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work)); + work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work)); if (work == NULL) { WebPPictureFree(&tmp); return 0; @@ -249,7 +249,7 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) { tmp.v, HALVE(width), HALVE(height), tmp.uv_stride, work, 1); } else { - work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work)); + work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work)); if (work == NULL) { WebPPictureFree(&tmp); return 0; diff --git a/src/utils/rescaler.c b/src/utils/rescaler.c index b61f74b1..33da4245 100644 --- a/src/utils/rescaler.c +++ b/src/utils/rescaler.c @@ -22,7 +22,7 @@ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height, uint8_t* const dst, int dst_width, int dst_height, int dst_stride, - int num_channels, int32_t* const work) { + int num_channels, rescaler_t* const work) { const int x_add = src_width, x_sub = dst_width; const int y_add = src_height, y_sub = dst_height; wrk->x_expand = (src_width < dst_width); @@ -31,6 +31,8 @@ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height, wrk->src_height = src_height; wrk->dst_width = dst_width; wrk->dst_height = dst_height; + wrk->src_y = 0; + wrk->dst_y = 0; wrk->dst = dst; wrk->dst_stride = dst_stride; wrk->num_channels = num_channels; @@ -39,18 +41,20 @@ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height, wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add; wrk->x_sub = wrk->x_expand ? 
(x_add - 1) : x_sub; if (!wrk->x_expand) { // fx_scale is not used otherwise - wrk->fx_scale = (1 << WEBP_RESCALER_RFIX) / wrk->x_sub; + wrk->fx_scale = WEBP_RESCALER_ONE / wrk->x_sub; } - // vertical scaling parameters - wrk->y_accum = y_add; - wrk->y_add = y_add; - wrk->y_sub = y_sub; - wrk->fy_scale = (1 << WEBP_RESCALER_RFIX) / wrk->y_sub; - - wrk->fxy_scale = - ((int64_t)dst_height << WEBP_RESCALER_RFIX) / (wrk->x_add * wrk->y_add); - + wrk->y_add = wrk->y_expand ? y_add - 1 : y_add; + wrk->y_sub = wrk->y_expand ? y_sub - 1: y_sub; + wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add; + if (!wrk->y_expand) { + wrk->fy_scale = WEBP_RESCALER_ONE / wrk->y_sub; + wrk->fxy_scale = ((uint64_t)dst_height << WEBP_RESCALER_RFIX) + / (wrk->x_add * wrk->y_add); + } else { + wrk->fy_scale = WEBP_RESCALER_ONE / wrk->x_add; + wrk->fxy_scale = WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_sub); + } wrk->irow = work; wrk->frow = work + num_channels * dst_width; memset(work, 0, 2 * dst_width * num_channels * sizeof(*work)); @@ -98,10 +102,21 @@ int WebPRescalerImport(WebPRescaler* const wrk, int num_lines, const uint8_t* src, int src_stride) { int total_imported = 0; while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) { - int channel; + int x, channel; + if (wrk->y_expand) { + rescaler_t* const tmp = wrk->irow; + wrk->irow = wrk->frow; + wrk->frow = tmp; + } for (channel = 0; channel < wrk->num_channels; ++channel) { WebPRescalerImportRow(wrk, src, channel); } + if (!wrk->y_expand) { // Accumulate the contribution of the new row. 
+ for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) { + wrk->irow[x] += wrk->frow[x]; + } + } + ++wrk->src_y; src += src_stride; ++total_imported; wrk->y_accum -= wrk->y_sub; @@ -112,7 +127,7 @@ int WebPRescalerImport(WebPRescaler* const wrk, int num_lines, int WebPRescalerExport(WebPRescaler* const rescaler) { int total_exported = 0; while (WebPRescalerHasPendingOutput(rescaler)) { - WebPRescalerExportRow(rescaler, 0); + WebPRescalerExportRow(rescaler); ++total_exported; } return total_exported; diff --git a/src/utils/rescaler.h b/src/utils/rescaler.h index 82e3dac9..7787f9c7 100644 --- a/src/utils/rescaler.h +++ b/src/utils/rescaler.h @@ -21,23 +21,27 @@ extern "C" { #include "../webp/types.h" #define WEBP_RESCALER_RFIX 30 // fixed-point precision for multiplies +#define WEBP_RESCALER_ONE (1u << WEBP_RESCALER_RFIX) // Structure used for on-the-fly rescaling +typedef int32_t rescaler_t; // type for side-buffer typedef struct WebPRescaler WebPRescaler; struct WebPRescaler { int x_expand; // true if we're expanding in the x direction int y_expand; // true if we're expanding in the y direction int num_channels; // bytes to jump between pixels - int fy_scale, fx_scale; // fixed-point scaling factor - int64_t fxy_scale; // '' + uint32_t fx_scale; // fixed-point scaling factors + uint32_t fy_scale; // '' + uint64_t fxy_scale; // '' int y_accum; // vertical accumulator int y_add, y_sub; // vertical increments int x_add, x_sub; // horizontal increments int src_width, src_height; // source dimensions int dst_width, dst_height; // destination dimensions + int src_y, dst_y; // row counters for input and output uint8_t* dst; int dst_stride; - int32_t* irow, *frow; // work buffer + rescaler_t* irow, *frow; // work buffer }; // Initialize a rescaler given scratch area 'work' and dimensions of src & dst. 
@@ -46,7 +50,7 @@ void WebPRescalerInit(WebPRescaler* const rescaler, uint8_t* const dst, int dst_width, int dst_height, int dst_stride, int num_channels, - int32_t* const work); + rescaler_t* const work); // If either 'scaled_width' or 'scaled_height' (but not both) is 0 the value // will be calculated preserving the aspect ratio, otherwise the values are @@ -66,15 +70,26 @@ int WebPRescaleNeededLines(const WebPRescaler* const rescaler, int WebPRescalerImport(WebPRescaler* const rescaler, int num_rows, const uint8_t* src, int src_stride); -// Return true if there is pending output rows ready. -static WEBP_INLINE -int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) { - return (rescaler->y_accum <= 0); -} - // Export as many rows as possible. Return the numbers of rows written. int WebPRescalerExport(WebPRescaler* const rescaler); +// Return true if input is finished +static WEBP_INLINE +int WebPRescalerInputDone(const WebPRescaler* const rescaler) { + return (rescaler->src_y >= rescaler->src_height); +} +// Return true if output is finished +static WEBP_INLINE +int WebPRescalerOutputDone(const WebPRescaler* const rescaler) { + return (rescaler->dst_y >= rescaler->dst_height); +} + +// Return true if there are pending output rows ready. +static WEBP_INLINE +int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) { + return !WebPRescalerOutputDone(rescaler) && (rescaler->y_accum <= 0); +} + //------------------------------------------------------------------------------ #ifdef __cplusplus