diff --git a/src/dec/vp8l_dec.c b/src/dec/vp8l_dec.c index 2e4f888d..57423138 100644 --- a/src/dec/vp8l_dec.c +++ b/src/dec/vp8l_dec.c @@ -24,6 +24,7 @@ #include "src/dsp/dsp.h" #include "src/dsp/lossless.h" #include "src/dsp/lossless_common.h" +#include "src/dsp/yuv.h" #include "src/utils/bit_reader_utils.h" #include "src/utils/color_cache_utils.h" #include "src/utils/huffman_utils.h" @@ -703,13 +704,71 @@ static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec, uint8_t* in, return y_pos; } -static int EmitRowsYUVA(const VP8LDecoder* const dec, const uint8_t* in, - int in_stride, int mb_w, int num_rows) { +// Returns true if alpha[] has non-0xff values. +static int CheckNonOpaque(const uint8_t* alpha, int width, int height, + int y_step) { + WebPInitAlphaProcessing(); + for (; height-- > 0; alpha += y_step) { + if (WebPHasAlpha8b(alpha, width)) return 1; + } + return 0; +} + +static int EmitRowsYUVA(const uint8_t* const in, const VP8Io* const io, + int in_stride, uint16_t* tmp_rgb, + VP8LDecoder* const dec) { int y_pos = dec->last_out_row; - while (num_rows-- > 0) { - ConvertToYUVA((const uint32_t*)in, mb_w, y_pos, dec->output); - in += in_stride; - ++y_pos; + const int width = io->mb_w; + int num_rows = io->mb_h; + const int y_pos_final = y_pos + num_rows; + const int y_stride = dec->output->u.YUVA.y_stride; + const int uv_stride = dec->output->u.YUVA.u_stride; + const int a_stride = dec->output->u.YUVA.a_stride; + uint8_t* dst_a = dec->output->u.YUVA.a; + uint8_t* dst_y = dec->output->u.YUVA.y + y_pos * y_stride; + uint8_t* dst_u = dec->output->u.YUVA.u + (y_pos >> 1) * uv_stride; + uint8_t* dst_v = dec->output->u.YUVA.v + (y_pos >> 1) * uv_stride; + const uint8_t* r_ptr = in + CHANNEL_OFFSET(1); + const uint8_t* g_ptr = in + CHANNEL_OFFSET(2); + const uint8_t* b_ptr = in + CHANNEL_OFFSET(3); + const uint8_t* a_ptr = NULL; + int has_alpha = 0; + + // Make sure the lines are processed two by two from the start. 
+ assert(y_pos % 2 == 0); + + // Make sure num_rows is even. y_pos_final will check if it is not. + num_rows &= ~1; + + if (dst_a) { + dst_a += y_pos * a_stride; + a_ptr = in + CHANNEL_OFFSET(0); + has_alpha = CheckNonOpaque(a_ptr, width, num_rows, in_stride); + } + // Process pairs of lines. + WebPImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /*step=*/4, in_stride, + has_alpha, width, num_rows, tmp_rgb, y_stride, + uv_stride, a_stride, dst_y, dst_u, dst_v, dst_a); + + y_pos += num_rows; + if (y_pos_final == io->crop_bottom - io->crop_top && y_pos < y_pos_final) { + assert(y_pos + 1 == y_pos_final); + // If we output the last line of an image with odd height. + dst_y += num_rows * y_stride; + dst_u += (num_rows >> 1) * uv_stride; + dst_v += (num_rows >> 1) * uv_stride; + r_ptr += num_rows * in_stride; + g_ptr += num_rows * in_stride; + b_ptr += num_rows * in_stride; + if (dst_a) { + dst_a += num_rows * a_stride; + a_ptr += num_rows * in_stride; + has_alpha = CheckNonOpaque(a_ptr, width, /*height=*/1, in_stride); + } + WebPImportYUVAFromRGBALastLine(r_ptr, g_ptr, b_ptr, a_ptr, /*step=*/4, + has_alpha, width, tmp_rgb, dst_y, dst_u, + dst_v, dst_a); + y_pos = y_pos_final; } return y_pos; } @@ -789,8 +848,17 @@ static void ApplyInverseTransforms(VP8LDecoder* const dec, int start_row, // last call. static void ProcessRows(VP8LDecoder* const dec, int row) { const uint32_t* const rows = dec->pixels + dec->width * dec->last_row; - const int num_rows = row - dec->last_row; + int num_rows; + // In case of YUV conversion and if we do not need to get to the last row. + if (!WebPIsRGBMode(dec->output->colorspace) && row >= dec->io->crop_top && + row < dec->io->crop_bottom) { + // Make sure the number of rows to process is even. 
+ if ((row - dec->io->crop_top) % 2 == 1) { + --row; + } + } + num_rows = row - dec->last_row; assert(row <= dec->io->crop_bottom); // We can't process more than NUM_ARGB_CACHE_ROWS at a time (that's the size // of argb_cache), but we currently don't need more than that. @@ -822,7 +890,8 @@ static void ProcessRows(VP8LDecoder* const dec, int row) { dec->last_out_row = io->use_scaling ? EmitRescaledRowsYUVA(dec, rows_data, in_stride, io->mb_h) - : EmitRowsYUVA(dec, rows_data, in_stride, io->mb_w, io->mb_h); + : EmitRowsYUVA(rows_data, io, in_stride, + dec->accumulated_rgb_pixels, dec); } assert(dec->last_out_row <= output->height); } @@ -1526,9 +1595,16 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) { const uint64_t cache_top_pixels = (uint16_t)final_width; // Scratch buffer for temporary BGRA storage. Not needed for paletted alpha. const uint64_t cache_pixels = (uint64_t)final_width * NUM_ARGB_CACHE_ROWS; - const uint64_t total_num_pixels = - num_pixels + cache_top_pixels + cache_pixels; - + // Scratch buffer to accumulate RGBA values (hence 4*) for YUV conversion. + uint64_t accumulated_rgb_pixels = 0; + uint64_t total_num_pixels; + if (dec->output != NULL && !WebPIsRGBMode(dec->output->colorspace)) { + const int uv_width = (dec->io->crop_right - dec->io->crop_left + 1) >> 1; + accumulated_rgb_pixels = + 4 * uv_width * sizeof(*dec->accumulated_rgb_pixels) / sizeof(uint32_t); + } + total_num_pixels = + num_pixels + cache_top_pixels + cache_pixels + accumulated_rgb_pixels; assert(dec->width <= final_width); dec->pixels = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t)); if (dec->pixels == NULL) { @@ -1536,6 +1612,12 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) { return VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY); } dec->argb_cache = dec->pixels + num_pixels + cache_top_pixels; + dec->accumulated_rgb_pixels = + accumulated_rgb_pixels == 0 + ? 
NULL + : (uint16_t*)(dec->pixels + num_pixels + cache_top_pixels + + cache_pixels); + return 1; } diff --git a/src/dec/vp8li_dec.h b/src/dec/vp8li_dec.h index 07002941..bffad693 100644 --- a/src/dec/vp8li_dec.h +++ b/src/dec/vp8li_dec.h @@ -67,6 +67,8 @@ struct VP8LDecoder { uint32_t* pixels; // Internal data: either uint8_t* for alpha // or uint32_t* for BGRA. uint32_t* argb_cache; // Scratch buffer for temporary BGRA storage. + uint16_t* accumulated_rgb_pixels; // Scratch buffer for accumulated RGB for + // YUV conversion. VP8LBitReader br; int incremental; // if true, incremental decoding is expected diff --git a/src/dsp/yuv.c b/src/dsp/yuv.c index ef32981a..fccc394c 100644 --- a/src/dsp/yuv.c +++ b/src/dsp/yuv.c @@ -21,6 +21,16 @@ #include "src/webp/decode.h" #include "src/webp/types.h" +// Comment out to disable gamma-compression during RGB->U/V averaging +#define USE_GAMMA_COMPRESSION + +// If defined, use table to compute x / alpha. +#define USE_INVERSE_ALPHA_TABLE + +#ifdef USE_GAMMA_COMPRESSION +#include <math.h> +#endif + //----------------------------------------------------------------------------- // Plain-C version @@ -204,6 +214,388 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb, } } +//------------------------------------------------------------------------------ +// Code for gamma correction + +#if defined(USE_GAMMA_COMPRESSION) + +// Gamma correction compensates loss of resolution during chroma subsampling. 
+#define GAMMA_FIX 12 // fixed-point precision for linear values +#define GAMMA_TAB_FIX 7 // fixed-point fractional bits precision +#define GAMMA_TAB_SIZE (1 << (GAMMA_FIX - GAMMA_TAB_FIX)) +static const double kGamma = 0.80; +static const int kGammaScale = ((1 << GAMMA_FIX) - 1); +static const int kGammaTabScale = (1 << GAMMA_TAB_FIX); +static const int kGammaTabRounder = (1 << GAMMA_TAB_FIX >> 1); + +static int kLinearToGammaTab[GAMMA_TAB_SIZE + 1]; +static uint16_t kGammaToLinearTab[256]; +static volatile int kGammaTablesOk = 0; +extern VP8CPUInfo VP8GetCPUInfo; + +WEBP_DSP_INIT_FUNC(WebPInitGammaTables) { + if (!kGammaTablesOk) { + int v; + const double scale = (double)(1 << GAMMA_TAB_FIX) / kGammaScale; + const double norm = 1. / 255.; + for (v = 0; v <= 255; ++v) { + kGammaToLinearTab[v] = + (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5); + } + for (v = 0; v <= GAMMA_TAB_SIZE; ++v) { + kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5); + } + kGammaTablesOk = 1; + } +} + +static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { + return kGammaToLinearTab[v]; +} + +static WEBP_INLINE int Interpolate(int v) { + const int tab_pos = v >> (GAMMA_TAB_FIX + 2); // integer part + const int x = v & ((kGammaTabScale << 2) - 1); // fractional part + const int v0 = kLinearToGammaTab[tab_pos]; + const int v1 = kLinearToGammaTab[tab_pos + 1]; + const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x); // interpolate + assert(tab_pos + 1 < GAMMA_TAB_SIZE + 1); + return y; +} + +// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision +// U/V value, suitable for RGBToU/V calls. 
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) { + const int y = Interpolate(base_value << shift); // final uplifted value + return (y + kGammaTabRounder) >> GAMMA_TAB_FIX; // descale +} + +#else + +void WebPInitGammaTables(void) {} +static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; } +static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) { + return (int)(base_value << shift); +} + +#endif // USE_GAMMA_COMPRESSION + +#define SUM4(ptr, step) \ + LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[(step)]) + \ + GammaToLinear((ptr)[rgb_stride]) + \ + GammaToLinear((ptr)[rgb_stride + (step)]), \ + 0) + +#define SUM2(ptr) \ + LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1) + +//------------------------------------------------------------------------------ +// "Fast" regular RGB->YUV + +#define SUM4(ptr, step) \ + LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[(step)]) + \ + GammaToLinear((ptr)[rgb_stride]) + \ + GammaToLinear((ptr)[rgb_stride + (step)]), \ + 0) + +#define SUM2(ptr) \ + LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1) + +#define SUM2ALPHA(ptr) ((ptr)[0] + (ptr)[rgb_stride]) +#define SUM4ALPHA(ptr) (SUM2ALPHA(ptr) + SUM2ALPHA((ptr) + 4)) + +#if defined(USE_INVERSE_ALPHA_TABLE) + +static const int kAlphaFix = 19; +// Following table is (1 << kAlphaFix) / a. The (v * kInvAlpha[a]) >> kAlphaFix +// formula is then equal to v / a in most (99.6%) cases. Note that this table +// and constant are adjusted very tightly to fit 32b arithmetic. 
+// In particular, they use the fact that the operands for 'v / a' are actually +// derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3 +// with ai in [0..255] and pi in [0..1<> (kAlphaFix - 2)) + +#else + +#define DIVIDE_BY_ALPHA(sum, a) (4 * (sum) / (a)) + +#endif // USE_INVERSE_ALPHA_TABLE + +static WEBP_INLINE int LinearToGammaWeighted(const uint8_t* src, + const uint8_t* a_ptr, + uint32_t total_a, int step, + int rgb_stride) { + const uint32_t sum = + a_ptr[0] * GammaToLinear(src[0]) + + a_ptr[step] * GammaToLinear(src[step]) + + a_ptr[rgb_stride] * GammaToLinear(src[rgb_stride]) + + a_ptr[rgb_stride + step] * GammaToLinear(src[rgb_stride + step]); + assert(total_a > 0 && total_a <= 4 * 0xff); +#if defined(USE_INVERSE_ALPHA_TABLE) + assert((uint64_t)sum * kInvAlpha[total_a] < ((uint64_t)1 << 32)); +#endif + return LinearToGamma(DIVIDE_BY_ALPHA(sum, total_a), 0); +} + +void WebPAccumulateRGBA(const uint8_t* const r_ptr, const uint8_t* const g_ptr, + const uint8_t* const b_ptr, const uint8_t* const a_ptr, + int rgb_stride, uint16_t* dst, int width) { + int i, j; + // we loop over 2x2 blocks and produce one R/G/B/A value for each. 
+ for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * 4, dst += 4) { + const uint32_t a = SUM4ALPHA(a_ptr + j); + int r, g, b; + if (a == 4 * 0xff || a == 0) { + r = SUM4(r_ptr + j, 4); + g = SUM4(g_ptr + j, 4); + b = SUM4(b_ptr + j, 4); + } else { + r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 4, rgb_stride); + g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride); + b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride); + } + dst[0] = r; + dst[1] = g; + dst[2] = b; + dst[3] = a; + } + if (width & 1) { + const uint32_t a = 2u * SUM2ALPHA(a_ptr + j); + int r, g, b; + if (a == 4 * 0xff || a == 0) { + r = SUM2(r_ptr + j); + g = SUM2(g_ptr + j); + b = SUM2(b_ptr + j); + } else { + r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 0, rgb_stride); + g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride); + b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride); + } + dst[0] = r; + dst[1] = g; + dst[2] = b; + dst[3] = a; + } +} + +void WebPAccumulateRGB(const uint8_t* const r_ptr, const uint8_t* const g_ptr, + const uint8_t* const b_ptr, int step, int rgb_stride, + uint16_t* dst, int width) { + int i, j; + for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * step, dst += 4) { + dst[0] = SUM4(r_ptr + j, step); + dst[1] = SUM4(g_ptr + j, step); + dst[2] = SUM4(b_ptr + j, step); + // MemorySanitizer may raise false positives with data that passes through + // RGBA32PackedToPlanar_16b_SSE41() due to incorrect modeling of shuffles. + // See https://crbug.com/webp/573. 
+#ifdef WEBP_MSAN + dst[3] = 0; +#endif + } + if (width & 1) { + dst[0] = SUM2(r_ptr + j); + dst[1] = SUM2(g_ptr + j); + dst[2] = SUM2(b_ptr + j); +#ifdef WEBP_MSAN + dst[3] = 0; +#endif + } +} + +static void ImportYUVAFromRGBA_C(const uint8_t* r_ptr, const uint8_t* g_ptr, + const uint8_t* b_ptr, const uint8_t* a_ptr, + int step, // bytes per pixel + int rgb_stride, // bytes per scanline + int has_alpha, int width, int height, + uint16_t* tmp_rgb, int y_stride, int uv_stride, + int a_stride, uint8_t* dst_y, uint8_t* dst_u, + uint8_t* dst_v, uint8_t* dst_a) { + int y; + const int is_rgb = (r_ptr < b_ptr); // otherwise it's bgr + const int uv_width = (width + 1) >> 1; + + has_alpha &= dst_a != NULL; + if (has_alpha) { +#if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE) + assert(kAlphaFix + GAMMA_FIX <= 31); +#endif + } + + WebPInitGammaTables(); + + // Downsample Y/U/V planes, two rows at a time + for (y = 0; y < (height >> 1); ++y) { + int rows_have_alpha = has_alpha; + if (is_rgb) { + WebPConvertRGBToY(r_ptr, dst_y, width, step); + WebPConvertRGBToY(r_ptr + rgb_stride, dst_y + y_stride, width, step); + } else { + WebPConvertBGRToY(b_ptr, dst_y, width, step); + WebPConvertBGRToY(b_ptr + rgb_stride, dst_y + y_stride, width, step); + } + dst_y += 2 * y_stride; + if (has_alpha) { + rows_have_alpha &= + !WebPExtractAlpha(a_ptr, rgb_stride, width, 2, dst_a, a_stride); + dst_a += 2 * a_stride; + } + // Collect averaged R/G/B(/A) + if (!rows_have_alpha) { + WebPAccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, width); + } else { + WebPAccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb, + width); + } + // Convert to U/V + WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width); + dst_u += uv_stride; + dst_v += uv_stride; + r_ptr += 2 * rgb_stride; + b_ptr += 2 * rgb_stride; + g_ptr += 2 * rgb_stride; + if (has_alpha) a_ptr += 2 * rgb_stride; + } +} + +static void ImportYUVAFromRGBALastLine_C( + const uint8_t* r_ptr, const uint8_t* 
g_ptr, const uint8_t* b_ptr, + const uint8_t* a_ptr, + int step, // bytes per pixel + int has_alpha, int width, uint16_t* tmp_rgb, uint8_t* dst_y, uint8_t* dst_u, + uint8_t* dst_v, uint8_t* dst_a) { + const int is_rgb = (r_ptr < b_ptr); // otherwise it's bgr + const int uv_width = (width + 1) >> 1; + int row_has_alpha = has_alpha && dst_a != NULL; + + if (is_rgb) { + WebPConvertRGBToY(r_ptr, dst_y, width, step); + } else { + WebPConvertBGRToY(b_ptr, dst_y, width, step); + } + if (row_has_alpha) { + row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0); + } + // Collect averaged R/G/B(/A) + if (!row_has_alpha) { + // Collect averaged R/G/B + WebPAccumulateRGB(r_ptr, g_ptr, b_ptr, step, /*rgb_stride=*/0, tmp_rgb, + width); + } else { + WebPAccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /*rgb_stride=*/0, tmp_rgb, + width); + } + WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width); +} + //----------------------------------------------------------------------------- void (*WebPConvertRGBToY)(const uint8_t* WEBP_RESTRICT rgb, @@ -214,6 +606,21 @@ void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb, uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, int width); +void (*WebPImportYUVAFromRGBA)(const uint8_t* r_ptr, const uint8_t* g_ptr, + const uint8_t* b_ptr, const uint8_t* a_ptr, + int step, // bytes per pixel + int rgb_stride, // bytes per scanline + int has_alpha, int width, int height, + uint16_t* tmp_rgb, int y_stride, int uv_stride, + int a_stride, uint8_t* dst_y, uint8_t* dst_u, + uint8_t* dst_v, uint8_t* dst_a); +void (*WebPImportYUVAFromRGBALastLine)( + const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr, + const uint8_t* a_ptr, + int step, // bytes per pixel + int has_alpha, int width, uint16_t* tmp_rgb, uint8_t* dst_y, uint8_t* dst_u, + uint8_t* dst_v, uint8_t* dst_a); + void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb, uint8_t* WEBP_RESTRICT y, int width); void (*WebPConvertARGBToUV)(const uint32_t* 
WEBP_RESTRICT argb, @@ -233,6 +640,9 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) { WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C; + WebPImportYUVAFromRGBA = ImportYUVAFromRGBA_C; + WebPImportYUVAFromRGBALastLine = ImportYUVAFromRGBALastLine_C; + if (VP8GetCPUInfo != NULL) { #if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { diff --git a/src/dsp/yuv.h b/src/dsp/yuv.h index 57b03dd7..52f9b14d 100644 --- a/src/dsp/yuv.h +++ b/src/dsp/yuv.h @@ -40,6 +40,15 @@ #include "src/dsp/dsp.h" #include "src/webp/types.h" +// Macros to give the offset of each channel in a uint32_t containing ARGB. +#ifdef WORDS_BIGENDIAN +// uint32_t 0xff000000 is 0xff,00,00,00 in memory +#define CHANNEL_OFFSET(i) (i) +#else +// uint32_t 0xff000000 is 0x00,00,00,ff in memory +#define CHANNEL_OFFSET(i) (3 - (i)) +#endif + //------------------------------------------------------------------------------ // YUV -> RGB conversion @@ -221,6 +230,31 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) { return VP8ClipUV(v, rounding); } +extern void (*WebPImportYUVAFromRGBA)( + const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr, + const uint8_t* a_ptr, + int step, // bytes per pixel + int rgb_stride, // bytes per scanline + int has_alpha, int width, int height, uint16_t* tmp_rgb, int y_stride, + int uv_stride, int a_stride, uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v, + uint8_t* dst_a); +extern void (*WebPImportYUVAFromRGBALastLine)( + const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr, + const uint8_t* a_ptr, + int step, // bytes per pixel + int has_alpha, int width, uint16_t* tmp_rgb, uint8_t* dst_y, uint8_t* dst_u, + uint8_t* dst_v, uint8_t* dst_a); + +// Internal function to WebPImportYUVAFromRGBA* that can be reused. 
+void WebPAccumulateRGBA(const uint8_t* const r_ptr, const uint8_t* const g_ptr, + const uint8_t* const b_ptr, const uint8_t* const a_ptr, + int rgb_stride, uint16_t* dst, int width); +void WebPAccumulateRGB(const uint8_t* const r_ptr, const uint8_t* const g_ptr, + const uint8_t* const b_ptr, int step, int rgb_stride, + uint16_t* dst, int width); +// Must be called before calling WebPAccumulateRGB*. +void WebPInitGammaTables(void); + #ifdef __cplusplus } // extern "C" #endif diff --git a/src/enc/picture_csp_enc.c b/src/enc/picture_csp_enc.c index 016b843f..6f9abf59 100644 --- a/src/enc/picture_csp_enc.c +++ b/src/enc/picture_csp_enc.c @@ -32,20 +32,6 @@ #include #endif -// Uncomment to disable gamma-compression during RGB->U/V averaging -#define USE_GAMMA_COMPRESSION - -// If defined, use table to compute x / alpha. -#define USE_INVERSE_ALPHA_TABLE - -#ifdef WORDS_BIGENDIAN -// uint32_t 0xff000000 is 0xff,00,00,00 in memory -#define CHANNEL_OFFSET(i) (i) -#else -// uint32_t 0xff000000 is 0x00,00,00,ff in memory -#define CHANNEL_OFFSET(i) (3 - (i)) -#endif - #define ALPHA_OFFSET CHANNEL_OFFSET(0) //------------------------------------------------------------------------------ @@ -83,91 +69,8 @@ int WebPPictureHasTransparency(const WebPPicture* picture) { picture->a_stride); } -//------------------------------------------------------------------------------ -// Code for gamma correction - -#if defined(USE_GAMMA_COMPRESSION) - -// Gamma correction compensates loss of resolution during chroma subsampling. 
-#define GAMMA_FIX 12 // fixed-point precision for linear values -#define GAMMA_TAB_FIX 7 // fixed-point fractional bits precision -#define GAMMA_TAB_SIZE (1 << (GAMMA_FIX - GAMMA_TAB_FIX)) -static const double kGamma = 0.80; -static const int kGammaScale = ((1 << GAMMA_FIX) - 1); -static const int kGammaTabScale = (1 << GAMMA_TAB_FIX); -static const int kGammaTabRounder = (1 << GAMMA_TAB_FIX >> 1); - -static int kLinearToGammaTab[GAMMA_TAB_SIZE + 1]; -static uint16_t kGammaToLinearTab[256]; -static volatile int kGammaTablesOk = 0; -static void InitGammaTables(void); extern VP8CPUInfo VP8GetCPUInfo; -WEBP_DSP_INIT_FUNC(InitGammaTables) { - if (!kGammaTablesOk) { - int v; - const double scale = (double)(1 << GAMMA_TAB_FIX) / kGammaScale; - const double norm = 1. / 255.; - for (v = 0; v <= 255; ++v) { - kGammaToLinearTab[v] = - (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5); - } - for (v = 0; v <= GAMMA_TAB_SIZE; ++v) { - kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5); - } - kGammaTablesOk = 1; - } -} - -static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { - return kGammaToLinearTab[v]; -} - -static WEBP_INLINE int Interpolate(int v) { - const int tab_pos = v >> (GAMMA_TAB_FIX + 2); // integer part - const int x = v & ((kGammaTabScale << 2) - 1); // fractional part - const int v0 = kLinearToGammaTab[tab_pos]; - const int v1 = kLinearToGammaTab[tab_pos + 1]; - const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x); // interpolate - assert(tab_pos + 1 < GAMMA_TAB_SIZE + 1); - return y; -} - -// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision -// U/V value, suitable for RGBToU/V calls. 
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) { - const int y = Interpolate(base_value << shift); // final uplifted value - return (y + kGammaTabRounder) >> GAMMA_TAB_FIX; // descale -} - -#else - -static void InitGammaTables(void) {} -static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; } -static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) { - return (int)(base_value << shift); -} - -#endif // USE_GAMMA_COMPRESSION - -//------------------------------------------------------------------------------ -// RGB -> YUV conversion - -static int RGBToY(int r, int g, int b, VP8Random* const rg) { - return (rg == NULL) ? VP8RGBToY(r, g, b, YUV_HALF) - : VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX)); -} - -static int RGBToU(int r, int g, int b, VP8Random* const rg) { - return (rg == NULL) ? VP8RGBToU(r, g, b, YUV_HALF << 2) - : VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2)); -} - -static int RGBToV(int r, int g, int b, VP8Random* const rg) { - return (rg == NULL) ? VP8RGBToV(r, g, b, YUV_HALF << 2) - : VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2)); -} - //------------------------------------------------------------------------------ // Sharp RGB->YUV conversion @@ -190,162 +93,6 @@ static int PreprocessARGB(const uint8_t* r_ptr, const uint8_t* g_ptr, return ok; } -//------------------------------------------------------------------------------ -// "Fast" regular RGB->YUV - -#define SUM4(ptr, step) \ - LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[(step)]) + \ - GammaToLinear((ptr)[rgb_stride]) + \ - GammaToLinear((ptr)[rgb_stride + (step)]), \ - 0) - -#define SUM2(ptr) \ - LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1) - -#define SUM2ALPHA(ptr) ((ptr)[0] + (ptr)[rgb_stride]) -#define SUM4ALPHA(ptr) (SUM2ALPHA(ptr) + SUM2ALPHA((ptr) + 4)) - -#if defined(USE_INVERSE_ALPHA_TABLE) - -static const int kAlphaFix = 19; -// Following table is (1 << kAlphaFix) / a. 
The (v * kInvAlpha[a]) >> kAlphaFix -// formula is then equal to v / a in most (99.6%) cases. Note that this table -// and constant are adjusted very tightly to fit 32b arithmetic. -// In particular, they use the fact that the operands for 'v / a' are actually -// derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3 -// with ai in [0..255] and pi in [0..1<> (kAlphaFix - 2)) - -#else - -#define DIVIDE_BY_ALPHA(sum, a) (4 * (sum) / (a)) - -#endif // USE_INVERSE_ALPHA_TABLE - -static WEBP_INLINE int LinearToGammaWeighted(const uint8_t* src, - const uint8_t* a_ptr, - uint32_t total_a, int step, - int rgb_stride) { - const uint32_t sum = - a_ptr[0] * GammaToLinear(src[0]) + - a_ptr[step] * GammaToLinear(src[step]) + - a_ptr[rgb_stride] * GammaToLinear(src[rgb_stride]) + - a_ptr[rgb_stride + step] * GammaToLinear(src[rgb_stride + step]); - assert(total_a > 0 && total_a <= 4 * 0xff); -#if defined(USE_INVERSE_ALPHA_TABLE) - assert((uint64_t)sum * kInvAlpha[total_a] < ((uint64_t)1 << 32)); -#endif - return LinearToGamma(DIVIDE_BY_ALPHA(sum, total_a), 0); -} - static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr, const uint8_t* const g_ptr, const uint8_t* const b_ptr, int step, @@ -353,78 +100,8 @@ static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr, VP8Random* const rg) { int i, j; for (i = 0, j = 0; i < width; i += 1, j += step) { - dst_y[i] = RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], rg); - } -} - -static WEBP_INLINE void AccumulateRGBA(const uint8_t* const r_ptr, - const uint8_t* const g_ptr, - const uint8_t* const b_ptr, - const uint8_t* const a_ptr, - int rgb_stride, uint16_t* dst, - int width) { - int i, j; - // we loop over 2x2 blocks and produce one R/G/B/A value for each. 
- for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * 4, dst += 4) { - const uint32_t a = SUM4ALPHA(a_ptr + j); - int r, g, b; - if (a == 4 * 0xff || a == 0) { - r = SUM4(r_ptr + j, 4); - g = SUM4(g_ptr + j, 4); - b = SUM4(b_ptr + j, 4); - } else { - r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 4, rgb_stride); - g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride); - b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride); - } - dst[0] = r; - dst[1] = g; - dst[2] = b; - dst[3] = a; - } - if (width & 1) { - const uint32_t a = 2u * SUM2ALPHA(a_ptr + j); - int r, g, b; - if (a == 4 * 0xff || a == 0) { - r = SUM2(r_ptr + j); - g = SUM2(g_ptr + j); - b = SUM2(b_ptr + j); - } else { - r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 0, rgb_stride); - g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride); - b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride); - } - dst[0] = r; - dst[1] = g; - dst[2] = b; - dst[3] = a; - } -} - -static WEBP_INLINE void AccumulateRGB(const uint8_t* const r_ptr, - const uint8_t* const g_ptr, - const uint8_t* const b_ptr, int step, - int rgb_stride, uint16_t* dst, - int width) { - int i, j; - for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * step, dst += 4) { - dst[0] = SUM4(r_ptr + j, step); - dst[1] = SUM4(g_ptr + j, step); - dst[2] = SUM4(b_ptr + j, step); - // MemorySanitizer may raise false positives with data that passes through - // RGBA32PackedToPlanar_16b_SSE41() due to incorrect modeling of shuffles. - // See https://crbug.com/webp/573. 
-#ifdef WEBP_MSAN - dst[3] = 0; -#endif - } - if (width & 1) { - dst[0] = SUM2(r_ptr + j); - dst[1] = SUM2(g_ptr + j); - dst[2] = SUM2(b_ptr + j); -#ifdef WEBP_MSAN - dst[3] = 0; -#endif + dst_y[i] = + VP8RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], VP8RandomBits(rg, YUV_FIX)); } } @@ -435,8 +112,8 @@ static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb, int i; for (i = 0; i < width; i += 1, rgb += 4) { const int r = rgb[0], g = rgb[1], b = rgb[2]; - dst_u[i] = RGBToU(r, g, b, rg); - dst_v[i] = RGBToV(r, g, b, rg); + dst_u[i] = VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2)); + dst_v[i] = VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2)); } } @@ -452,7 +129,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, const uint8_t* g_ptr, const int width = picture->width; const int height = picture->height; const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride); - const int is_rgb = (r_ptr < b_ptr); // otherwise it's bgr picture->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420; picture->use_argb = 0; @@ -468,9 +144,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, const uint8_t* g_ptr, } if (has_alpha) { assert(step == 4); -#if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE) - assert(kAlphaFix + GAMMA_FIX <= 31); -#endif } if (use_iterative_conversion) { @@ -499,85 +172,88 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, const uint8_t* g_ptr, rg = &base_rg; } WebPInitConvertARGBToYUV(); - InitGammaTables(); + WebPInitGammaTables(); if (tmp_rgb == NULL) { return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY); } - // Downsample Y/U/V planes, two rows at a time - for (y = 0; y < (height >> 1); ++y) { - int rows_have_alpha = has_alpha; - if (rg == NULL) { - if (is_rgb) { - WebPConvertRGBToY(r_ptr, dst_y, width, step); - WebPConvertRGBToY(r_ptr + rgb_stride, dst_y + picture->y_stride, - width, step); - } else { - WebPConvertBGRToY(b_ptr, dst_y, width, step); - WebPConvertBGRToY(b_ptr + rgb_stride, 
dst_y + picture->y_stride, - width, step); + if (rg == NULL) { + // Downsample Y/U/V planes, two rows at a time + WebPImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride, + has_alpha, width, height, tmp_rgb, + picture->y_stride, picture->uv_stride, + picture->a_stride, dst_y, dst_u, dst_v, dst_a); + if (height & 1) { + dst_y += (height - 1) * picture->y_stride; + dst_u += (height >> 1) * picture->uv_stride; + dst_v += (height >> 1) * picture->uv_stride; + r_ptr += (height - 1) * rgb_stride; + b_ptr += (height - 1) * rgb_stride; + g_ptr += (height - 1) * rgb_stride; + if (has_alpha) { + dst_a += (height - 1) * picture->a_stride; + a_ptr += (height - 1) * rgb_stride; } - } else { + WebPImportYUVAFromRGBALastLine(r_ptr, g_ptr, b_ptr, a_ptr, step, + has_alpha, width, tmp_rgb, dst_y, dst_u, + dst_v, dst_a); + } + } else { + // Copy of WebPImportYUVAFromRGBA/WebPImportYUVAFromRGBALastLine, + // but with dithering. + for (y = 0; y < (height >> 1); ++y) { + int rows_have_alpha = has_alpha; ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg); ConvertRowToY(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride, step, dst_y + picture->y_stride, width, rg); - } - dst_y += 2 * picture->y_stride; - if (has_alpha) { - rows_have_alpha &= !WebPExtractAlpha(a_ptr, rgb_stride, width, 2, dst_a, - picture->a_stride); - dst_a += 2 * picture->a_stride; - } - // Collect averaged R/G/B(/A) - if (!rows_have_alpha) { - AccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, width); - } else { - AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb, width); - } - // Convert to U/V - if (rg == NULL) { - WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width); - } else { - ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg); - } - dst_u += picture->uv_stride; - dst_v += picture->uv_stride; - r_ptr += 2 * rgb_stride; - b_ptr += 2 * rgb_stride; - g_ptr += 2 * rgb_stride; - if (has_alpha) a_ptr += 2 * rgb_stride; - } - if (height & 1) { // extra last row - 
int row_has_alpha = has_alpha; - if (rg == NULL) { - if (is_rgb) { - WebPConvertRGBToY(r_ptr, dst_y, width, step); - } else { - WebPConvertBGRToY(b_ptr, dst_y, width, step); + dst_y += 2 * picture->y_stride; + if (has_alpha) { + rows_have_alpha &= !WebPExtractAlpha(a_ptr, rgb_stride, width, 2, + dst_a, picture->a_stride); + dst_a += 2 * picture->a_stride; } - } else { - ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg); - } - if (row_has_alpha) { - row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0); - } - // Collect averaged R/G/B(/A) - if (!row_has_alpha) { - // Collect averaged R/G/B - AccumulateRGB(r_ptr, g_ptr, b_ptr, step, /* rgb_stride = */ 0, tmp_rgb, - width); - } else { - AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /* rgb_stride = */ 0, - tmp_rgb, width); - } - if (rg == NULL) { - WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width); - } else { + // Collect averaged R/G/B(/A) + if (!rows_have_alpha) { + WebPAccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, + width); + } else { + WebPAccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb, + width); + } + // Convert to U/V ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg); + dst_u += picture->uv_stride; + dst_v += picture->uv_stride; + r_ptr += 2 * rgb_stride; + b_ptr += 2 * rgb_stride; + g_ptr += 2 * rgb_stride; + if (has_alpha) a_ptr += 2 * rgb_stride; + } + if (height & 1) { // extra last row + int row_has_alpha = has_alpha; + ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg); + if (row_has_alpha) { + row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0); + } + // Collect averaged R/G/B(/A) + if (!row_has_alpha) { + // Collect averaged R/G/B + WebPAccumulateRGB(r_ptr, g_ptr, b_ptr, step, /*rgb_stride=*/0, + tmp_rgb, width); + } else { + WebPAccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /*rgb_stride=*/0, + tmp_rgb, width); + } + if (rg == NULL) { + WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width); + } else { + ConvertRowsToUV(tmp_rgb, 
dst_u, dst_v, uv_width, rg); + } } } + WebPSafeFree(tmp_rgb); } return 1;