diff --git a/sharpyuv/sharpyuv.c b/sharpyuv/sharpyuv.c index 302a4f23..84de6b0a 100644 --- a/sharpyuv/sharpyuv.c +++ b/sharpyuv/sharpyuv.c @@ -31,83 +31,114 @@ static const int kMinDimensionIterativeConversion = 4; #define YUV_FIX 16 // fixed-point precision for RGB->YUV static const int kYuvHalf = 1 << (YUV_FIX - 1); -// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some -// banding sometimes. Better use extra precision. -#define SFIX 2 // fixed-point precision of RGB and Y/W -#define MAX_Y_T ((256 << SFIX) - 1) -typedef int16_t fixed_t; // signed type with extra SFIX precision for UV -typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W +// Max bit depth so that intermediate calculations fit in 16 bits. +// TODO(b/194336375): the C code can handle up to 14 bits, but the SIMD code +// currently needs more room. +static const int kMaxBitDepth = 10; -static const int kYuvRounder = (1 << (YUV_FIX + SFIX - 1)); +// Returns the precision shift to use based on the input rgb_bit_depth. +static int GetPrecisionShift(int rgb_bit_depth) { + // Try to add 2 bits of precision if it fits in kMaxBitDepth. Otherwise remove + // bits if needed. + return ((rgb_bit_depth + 2) <= kMaxBitDepth) ? 2 + : (kMaxBitDepth - rgb_bit_depth); +} + +typedef int16_t fixed_t; // signed type with extra precision for UV +typedef uint16_t fixed_y_t; // unsigned type with extra precision for W //------------------------------------------------------------------------------ // Code for gamma correction // Gamma correction compensates loss of resolution during chroma subsampling. 
-static const double kGammaF = 1./0.45; -#define GAMMA_TAB_FIX 8 -#define GAMMA_TAB_SIZE (1 << GAMMA_TAB_FIX) -static uint32_t kLinearToGammaTabS[GAMMA_TAB_SIZE + 2]; -#define GAMMA_TO_LINEAR_BITS 14 -static const int kGammaToLinearHalf = 1 << (GAMMA_TO_LINEAR_BITS - 1); -static uint32_t kGammaToLinearTabS[MAX_Y_T + 1]; // size scales with Y_FIX -static volatile int kGammaTablesSOk = 0; +// Size of pre-computed table for converting from gamma to linear. +#define GAMMA_TO_LINEAR_TAB_BITS 10 +#define GAMMA_TO_LINEAR_TAB_SIZE (1 << GAMMA_TO_LINEAR_TAB_BITS) +static uint32_t kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 2]; +// Size of pre-computed table for converting from linear to gamma. +#define LINEAR_TO_GAMMA_TAB_BITS 8 +#define LINEAR_TO_GAMMA_TAB_SIZE (1 << LINEAR_TO_GAMMA_TAB_BITS) +static uint32_t kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 2]; +static const double kGammaF = 1. / 0.45; +#define GAMMA_TO_LINEAR_BITS 14 + +static volatile int kGammaTablesSOk = 0; static void InitGammaTablesS(void) { assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values if (!kGammaTablesSOk) { int v; - const double norm = 1. / MAX_Y_T; - const double scale = 1. / GAMMA_TAB_SIZE; const double a = 0.09929682680944; const double thresh = 0.018053968510807; - const double final_scale = 1 << GAMMA_TO_LINEAR_BITS; - for (v = 0; v <= MAX_Y_T; ++v) { - const double g = norm * v; - double value; - if (g <= thresh * 4.5) { - value = g / 4.5; - } else { - const double a_rec = 1. / (1. + a); - value = pow(a_rec * (g + a), kGammaF); + // Precompute gamma to linear table. + { + const double norm = 1. / GAMMA_TO_LINEAR_TAB_SIZE; + const double a_rec = 1. / (1. 
+ a); + const double final_scale = 1 << GAMMA_TO_LINEAR_BITS; + for (v = 0; v <= GAMMA_TO_LINEAR_TAB_SIZE; ++v) { + const double g = norm * v; + double value; + if (g <= thresh * 4.5) { + value = g / 4.5; + } else { + value = pow(a_rec * (g + a), kGammaF); + } + kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5); } - kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5); + // to prevent small rounding errors from causing a read overflow: + kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 1] = + kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE]; } - for (v = 0; v <= GAMMA_TAB_SIZE; ++v) { - const double g = scale * v; - double value; - if (g <= thresh) { - value = 4.5 * g; - } else { - value = (1. + a) * pow(g, 1. / kGammaF) - a; + // Precompute linear to gamma table. + { + const double scale = 1. / LINEAR_TO_GAMMA_TAB_SIZE; + for (v = 0; v <= LINEAR_TO_GAMMA_TAB_SIZE; ++v) { + const double g = scale * v; + double value; + if (g <= thresh) { + value = 4.5 * g; + } else { + value = (1. + a) * pow(g, 1. 
/ kGammaF) - a; + } + kLinearToGammaTabS[v] = + (uint32_t)(GAMMA_TO_LINEAR_TAB_SIZE * value + 0.5); } - kLinearToGammaTabS[v] = (uint32_t)(MAX_Y_T * value + 0.5); + // to prevent small rounding errors from causing a read overflow: + kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 1] = + kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE]; } - // to prevent small rounding errors to cause read-overflow: - kLinearToGammaTabS[GAMMA_TAB_SIZE + 1] = kLinearToGammaTabS[GAMMA_TAB_SIZE]; kGammaTablesSOk = 1; } } -// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS -static WEBP_INLINE uint32_t GammaToLinearS(int v) { - return kGammaToLinearTabS[v]; +static WEBP_INLINE uint32_t FixedPointInterpolation(int v, uint32_t* tab, + int tab_pos_shift, + int tab_value_shift) { + const uint32_t tab_pos = v >> tab_pos_shift; + // fractional part, in 'tab_pos_shift' fixed-point precision + const uint32_t x = v - (tab_pos << tab_pos_shift); // fractional part + // v0 / v1: the two neighboring table entries, left-shifted by tab_value_shift + const uint32_t v0 = tab[tab_pos + 0] << tab_value_shift; + const uint32_t v1 = tab[tab_pos + 1] << tab_value_shift; + // Final interpolation. + const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0. + const int half = (tab_pos_shift > 0) ? 1 << (tab_pos_shift - 1) : 0; + const uint32_t result = v0 + ((v2 + half) >> tab_pos_shift); + return result; } -static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) { - // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision - const uint32_t v = value * GAMMA_TAB_SIZE; - const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS; - // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision - const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part - // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1]) - const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0]; - const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1]; - // Final interpolation. 
- const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0. - const uint32_t result = - v0 + ((v2 + kGammaToLinearHalf) >> GAMMA_TO_LINEAR_BITS); - return result; +static WEBP_INLINE uint32_t GammaToLinear(int v, int bit_depth) { + const int shift = GAMMA_TO_LINEAR_TAB_BITS - bit_depth; + if (shift > 0) { + return kGammaToLinearTabS[v << shift]; + } + return FixedPointInterpolation(v, kGammaToLinearTabS, -shift, 0); +} + +static WEBP_INLINE uint32_t LinearToGamma(uint32_t value, int bit_depth) { + const uint32_t v = value << LINEAR_TO_GAMMA_TAB_BITS; + return FixedPointInterpolation(v, kLinearToGammaTabS, GAMMA_TO_LINEAR_BITS, + bit_depth - GAMMA_TO_LINEAR_TAB_BITS); } //------------------------------------------------------------------------------ @@ -116,46 +147,57 @@ static uint8_t clip_8b(fixed_t v) { return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u; } -static fixed_y_t clip_y(int y) { - return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T; +static uint16_t clip(fixed_t v, int max) { + return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v; +} + +static fixed_y_t clip_bit_depth(int y, int bit_depth) { + const int max = (1 << bit_depth) - 1; + return (!(y & ~max)) ? (fixed_y_t)y : (y < 0) ? 
0 : max; } //------------------------------------------------------------------------------ -static int RGBToGray(int r, int g, int b) { - const int luma = 13933 * r + 46871 * g + 4732 * b + kYuvHalf; - return (luma >> YUV_FIX); +static int RGBToGray(int64_t r, int64_t g, int64_t b) { + const int64_t luma = 13933 * r + 46871 * g + 4732 * b + kYuvHalf; + return (int)(luma >> YUV_FIX); } -static uint32_t ScaleDown(int a, int b, int c, int d) { - const uint32_t A = GammaToLinearS(a); - const uint32_t B = GammaToLinearS(b); - const uint32_t C = GammaToLinearS(c); - const uint32_t D = GammaToLinearS(d); - return LinearToGammaS((A + B + C + D + 2) >> 2); +static uint32_t ScaleDown(int a, int b, int c, int d, int rgb_bit_depth) { + const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth); + const uint32_t A = GammaToLinear(a, bit_depth); + const uint32_t B = GammaToLinear(b, bit_depth); + const uint32_t C = GammaToLinear(c, bit_depth); + const uint32_t D = GammaToLinear(d, bit_depth); + return LinearToGamma((A + B + C + D + 2) >> 2, bit_depth); } -static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) { +static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w, + int rgb_bit_depth) { + const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth); int i; for (i = 0; i < w; ++i) { - const uint32_t R = GammaToLinearS(src[0 * w + i]); - const uint32_t G = GammaToLinearS(src[1 * w + i]); - const uint32_t B = GammaToLinearS(src[2 * w + i]); + const uint32_t R = GammaToLinear(src[0 * w + i], bit_depth); + const uint32_t G = GammaToLinear(src[1 * w + i], bit_depth); + const uint32_t B = GammaToLinear(src[2 * w + i], bit_depth); const uint32_t Y = RGBToGray(R, G, B); - dst[i] = (fixed_y_t)LinearToGammaS(Y); + dst[i] = (fixed_y_t)LinearToGamma(Y, bit_depth); } } static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2, - fixed_t* dst, int uv_w) { + fixed_t* dst, int uv_w, int rgb_bit_depth) { int i; for 
(i = 0; i < uv_w; ++i) { - const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1], - src2[0 * uv_w + 0], src2[0 * uv_w + 1]); - const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1], - src2[2 * uv_w + 0], src2[2 * uv_w + 1]); - const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1], - src2[4 * uv_w + 0], src2[4 * uv_w + 1]); + const int r = + ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1], src2[0 * uv_w + 0], + src2[0 * uv_w + 1], rgb_bit_depth); + const int g = + ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1], src2[2 * uv_w + 0], + src2[2 * uv_w + 1], rgb_bit_depth); + const int b = + ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1], src2[4 * uv_w + 0], + src2[4 * uv_w + 1], rgb_bit_depth); const int W = RGBToGray(r, g, b); dst[0 * uv_w] = (fixed_t)(r - W); dst[1 * uv_w] = (fixed_t)(g - W); @@ -176,30 +218,50 @@ static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) { //------------------------------------------------------------------------------ -static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) { +static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) { const int v0 = (A * 3 + B + 2) >> 2; - return clip_y(v0 + W0); + return clip_bit_depth(v0 + W0, bit_depth); } //------------------------------------------------------------------------------ -static WEBP_INLINE fixed_y_t UpLift(uint8_t a) { // 8bit -> SFIX - return ((fixed_y_t)a << SFIX); +static WEBP_INLINE int Shift(int v, int shift) { + return (shift >= 0) ? 
(v << shift) : (v >> -shift); +} + +static WEBP_INLINE fixed_y_t ChangePrecision(uint16_t a, int shift) { + if (shift == 0) return a; + if (shift < 0) { + const int rounding = 1 << (-shift - 1); + return (a + rounding) >> -shift; + } + return ((fixed_y_t)a << shift); } static void ImportOneRow(const uint8_t* const r_ptr, const uint8_t* const g_ptr, const uint8_t* const b_ptr, - int step, + int rgb_step, + int rgb_bit_depth, int pic_width, fixed_y_t* const dst) { + // Convert the rgb_step from a number of bytes to a number of uint8_t or + // uint16_t values depending on the bit depth. + const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step; int i; const int w = (pic_width + 1) & ~1; for (i = 0; i < pic_width; ++i) { const int off = i * step; - dst[i + 0 * w] = UpLift(r_ptr[off]); - dst[i + 1 * w] = UpLift(g_ptr[off]); - dst[i + 2 * w] = UpLift(b_ptr[off]); + const int shift = GetPrecisionShift(rgb_bit_depth); + if (rgb_bit_depth == 8) { + dst[i + 0 * w] = ChangePrecision(r_ptr[off], shift); + dst[i + 1 * w] = ChangePrecision(g_ptr[off], shift); + dst[i + 2 * w] = ChangePrecision(b_ptr[off], shift); + } else { + dst[i + 0 * w] = ChangePrecision(((uint16_t*)r_ptr)[off], shift); + dst[i + 1 * w] = ChangePrecision(((uint16_t*)g_ptr)[off], shift); + dst[i + 2 * w] = ChangePrecision(((uint16_t*)b_ptr)[off], shift); + } } if (pic_width & 1) { // replicate rightmost pixel dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1]; @@ -214,24 +276,28 @@ static void InterpolateTwoRows(const fixed_y_t* const best_y, const fixed_t* next_uv, int w, fixed_y_t* out1, - fixed_y_t* out2) { + fixed_y_t* out2, + int rgb_bit_depth) { const int uv_w = w >> 1; const int len = (w - 1) >> 1; // length to filter int k = 3; + const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth); while (k-- > 0) { // process each R/G/B segments in turn // special boundary case for i==0 - out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]); - out2[0] = Filter2(cur_uv[0], next_uv[0], 
best_y[w]); + out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0], bit_depth); + out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w], bit_depth); - SharpYuvFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1); - SharpYuvFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1); + SharpYuvFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1, + bit_depth); + SharpYuvFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1, + bit_depth); // special boundary case for i == w - 1 when w is even if (!(w & 1)) { out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1], - best_y[w - 1 + 0]); + best_y[w - 1 + 0], bit_depth); out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1], - best_y[w - 1 + w]); + best_y[w - 1 + w], bit_depth); } out1 += w; out2 += w; @@ -241,17 +307,19 @@ static void InterpolateTwoRows(const fixed_y_t* const best_y, } } -static WEBP_INLINE uint8_t RGBToYUVComponent(int r, int g, int b, - const int coeffs[4]) { +static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b, + const int coeffs[4], int sfix) { + const int srounder = 1 << (YUV_FIX + sfix - 1); const int luma = coeffs[0] * r + coeffs[1] * g + coeffs[2] * b + - (coeffs[3] << SFIX) + kYuvRounder; - return clip_8b((luma >> (YUV_FIX + SFIX))); + coeffs[3] + srounder; + return (luma >> (YUV_FIX + sfix)); } static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv, - uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, - int dst_stride_u, uint8_t* dst_v, int dst_stride_v, - int width, int height, + uint8_t* y_ptr, int y_stride, uint8_t* u_ptr, + int u_stride, uint8_t* v_ptr, int v_stride, + int rgb_bit_depth, + int yuv_bit_depth, int width, int height, const SharpYuvConversionMatrix* yuv_matrix) { int i, j; const fixed_t* const best_uv_base = best_uv; @@ -259,6 +327,9 @@ static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv, const int h = (height + 1) & ~1; const int uv_w = w >> 1; const int uv_h = h >> 1; + const int sfix = 
GetPrecisionShift(rgb_bit_depth); + const int yuv_max = (1 << yuv_bit_depth) - 1; + for (best_uv = best_uv_base, j = 0; j < height; ++j) { for (i = 0; i < width; ++i) { const int off = (i >> 1); @@ -266,24 +337,38 @@ static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv, const int r = best_uv[off + 0 * uv_w] + W; const int g = best_uv[off + 1 * uv_w] + W; const int b = best_uv[off + 2 * uv_w] + W; - dst_y[i] = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y); + const int y = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix); + if (yuv_bit_depth <= 8) { + y_ptr[i] = clip_8b(y); + } else { + ((uint16_t*)y_ptr)[i] = clip(y, yuv_max); + } } best_y += w; best_uv += (j & 1) * 3 * uv_w; - dst_y += dst_stride_y; + y_ptr += y_stride; } for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) { for (i = 0; i < uv_w; ++i) { const int off = i; + // Note r, g and b values here are off by W, but a constant offset on all + // 3 components doesn't change the value of u and v with a YCbCr matrix. 
const int r = best_uv[off + 0 * uv_w]; const int g = best_uv[off + 1 * uv_w]; const int b = best_uv[off + 2 * uv_w]; - dst_u[i] = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u); - dst_v[i] = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v); + const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix); + const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix); + if (yuv_bit_depth <= 8) { + u_ptr[i] = clip_8b(u); + v_ptr[i] = clip_8b(v); + } else { + ((uint16_t*)u_ptr)[i] = clip(u, yuv_max); + ((uint16_t*)v_ptr)[i] = clip(v, yuv_max); + } } best_uv += 3 * uv_w; - dst_u += dst_stride_u; - dst_v += dst_stride_v; + u_ptr += u_stride; + v_ptr += v_stride; } return 1; } @@ -300,10 +385,11 @@ static void* SafeMalloc(uint64_t nmemb, size_t size) { #define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((W) * (H), sizeof(T))) static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr, - const uint8_t* b_ptr, int step, int rgb_stride, - uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, - int dst_stride_u, uint8_t* dst_v, int dst_stride_v, - int width, int height, + const uint8_t* b_ptr, int rgb_step, int rgb_stride, + int rgb_bit_depth, uint8_t* y_ptr, int y_stride, + uint8_t* u_ptr, int u_stride, uint8_t* v_ptr, + int v_stride, int yuv_bit_depth, int width, + int height, const SharpYuvConversionMatrix* yuv_matrix) { // we expand the right/bottom border if needed const int w = (width + 1) & ~1; @@ -344,19 +430,20 @@ static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr, fixed_y_t* const src2 = tmp_buffer + 3 * w; // prepare two rows of input - ImportOneRow(r_ptr, g_ptr, b_ptr, step, width, src1); + ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width, + src1); if (!is_last_row) { ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride, - step, width, src2); + rgb_step, rgb_bit_depth, width, src2); } else { memcpy(src2, src1, 3 * w * sizeof(*src2)); } StoreGray(src1, best_y + 0, w); StoreGray(src2, best_y + 
w, w); - UpdateW(src1, target_y, w); - UpdateW(src2, target_y + w, w); - UpdateChroma(src1, src2, target_uv, uv_w); + UpdateW(src1, target_y, w, rgb_bit_depth); + UpdateW(src2, target_y + w, w, rgb_bit_depth); + UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth); memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv)); best_y += 2 * w; best_uv += 3 * uv_w; @@ -382,17 +469,20 @@ static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr, fixed_y_t* const src2 = tmp_buffer + 3 * w; { const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0); - InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2); + InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, + src1, src2, rgb_bit_depth); prev_uv = cur_uv; cur_uv = next_uv; } - UpdateW(src1, best_rgb_y + 0 * w, w); - UpdateW(src2, best_rgb_y + 1 * w, w); - UpdateChroma(src1, src2, best_rgb_uv, uv_w); + UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth); + UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth); + UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth); // update two rows of Y and one row of RGB - diff_y_sum += SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w); + diff_y_sum += + SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w, + rgb_bit_depth + GetPrecisionShift(rgb_bit_depth)); SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w); best_y += 2 * w; @@ -407,10 +497,11 @@ static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr, } prev_diff_y_sum = diff_y_sum; } + // final reconstruction - ok = ConvertWRGBToYUV(best_y_base, best_uv_base, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, - yuv_matrix); + ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr, + u_stride, v_ptr, v_stride, rgb_bit_depth, yuv_bit_depth, + width, height, yuv_matrix); End: free(best_y_base); @@ -444,20 +535,66 @@ void SharpYuvInit(VP8CPUInfo cpu_info_func) { sharpyuv_last_cpuinfo_used = cpu_info_func; } -int 
SharpYuvConvert(const uint8_t* r_ptr, const uint8_t* g_ptr, - const uint8_t* b_ptr, int step, int rgb_stride, - uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, - int dst_stride_u, uint8_t* dst_v, int dst_stride_v, - int width, int height, - const SharpYuvConversionMatrix* yuv_matrix) { +int SharpYuvConvert(const void* r_ptr, const void* g_ptr, + const void* b_ptr, int rgb_step, int rgb_stride, + int rgb_bit_depth, void* y_ptr, int y_stride, + void* u_ptr, int u_stride, void* v_ptr, + int v_stride, int yuv_bit_depth, int width, + int height, const SharpYuvConversionMatrix* yuv_matrix) { + SharpYuvConversionMatrix scaled_matrix; + const int rgb_max = (1 << rgb_bit_depth) - 1; + const int rgb_round = 1 << (rgb_bit_depth - 1); + const int yuv_max = (1 << yuv_bit_depth) - 1; + const int sfix = GetPrecisionShift(rgb_bit_depth); + if (width < kMinDimensionIterativeConversion || - height < kMinDimensionIterativeConversion) { + height < kMinDimensionIterativeConversion || + r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL || + u_ptr == NULL || v_ptr == NULL) { + return 0; + } + if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 && + rgb_bit_depth != 16) { + return 0; + } + if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) { + return 0; + } + if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride %2 != 0)) { + // Step/stride should be even for uint16_t buffers. + return 0; + } + if (yuv_bit_depth > 8 && + (y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) { + // Stride should be even for uint16_t buffers. return 0; } SharpYuvInit(NULL); - return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, step, rgb_stride, dst_y, - dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, width, height, yuv_matrix); + + // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the + // rgb->yuv conversion matrix. 
+ if (rgb_bit_depth == yuv_bit_depth) { + memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix)); + } else { + int i; + for (i = 0; i < 3; ++i) { + scaled_matrix.rgb_to_y[i] = + (yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max; + scaled_matrix.rgb_to_u[i] = + (yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max; + scaled_matrix.rgb_to_v[i] = + (yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max; + } + } + // Also incorporate precision change scaling. + scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix); + scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix); + scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix); + + return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride, + rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride, + v_ptr, v_stride, yuv_bit_depth, width, height, + &scaled_matrix); } //------------------------------------------------------------------------------ diff --git a/sharpyuv/sharpyuv.h b/sharpyuv/sharpyuv.h index d0ef379f..70f9b998 100644 --- a/sharpyuv/sharpyuv.h +++ b/sharpyuv/sharpyuv.h @@ -35,15 +35,33 @@ typedef struct { // Assumes that the image will be upsampled using a bilinear filter. If nearest // neighbor is used instead, the upsampled image might look worse than with // standard downsampling. -// TODO(maryla): add 10 bits support. Add YUV444 to YUV420 conversion. -// Maybe also add 422 support (it's rarely used in practice, especially for -// images). -int SharpYuvConvert(const uint8_t* r_ptr, const uint8_t* g_ptr, - const uint8_t* b_ptr, int step, int rgb_stride, - uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, - int dst_stride_u, uint8_t* dst_v, int dst_stride_v, - int width, int height, - const SharpYuvConversionMatrix* yuv_matrix); +// r_ptr, g_ptr, b_ptr: pointers to the source r, g and b channels. Should point +// to uint8_t buffers if rgb_bit_depth is 8, or uint16_t buffers otherwise. 
+// rgb_step: distance in bytes between two horizontally adjacent pixels on the +// r, g and b channels. If rgb_bit_depth is > 8, it should be a +// multiple of 2. +// rgb_stride: distance in bytes between two vertically adjacent pixels on the +// r, g, and b channels. If rgb_bit_depth is > 8, it should be a +// multiple of 2. +// rgb_bit_depth: number of bits for each r/g/b value. One of: 8, 10, 12, 16. +// Note: input deeper than 10 bits is reduced to 10 bits of precision. +// TODO(b/194336375): increase precision. +// yuv_bit_depth: number of bits for each y/u/v value. One of: 8, 10, 12. +// y_ptr, u_ptr, v_ptr: pointers to the destination y, u and v channels. Should +// point to uint8_t buffers if yuv_bit_depth is 8, or uint16_t buffers +// otherwise. +// y_stride, u_stride, v_stride: distance in bytes between two vertically +// adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they +// should be multiples of 2. +// width, height: width and height of the image in pixels +int SharpYuvConvert(const void* r_ptr, const void* g_ptr, const void* b_ptr, + int rgb_step, int rgb_stride, int rgb_bit_depth, + void* y_ptr, int y_stride, void* u_ptr, int u_stride, + void* v_ptr, int v_stride, int yuv_bit_depth, int width, + int height, const SharpYuvConversionMatrix* yuv_matrix); + +// TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422 +// support (it's rarely used in practice, especially for images). 
#ifdef __cplusplus } // extern "C" diff --git a/sharpyuv/sharpyuv_csp.c b/sharpyuv/sharpyuv_csp.c index 4dc8142c..5334fa64 100644 --- a/sharpyuv/sharpyuv_csp.c +++ b/sharpyuv/sharpyuv_csp.c @@ -15,7 +15,7 @@ #include #include -static int ToFixed16(float f) { return (int)round(f * (1 << 16)); } +static int ToFixed16(float f) { return (int)floor(f * (1 << 16) + 0.5f); } void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space, SharpYuvConversionMatrix* matrix) { @@ -25,28 +25,27 @@ void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space, const float cr = 0.5f / (1.0f - kb); const float cb = 0.5f / (1.0f - kr); - const int shift = yuv_color_space->bits - 8; + const int shift = yuv_color_space->bit_depth - 8; - const float denom = (float)((1 << yuv_color_space->bits) - 1); + const float denom = (float)((1 << yuv_color_space->bit_depth) - 1); float scale_y = 1.0f; - float addY = 0.0f; + float add_y = 0.0f; float scale_u = cr; float scale_v = cb; float add_uv = (float)(128 << shift); - - assert(yuv_color_space->bits >= 8); + assert(yuv_color_space->bit_depth >= 8); if (yuv_color_space->range == kSharpYuvRangeLimited) { scale_y *= (219 << shift) / denom; scale_u *= (224 << shift) / denom; scale_v *= (224 << shift) / denom; - addY = (float)(16 << shift); + add_y = (float)(16 << shift); } matrix->rgb_to_y[0] = ToFixed16(kr * scale_y); matrix->rgb_to_y[1] = ToFixed16(kg * scale_y); matrix->rgb_to_y[2] = ToFixed16(kb * scale_y); - matrix->rgb_to_y[3] = ToFixed16(addY); + matrix->rgb_to_y[3] = ToFixed16(add_y); matrix->rgb_to_u[0] = ToFixed16(-kr * scale_u); matrix->rgb_to_u[1] = ToFixed16(-kg * scale_u); diff --git a/sharpyuv/sharpyuv_csp.h b/sharpyuv/sharpyuv_csp.h index 153c1156..63c99ef5 100644 --- a/sharpyuv/sharpyuv_csp.h +++ b/sharpyuv/sharpyuv_csp.h @@ -30,7 +30,7 @@ typedef struct { // Y = Kr * r + Kg * g + Kb * b where Kg = 1 - Kr - Kb. float kr; float kb; - int bits; // Only 8 bit is supported by SharpYuvConvert. 
+ int bit_depth; // 8, 10 or 12 SharpYuvRange range; } SharpYuvColorSpace; diff --git a/sharpyuv/sharpyuv_dsp.c b/sharpyuv/sharpyuv_dsp.c index 93f1f01e..956fa7ce 100644 --- a/sharpyuv/sharpyuv_dsp.c +++ b/sharpyuv/sharpyuv_dsp.c @@ -21,19 +21,19 @@ //----------------------------------------------------------------------------- #if !WEBP_NEON_OMIT_C_CODE -#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic -static uint16_t clip_y(int v) { - return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +static uint16_t clip(int v, int max) { + return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v; } static uint64_t SharpYuvUpdateY_C(const uint16_t* ref, const uint16_t* src, - uint16_t* dst, int len) { + uint16_t* dst, int len, int bit_depth) { uint64_t diff = 0; int i; + const int max_y = (1 << bit_depth) - 1; for (i = 0; i < len; ++i) { const int diff_y = ref[i] - src[i]; const int new_y = (int)dst[i] + diff_y; - dst[i] = clip_y(new_y); + dst[i] = clip(new_y, max_y); diff += (uint64_t)abs(diff_y); } return diff; @@ -49,27 +49,28 @@ static void SharpYuvUpdateRGB_C(const int16_t* ref, const int16_t* src, } static void SharpYuvFilterRow_C(const int16_t* A, const int16_t* B, int len, - const uint16_t* best_y, uint16_t* out) { + const uint16_t* best_y, uint16_t* out, + int bit_depth) { int i; + const int max_y = (1 << bit_depth) - 1; for (i = 0; i < len; ++i, ++A, ++B) { const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4; const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4; - out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); - out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); + out[2 * i + 0] = clip(best_y[2 * i + 0] + v0, max_y); + out[2 * i + 1] = clip(best_y[2 * i + 1] + v1, max_y); } } #endif // !WEBP_NEON_OMIT_C_CODE -#undef MAX_Y - //----------------------------------------------------------------------------- uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref, - uint16_t* dst, int len); + uint16_t* dst, int len, int 
bit_depth); void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst, int len); void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len, - const uint16_t* best_y, uint16_t* out); + const uint16_t* best_y, uint16_t* out, + int bit_depth); extern void InitSharpYuvSSE2(void); extern void InitSharpYuvNEON(void); diff --git a/sharpyuv/sharpyuv_dsp.h b/sharpyuv/sharpyuv_dsp.h index 5cacd27c..e561d8d3 100644 --- a/sharpyuv/sharpyuv_dsp.h +++ b/sharpyuv/sharpyuv_dsp.h @@ -17,11 +17,12 @@ #include "src/dsp/cpu.h" extern uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref, - uint16_t* dst, int len); + uint16_t* dst, int len, int bit_depth); extern void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst, int len); extern void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len, - const uint16_t* best_y, uint16_t* out); + const uint16_t* best_y, uint16_t* out, + int bit_depth); void SharpYuvInitDsp(VP8CPUInfo cpu_info_func); diff --git a/sharpyuv/sharpyuv_neon.c b/sharpyuv/sharpyuv_neon.c index d9ff24af..e15ec8a3 100644 --- a/sharpyuv/sharpyuv_neon.c +++ b/sharpyuv/sharpyuv_neon.c @@ -23,16 +23,16 @@ extern void InitSharpYuvNEON(void); #if defined(WEBP_USE_NEON) -#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic -static uint16_t clip_y_NEON(int v) { - return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +static uint16_t clip_NEON(int v, int max) { + return (v < 0) ? 0 : (v > max) ? 
max : (uint16_t)v; } static uint64_t SharpYuvUpdateY_NEON(const uint16_t* ref, const uint16_t* src, - uint16_t* dst, int len) { + uint16_t* dst, int len, int bit_depth) { + const int max_y = (1 << bit_depth) - 1; int i; const int16x8_t zero = vdupq_n_s16(0); - const int16x8_t max = vdupq_n_s16(MAX_Y); + const int16x8_t max = vdupq_n_s16(max_y); uint64x2_t sum = vdupq_n_u64(0); uint64_t diff; @@ -52,7 +52,7 @@ static uint64_t SharpYuvUpdateY_NEON(const uint16_t* ref, const uint16_t* src, for (; i < len; ++i) { const int diff_y = ref[i] - src[i]; const int new_y = (int)(dst[i]) + diff_y; - dst[i] = clip_y_NEON(new_y); + dst[i] = clip_NEON(new_y, max_y); diff += (uint64_t)(abs(diff_y)); } return diff; @@ -76,9 +76,11 @@ static void SharpYuvUpdateRGB_NEON(const int16_t* ref, const int16_t* src, } static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len, - const uint16_t* best_y, uint16_t* out) { + const uint16_t* best_y, uint16_t* out, + int bit_depth) { + const int max_y = (1 << bit_depth) - 1; int i; - const int16x8_t max = vdupq_n_s16(MAX_Y); + const int16x8_t max = vdupq_n_s16(max_y); const int16x8_t zero = vdupq_n_s16(0); for (i = 0; i + 8 <= len; i += 8) { const int16x8_t a0 = vld1q_s16(A + i + 0); @@ -112,11 +114,10 @@ static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len, const int a0a1b0b1 = a0b1 + a1b0 + 8; const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; - out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0); - out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1); + out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y); + out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y); } } -#undef MAX_Y //------------------------------------------------------------------------------ diff --git a/sharpyuv/sharpyuv_sse2.c b/sharpyuv/sharpyuv_sse2.c index b52a1910..cfa519dc 100644 --- a/sharpyuv/sharpyuv_sse2.c +++ b/sharpyuv/sharpyuv_sse2.c @@ -22,18 
+22,18 @@ extern void InitSharpYuvSSE2(void); #if defined(WEBP_USE_SSE2) -#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic -static uint16_t clip_y(int v) { - return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +static uint16_t clip_SSE2(int v, int max) { + return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v; } static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src, - uint16_t* dst, int len) { + uint16_t* dst, int len, int bit_depth) { + const int max_y = (1 << bit_depth) - 1; uint64_t diff = 0; uint32_t tmp[4]; int i; const __m128i zero = _mm_setzero_si128(); - const __m128i max = _mm_set1_epi16(MAX_Y); + const __m128i max = _mm_set1_epi16(max_y); const __m128i one = _mm_set1_epi16(1); __m128i sum = zero; @@ -55,7 +55,7 @@ static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src, for (; i < len; ++i) { const int diff_y = ref[i] - src[i]; const int new_y = (int)dst[i] + diff_y; - dst[i] = clip_y(new_y); + dst[i] = clip_SSE2(new_y, max_y); diff += (uint64_t)abs(diff_y); } return diff; @@ -79,10 +79,12 @@ static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src, } static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, - const uint16_t* best_y, uint16_t* out) { + const uint16_t* best_y, uint16_t* out, + int bit_depth) { + const int max_y = (1 << bit_depth) - 1; int i; const __m128i kCst8 = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(MAX_Y); + const __m128i max = _mm_set1_epi16(max_y); const __m128i zero = _mm_setzero_si128(); for (i = 0; i + 8 <= len; i += 8) { const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0)); @@ -121,11 +123,10 @@ static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, const int a0a1b0b1 = a0b1 + a1b0 + 8; const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; - out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); - out[2 * i + 1] = 
clip_y(best_y[2 * i + 1] + v1); + out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y); + out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y); } } -#undef MAX_Y //------------------------------------------------------------------------------ diff --git a/src/enc/picture_csp_enc.c b/src/enc/picture_csp_enc.c index 08826331..5e60f5ba 100644 --- a/src/enc/picture_csp_enc.c +++ b/src/enc/picture_csp_enc.c @@ -191,10 +191,10 @@ static int PreprocessARGB(const uint8_t* r_ptr, int step, int rgb_stride, WebPPicture* const picture) { const int ok = SharpYuvConvert( - r_ptr, g_ptr, b_ptr, step, rgb_stride, picture->y, picture->y_stride, - picture->u, picture->uv_stride, picture->v, picture->uv_stride, - picture->width, picture->height, - SharpYuvGetConversionMatrix(kSharpYuvMatrixWebp)); + r_ptr, g_ptr, b_ptr, step, rgb_stride, /*rgb_bit_depth=*/8, + picture->y, picture->y_stride, picture->u, picture->uv_stride, picture->v, + picture->uv_stride, /*yuv_bit_depth=*/8, picture->width, + picture->height, SharpYuvGetConversionMatrix(kSharpYuvMatrixWebp)); if (!ok) { return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY); }