Have lossless use ImportYUVAFromRGB

There was duplicated functionality of lower quality, which could cause a lossless WebP decoded directly to YUV to differ from the same lossless WebP decoded to PNG and then converted to YUV. The rescaler is not using it yet.

Bug: 432241412
Change-Id: Id794880957935b69729d4b34ae453551d13364dc
2026-02-28 04:02:19 +01:00 · 2025-08-25 12:52:26 +02:00
parent fd2c2cc05b
commit 0d14d84bdb
5 changed files with 611 additions and 407 deletions
--- a/src/dec/vp8l_dec.c
+++ b/src/dec/vp8l_dec.c
@@ -24,6 +24,7 @@
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
+#include "src/dsp/yuv.h"
 #include "src/utils/bit_reader_utils.h"
 #include "src/utils/color_cache_utils.h"
 #include "src/utils/huffman_utils.h"
@@ -703,13 +704,71 @@ static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec, uint8_t* in,
  return y_pos;
 }

-static int EmitRowsYUVA(const VP8LDecoder* const dec, const uint8_t* in,
-                        int in_stride, int mb_w, int num_rows) {
+// Returns true if alpha[] has non-0xff values.
+static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
+                          int y_step) {
+  WebPInitAlphaProcessing();
+  for (; height-- > 0; alpha += y_step) {
+    if (WebPHasAlpha8b(alpha, width)) return 1;
+  }
+  return 0;
+}
+
+static int EmitRowsYUVA(const uint8_t* const in, const VP8Io* const io,
+                        int in_stride, uint16_t* tmp_rgb,
+                        VP8LDecoder* const dec) {
  int y_pos = dec->last_out_row;
-  while (num_rows-- > 0) {
-    ConvertToYUVA((const uint32_t*)in, mb_w, y_pos, dec->output);
-    in += in_stride;
-    ++y_pos;
+  const int width = io->mb_w;
+  int num_rows = io->mb_h;
+  const int y_pos_final = y_pos + num_rows;
+  const int y_stride = dec->output->u.YUVA.y_stride;
+  const int uv_stride = dec->output->u.YUVA.u_stride;
+  const int a_stride = dec->output->u.YUVA.a_stride;
+  uint8_t* dst_a = dec->output->u.YUVA.a;
+  uint8_t* dst_y = dec->output->u.YUVA.y + y_pos * y_stride;
+  uint8_t* dst_u = dec->output->u.YUVA.u + (y_pos >> 1) * uv_stride;
+  uint8_t* dst_v = dec->output->u.YUVA.v + (y_pos >> 1) * uv_stride;
+  const uint8_t* r_ptr = in + CHANNEL_OFFSET(1);
+  const uint8_t* g_ptr = in + CHANNEL_OFFSET(2);
+  const uint8_t* b_ptr = in + CHANNEL_OFFSET(3);
+  const uint8_t* a_ptr = NULL;
+  int has_alpha = 0;
+
+  // Make sure the lines are processed two by two from the start.
+  assert(y_pos % 2 == 0);
+
+  // Make sure num_rows is even. y_pos_final is used below to detect if it was not.
+  num_rows &= ~1;
+
+  if (dst_a) {
+    dst_a += y_pos * a_stride;
+    a_ptr = in + CHANNEL_OFFSET(0);
+    has_alpha = CheckNonOpaque(a_ptr, width, num_rows, in_stride);
+  }
+  // Process pairs of lines.
+  WebPImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /*step=*/4, in_stride,
+                         has_alpha, width, num_rows, tmp_rgb, y_stride,
+                         uv_stride, a_stride, dst_y, dst_u, dst_v, dst_a);
+
+  y_pos += num_rows;
+  if (y_pos_final == io->crop_bottom - io->crop_top && y_pos < y_pos_final) {
+    assert(y_pos + 1 == y_pos_final);
+    // If we output the last line of an image with odd height.
+    dst_y += num_rows * y_stride;
+    dst_u += (num_rows >> 1) * uv_stride;
+    dst_v += (num_rows >> 1) * uv_stride;
+    r_ptr += num_rows * in_stride;
+    g_ptr += num_rows * in_stride;
+    b_ptr += num_rows * in_stride;
+    if (dst_a) {
+      dst_a += num_rows * a_stride;
+      a_ptr += num_rows * in_stride;
+      has_alpha = CheckNonOpaque(a_ptr, width, /*height=*/1, in_stride);
+    }
+    WebPImportYUVAFromRGBALastLine(r_ptr, g_ptr, b_ptr, a_ptr, /*step=*/4,
+                                   has_alpha, width, tmp_rgb, dst_y, dst_u,
+                                   dst_v, dst_a);
+    y_pos = y_pos_final;
  }
  return y_pos;
 }
@@ -789,8 +848,17 @@ static void ApplyInverseTransforms(VP8LDecoder* const dec, int start_row,
 // last call.
 static void ProcessRows(VP8LDecoder* const dec, int row) {
  const uint32_t* const rows = dec->pixels + dec->width * dec->last_row;
-  const int num_rows = row - dec->last_row;
+  int num_rows;

+  // In case of YUV conversion and if we do not need to get to the last row.
+  if (!WebPIsRGBMode(dec->output->colorspace) && row >= dec->io->crop_top &&
+      row < dec->io->crop_bottom) {
+    // Make sure the number of rows to process is even.
+    if ((row - dec->io->crop_top) % 2 == 1) {
+      --row;
+    }
+  }
+  num_rows = row - dec->last_row;
  assert(row <= dec->io->crop_bottom);
  // We can't process more than NUM_ARGB_CACHE_ROWS at a time (that's the size
  // of argb_cache), but we currently don't need more than that.
@@ -822,7 +890,8 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
        dec->last_out_row =
            io->use_scaling
                ? EmitRescaledRowsYUVA(dec, rows_data, in_stride, io->mb_h)
-                : EmitRowsYUVA(dec, rows_data, in_stride, io->mb_w, io->mb_h);
+                : EmitRowsYUVA(rows_data, io, in_stride,
+                               dec->accumulated_rgb_pixels, dec);
      }
      assert(dec->last_out_row <= output->height);
    }
@@ -1526,9 +1595,16 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
  const uint64_t cache_top_pixels = (uint16_t)final_width;
  // Scratch buffer for temporary BGRA storage. Not needed for paletted alpha.
  const uint64_t cache_pixels = (uint64_t)final_width * NUM_ARGB_CACHE_ROWS;
-  const uint64_t total_num_pixels =
-      num_pixels + cache_top_pixels + cache_pixels;
-
+  // Scratch buffer to accumulate RGBA values (hence 4*) for YUV conversion.
+  uint64_t accumulated_rgb_pixels = 0;
+  uint64_t total_num_pixels;
+  if (dec->output != NULL && !WebPIsRGBMode(dec->output->colorspace)) {
+    const int uv_width = (dec->io->crop_right - dec->io->crop_left + 1) >> 1;
+    accumulated_rgb_pixels =
+        4 * uv_width * sizeof(*dec->accumulated_rgb_pixels) / sizeof(uint32_t);
+  }
+  total_num_pixels =
+      num_pixels + cache_top_pixels + cache_pixels + accumulated_rgb_pixels;
  assert(dec->width <= final_width);
  dec->pixels = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t));
  if (dec->pixels == NULL) {
@@ -1536,6 +1612,12 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
    return VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
  }
  dec->argb_cache = dec->pixels + num_pixels + cache_top_pixels;
+  dec->accumulated_rgb_pixels =
+      accumulated_rgb_pixels == 0
+          ? NULL
+          : (uint16_t*)(dec->pixels + num_pixels + cache_top_pixels +
+                        cache_pixels);
+
  return 1;
 }

--- a/src/dec/vp8li_dec.h
+++ b/src/dec/vp8li_dec.h
@@ -67,6 +67,8 @@ struct VP8LDecoder {
  uint32_t* pixels;      // Internal data: either uint8_t* for alpha
                         // or uint32_t* for BGRA.
  uint32_t* argb_cache;  // Scratch buffer for temporary BGRA storage.
+  uint16_t* accumulated_rgb_pixels;  // Scratch buffer for accumulated RGB for
+                                     // YUV conversion.

  VP8LBitReader br;
  int incremental;         // if true, incremental decoding is expected
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@@ -21,6 +21,16 @@
 #include "src/webp/decode.h"
 #include "src/webp/types.h"

+// Comment out to disable gamma-compression during RGB->U/V averaging
+#define USE_GAMMA_COMPRESSION
+
+// If defined, use table to compute x / alpha.
+#define USE_INVERSE_ALPHA_TABLE
+
+#ifdef USE_GAMMA_COMPRESSION
+#include <math.h>
+#endif
+
 //-----------------------------------------------------------------------------
 // Plain-C version

@@ -204,6 +214,388 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
  }
 }

+//------------------------------------------------------------------------------
+// Code for gamma correction
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// Gamma correction compensates loss of resolution during chroma subsampling.
+#define GAMMA_FIX 12     // fixed-point precision for linear values
+#define GAMMA_TAB_FIX 7  // fixed-point fractional bits precision
+#define GAMMA_TAB_SIZE (1 << (GAMMA_FIX - GAMMA_TAB_FIX))
+static const double kGamma = 0.80;
+static const int kGammaScale = ((1 << GAMMA_FIX) - 1);
+static const int kGammaTabScale = (1 << GAMMA_TAB_FIX);
+static const int kGammaTabRounder = (1 << GAMMA_TAB_FIX >> 1);
+
+static int kLinearToGammaTab[GAMMA_TAB_SIZE + 1];
+static uint16_t kGammaToLinearTab[256];
+static volatile int kGammaTablesOk = 0;
+extern VP8CPUInfo VP8GetCPUInfo;
+
+WEBP_DSP_INIT_FUNC(WebPInitGammaTables) {
+  if (!kGammaTablesOk) {
+    int v;
+    const double scale = (double)(1 << GAMMA_TAB_FIX) / kGammaScale;
+    const double norm = 1. / 255.;
+    for (v = 0; v <= 255; ++v) {
+      kGammaToLinearTab[v] =
+          (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
+    }
+    for (v = 0; v <= GAMMA_TAB_SIZE; ++v) {
+      kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
+    }
+    kGammaTablesOk = 1;
+  }
+}
+
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
+  return kGammaToLinearTab[v];
+}
+
+static WEBP_INLINE int Interpolate(int v) {
+  const int tab_pos = v >> (GAMMA_TAB_FIX + 2);   // integer part
+  const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
+  const int v0 = kLinearToGammaTab[tab_pos];
+  const int v1 = kLinearToGammaTab[tab_pos + 1];
+  const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);  // interpolate
+  assert(tab_pos + 1 < GAMMA_TAB_SIZE + 1);
+  return y;
+}
+
+// Convert a linear value 'base_value' (uplifted by 'shift' bits) to YUV_FIX+2
+// fixed-point precision U/V value, suitable for RGBToU/V calls.
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+  const int y = Interpolate(base_value << shift);  // final uplifted value
+  return (y + kGammaTabRounder) >> GAMMA_TAB_FIX;  // descale
+}
+
+#else
+
+void WebPInitGammaTables(void) {}
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+  return (int)(base_value << shift);
+}
+
+#endif  // USE_GAMMA_COMPRESSION
+
+#define SUM4(ptr, step)                                                  \
+  LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[(step)]) + \
+                    GammaToLinear((ptr)[rgb_stride]) +                   \
+                    GammaToLinear((ptr)[rgb_stride + (step)]),           \
+                0)
+
+#define SUM2(ptr) \
+  LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
+
+//------------------------------------------------------------------------------
+// "Fast" regular RGB->YUV
+
+#define SUM4(ptr, step)                                                  \
+  LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[(step)]) + \
+                    GammaToLinear((ptr)[rgb_stride]) +                   \
+                    GammaToLinear((ptr)[rgb_stride + (step)]),           \
+                0)
+
+#define SUM2(ptr) \
+  LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
+
+#define SUM2ALPHA(ptr) ((ptr)[0] + (ptr)[rgb_stride])
+#define SUM4ALPHA(ptr) (SUM2ALPHA(ptr) + SUM2ALPHA((ptr) + 4))
+
+#if defined(USE_INVERSE_ALPHA_TABLE)
+
+static const int kAlphaFix = 19;
+// Following table is (1 << kAlphaFix) / a. The (v * kInvAlpha[a]) >> kAlphaFix
+// formula is then equal to v / a in most (99.6%) cases. Note that this table
+// and constant are adjusted very tightly to fit 32b arithmetic.
+// In particular, they use the fact that the operands for 'v / a' are actually
+// derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
+// with ai in [0..255] and pi in [0..1<<GAMMA_FIX). The constraint to avoid
+// overflow is: GAMMA_FIX + kAlphaFix <= 31.
+static const uint32_t kInvAlpha[4 * 0xff + 1] = {
+    0, /* alpha = 0 */
+    524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536, 58254, 52428,
+    47662,  43690,  40329,  37449,  34952,  32768, 30840, 29127, 27594, 26214,
+    24966,  23831,  22795,  21845,  20971,  20164, 19418, 18724, 18078, 17476,
+    16912,  16384,  15887,  15420,  14979,  14563, 14169, 13797, 13443, 13107,
+    12787,  12483,  12192,  11915,  11650,  11397, 11155, 10922, 10699, 10485,
+    10280,  10082,  9892,   9709,   9532,   9362,  9198,  9039,  8886,  8738,
+    8594,   8456,   8322,   8192,   8065,   7943,  7825,  7710,  7598,  7489,
+    7384,   7281,   7182,   7084,   6990,   6898,  6808,  6721,  6636,  6553,
+    6472,   6393,   6316,   6241,   6168,   6096,  6026,  5957,  5890,  5825,
+    5761,   5698,   5637,   5577,   5518,   5461,  5405,  5349,  5295,  5242,
+    5190,   5140,   5090,   5041,   4993,   4946,  4899,  4854,  4809,  4766,
+    4723,   4681,   4639,   4599,   4559,   4519,  4481,  4443,  4405,  4369,
+    4332,   4297,   4262,   4228,   4194,   4161,  4128,  4096,  4064,  4032,
+    4002,   3971,   3942,   3912,   3883,   3855,  3826,  3799,  3771,  3744,
+    3718,   3692,   3666,   3640,   3615,   3591,  3566,  3542,  3518,  3495,
+    3472,   3449,   3426,   3404,   3382,   3360,  3339,  3318,  3297,  3276,
+    3256,   3236,   3216,   3196,   3177,   3158,  3139,  3120,  3102,  3084,
+    3066,   3048,   3030,   3013,   2995,   2978,  2962,  2945,  2928,  2912,
+    2896,   2880,   2864,   2849,   2833,   2818,  2803,  2788,  2774,  2759,
+    2744,   2730,   2716,   2702,   2688,   2674,  2661,  2647,  2634,  2621,
+    2608,   2595,   2582,   2570,   2557,   2545,  2532,  2520,  2508,  2496,
+    2484,   2473,   2461,   2449,   2438,   2427,  2416,  2404,  2394,  2383,
+    2372,   2361,   2351,   2340,   2330,   2319,  2309,  2299,  2289,  2279,
+    2269,   2259,   2250,   2240,   2231,   2221,  2212,  2202,  2193,  2184,
+    2175,   2166,   2157,   2148,   2139,   2131,  2122,  2114,  2105,  2097,
+    2088,   2080,   2072,   2064,   2056,   2048,  2040,  2032,  2024,  2016,
+    2008,   2001,   1993,   1985,   1978,   1971,  1963,  1956,  1949,  1941,
+    1934,   1927,   1920,   1913,   1906,   1899,  1892,  1885,  1879,  1872,
+    1865,   1859,   1852,   1846,   1839,   1833,  1826,  1820,  1814,  1807,
+    1801,   1795,   1789,   1783,   1777,   1771,  1765,  1759,  1753,  1747,
+    1741,   1736,   1730,   1724,   1718,   1713,  1707,  1702,  1696,  1691,
+    1685,   1680,   1675,   1669,   1664,   1659,  1653,  1648,  1643,  1638,
+    1633,   1628,   1623,   1618,   1613,   1608,  1603,  1598,  1593,  1588,
+    1583,   1579,   1574,   1569,   1565,   1560,  1555,  1551,  1546,  1542,
+    1537,   1533,   1528,   1524,   1519,   1515,  1510,  1506,  1502,  1497,
+    1493,   1489,   1485,   1481,   1476,   1472,  1468,  1464,  1460,  1456,
+    1452,   1448,   1444,   1440,   1436,   1432,  1428,  1424,  1420,  1416,
+    1413,   1409,   1405,   1401,   1398,   1394,  1390,  1387,  1383,  1379,
+    1376,   1372,   1368,   1365,   1361,   1358,  1354,  1351,  1347,  1344,
+    1340,   1337,   1334,   1330,   1327,   1323,  1320,  1317,  1314,  1310,
+    1307,   1304,   1300,   1297,   1294,   1291,  1288,  1285,  1281,  1278,
+    1275,   1272,   1269,   1266,   1263,   1260,  1257,  1254,  1251,  1248,
+    1245,   1242,   1239,   1236,   1233,   1230,  1227,  1224,  1222,  1219,
+    1216,   1213,   1210,   1208,   1205,   1202,  1199,  1197,  1194,  1191,
+    1188,   1186,   1183,   1180,   1178,   1175,  1172,  1170,  1167,  1165,
+    1162,   1159,   1157,   1154,   1152,   1149,  1147,  1144,  1142,  1139,
+    1137,   1134,   1132,   1129,   1127,   1125,  1122,  1120,  1117,  1115,
+    1113,   1110,   1108,   1106,   1103,   1101,  1099,  1096,  1094,  1092,
+    1089,   1087,   1085,   1083,   1081,   1078,  1076,  1074,  1072,  1069,
+    1067,   1065,   1063,   1061,   1059,   1057,  1054,  1052,  1050,  1048,
+    1046,   1044,   1042,   1040,   1038,   1036,  1034,  1032,  1030,  1028,
+    1026,   1024,   1022,   1020,   1018,   1016,  1014,  1012,  1010,  1008,
+    1006,   1004,   1002,   1000,   998,    996,   994,   992,   991,   989,
+    987,    985,    983,    981,    979,    978,   976,   974,   972,   970,
+    969,    967,    965,    963,    961,    960,   958,   956,   954,   953,
+    951,    949,    948,    946,    944,    942,   941,   939,   937,   936,
+    934,    932,    931,    929,    927,    926,   924,   923,   921,   919,
+    918,    916,    914,    913,    911,    910,   908,   907,   905,   903,
+    902,    900,    899,    897,    896,    894,   893,   891,   890,   888,
+    887,    885,    884,    882,    881,    879,   878,   876,   875,   873,
+    872,    870,    869,    868,    866,    865,   863,   862,   860,   859,
+    858,    856,    855,    853,    852,    851,   849,   848,   846,   845,
+    844,    842,    841,    840,    838,    837,   836,   834,   833,   832,
+    830,    829,    828,    826,    825,    824,   823,   821,   820,   819,
+    817,    816,    815,    814,    812,    811,   810,   809,   807,   806,
+    805,    804,    802,    801,    800,    799,   798,   796,   795,   794,
+    793,    791,    790,    789,    788,    787,   786,   784,   783,   782,
+    781,    780,    779,    777,    776,    775,   774,   773,   772,   771,
+    769,    768,    767,    766,    765,    764,   763,   762,   760,   759,
+    758,    757,    756,    755,    754,    753,   752,   751,   750,   748,
+    747,    746,    745,    744,    743,    742,   741,   740,   739,   738,
+    737,    736,    735,    734,    733,    732,   731,   730,   729,   728,
+    727,    726,    725,    724,    723,    722,   721,   720,   719,   718,
+    717,    716,    715,    714,    713,    712,   711,   710,   709,   708,
+    707,    706,    705,    704,    703,    702,   701,   700,   699,   699,
+    698,    697,    696,    695,    694,    693,   692,   691,   690,   689,
+    688,    688,    687,    686,    685,    684,   683,   682,   681,   680,
+    680,    679,    678,    677,    676,    675,   674,   673,   673,   672,
+    671,    670,    669,    668,    667,    667,   666,   665,   664,   663,
+    662,    661,    661,    660,    659,    658,   657,   657,   656,   655,
+    654,    653,    652,    652,    651,    650,   649,   648,   648,   647,
+    646,    645,    644,    644,    643,    642,   641,   640,   640,   639,
+    638,    637,    637,    636,    635,    634,   633,   633,   632,   631,
+    630,    630,    629,    628,    627,    627,   626,   625,   624,   624,
+    623,    622,    621,    621,    620,    619,   618,   618,   617,   616,
+    616,    615,    614,    613,    613,    612,   611,   611,   610,   609,
+    608,    608,    607,    606,    606,    605,   604,   604,   603,   602,
+    601,    601,    600,    599,    599,    598,   597,   597,   596,   595,
+    595,    594,    593,    593,    592,    591,   591,   590,   589,   589,
+    588,    587,    587,    586,    585,    585,   584,   583,   583,   582,
+    581,    581,    580,    579,    579,    578,   578,   577,   576,   576,
+    575,    574,    574,    573,    572,    572,   571,   571,   570,   569,
+    569,    568,    568,    567,    566,    566,   565,   564,   564,   563,
+    563,    562,    561,    561,    560,    560,   559,   558,   558,   557,
+    557,    556,    555,    555,    554,    554,   553,   553,   552,   551,
+    551,    550,    550,    549,    548,    548,   547,   547,   546,   546,
+    545,    544,    544,    543,    543,    542,   542,   541,   541,   540,
+    539,    539,    538,    538,    537,    537,   536,   536,   535,   534,
+    534,    533,    533,    532,    532,    531,   531,   530,   530,   529,
+    529,    528,    527,    527,    526,    526,   525,   525,   524,   524,
+    523,    523,    522,    522,    521,    521,   520,   520,   519,   519,
+    518,    518,    517,    517,    516,    516,   515,   515,   514,   514};
+
+// Note that LinearToGamma() expects the values to be premultiplied by 4,
+// so we incorporate this factor 4 inside the DIVIDE_BY_ALPHA macro directly.
+#define DIVIDE_BY_ALPHA(sum, a) (((sum) * kInvAlpha[(a)]) >> (kAlphaFix - 2))
+
+#else
+
+#define DIVIDE_BY_ALPHA(sum, a) (4 * (sum) / (a))
+
+#endif  // USE_INVERSE_ALPHA_TABLE
+
+static WEBP_INLINE int LinearToGammaWeighted(const uint8_t* src,
+                                             const uint8_t* a_ptr,
+                                             uint32_t total_a, int step,
+                                             int rgb_stride) {
+  const uint32_t sum =
+      a_ptr[0] * GammaToLinear(src[0]) +
+      a_ptr[step] * GammaToLinear(src[step]) +
+      a_ptr[rgb_stride] * GammaToLinear(src[rgb_stride]) +
+      a_ptr[rgb_stride + step] * GammaToLinear(src[rgb_stride + step]);
+  assert(total_a > 0 && total_a <= 4 * 0xff);
+#if defined(USE_INVERSE_ALPHA_TABLE)
+  assert((uint64_t)sum * kInvAlpha[total_a] < ((uint64_t)1 << 32));
+#endif
+  return LinearToGamma(DIVIDE_BY_ALPHA(sum, total_a), 0);
+}
+
+void WebPAccumulateRGBA(const uint8_t* const r_ptr, const uint8_t* const g_ptr,
+                        const uint8_t* const b_ptr, const uint8_t* const a_ptr,
+                        int rgb_stride, uint16_t* dst, int width) {
+  int i, j;
+  // we loop over 2x2 blocks and produce one R/G/B/A value for each.
+  for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * 4, dst += 4) {
+    const uint32_t a = SUM4ALPHA(a_ptr + j);
+    int r, g, b;
+    if (a == 4 * 0xff || a == 0) {
+      r = SUM4(r_ptr + j, 4);
+      g = SUM4(g_ptr + j, 4);
+      b = SUM4(b_ptr + j, 4);
+    } else {
+      r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 4, rgb_stride);
+      g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride);
+      b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride);
+    }
+    dst[0] = r;
+    dst[1] = g;
+    dst[2] = b;
+    dst[3] = a;
+  }
+  if (width & 1) {
+    const uint32_t a = 2u * SUM2ALPHA(a_ptr + j);
+    int r, g, b;
+    if (a == 4 * 0xff || a == 0) {
+      r = SUM2(r_ptr + j);
+      g = SUM2(g_ptr + j);
+      b = SUM2(b_ptr + j);
+    } else {
+      r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 0, rgb_stride);
+      g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride);
+      b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride);
+    }
+    dst[0] = r;
+    dst[1] = g;
+    dst[2] = b;
+    dst[3] = a;
+  }
+}
+
+void WebPAccumulateRGB(const uint8_t* const r_ptr, const uint8_t* const g_ptr,
+                       const uint8_t* const b_ptr, int step, int rgb_stride,
+                       uint16_t* dst, int width) {
+  int i, j;
+  for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * step, dst += 4) {
+    dst[0] = SUM4(r_ptr + j, step);
+    dst[1] = SUM4(g_ptr + j, step);
+    dst[2] = SUM4(b_ptr + j, step);
+    // MemorySanitizer may raise false positives with data that passes through
+    // RGBA32PackedToPlanar_16b_SSE41() due to incorrect modeling of shuffles.
+    // See https://crbug.com/webp/573.
+#ifdef WEBP_MSAN
+    dst[3] = 0;
+#endif
+  }
+  if (width & 1) {
+    dst[0] = SUM2(r_ptr + j);
+    dst[1] = SUM2(g_ptr + j);
+    dst[2] = SUM2(b_ptr + j);
+#ifdef WEBP_MSAN
+    dst[3] = 0;
+#endif
+  }
+}
+
+static void ImportYUVAFromRGBA_C(const uint8_t* r_ptr, const uint8_t* g_ptr,
+                                 const uint8_t* b_ptr, const uint8_t* a_ptr,
+                                 int step,        // bytes per pixel
+                                 int rgb_stride,  // bytes per scanline
+                                 int has_alpha, int width, int height,
+                                 uint16_t* tmp_rgb, int y_stride, int uv_stride,
+                                 int a_stride, uint8_t* dst_y, uint8_t* dst_u,
+                                 uint8_t* dst_v, uint8_t* dst_a) {
+  int y;
+  const int is_rgb = (r_ptr < b_ptr);  // otherwise it's bgr
+  const int uv_width = (width + 1) >> 1;
+
+  has_alpha &= dst_a != NULL;
+  if (has_alpha) {
+#if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
+    assert(kAlphaFix + GAMMA_FIX <= 31);
+#endif
+  }
+
+  WebPInitGammaTables();
+
+  // Downsample Y/U/V planes, two rows at a time
+  for (y = 0; y < (height >> 1); ++y) {
+    int rows_have_alpha = has_alpha;
+    if (is_rgb) {
+      WebPConvertRGBToY(r_ptr, dst_y, width, step);
+      WebPConvertRGBToY(r_ptr + rgb_stride, dst_y + y_stride, width, step);
+    } else {
+      WebPConvertBGRToY(b_ptr, dst_y, width, step);
+      WebPConvertBGRToY(b_ptr + rgb_stride, dst_y + y_stride, width, step);
+    }
+    dst_y += 2 * y_stride;
+    if (has_alpha) {
+      rows_have_alpha &=
+          !WebPExtractAlpha(a_ptr, rgb_stride, width, 2, dst_a, a_stride);
+      dst_a += 2 * a_stride;
+    }
+    // Collect averaged R/G/B(/A)
+    if (!rows_have_alpha) {
+      WebPAccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, width);
+    } else {
+      WebPAccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb,
+                         width);
+    }
+    // Convert to U/V
+    WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
+    dst_u += uv_stride;
+    dst_v += uv_stride;
+    r_ptr += 2 * rgb_stride;
+    b_ptr += 2 * rgb_stride;
+    g_ptr += 2 * rgb_stride;
+    if (has_alpha) a_ptr += 2 * rgb_stride;
+  }
+}
+
+static void ImportYUVAFromRGBALastLine_C(
+    const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr,
+    const uint8_t* a_ptr,
+    int step,  // bytes per pixel
+    int has_alpha, int width, uint16_t* tmp_rgb, uint8_t* dst_y, uint8_t* dst_u,
+    uint8_t* dst_v, uint8_t* dst_a) {
+  const int is_rgb = (r_ptr < b_ptr);  // otherwise it's bgr
+  const int uv_width = (width + 1) >> 1;
+  int row_has_alpha = has_alpha && dst_a != NULL;
+
+  if (is_rgb) {
+    WebPConvertRGBToY(r_ptr, dst_y, width, step);
+  } else {
+    WebPConvertBGRToY(b_ptr, dst_y, width, step);
+  }
+  if (row_has_alpha) {
+    row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0);
+  }
+  // Collect averaged R/G/B(/A)
+  if (!row_has_alpha) {
+    // Collect averaged R/G/B
+    WebPAccumulateRGB(r_ptr, g_ptr, b_ptr, step, /*rgb_stride=*/0, tmp_rgb,
+                      width);
+  } else {
+    WebPAccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /*rgb_stride=*/0, tmp_rgb,
+                       width);
+  }
+  WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
+}
+
 //-----------------------------------------------------------------------------

 void (*WebPConvertRGBToY)(const uint8_t* WEBP_RESTRICT rgb,
@@ -214,6 +606,21 @@ void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
                              uint8_t* WEBP_RESTRICT u,
                              uint8_t* WEBP_RESTRICT v, int width);

+void (*WebPImportYUVAFromRGBA)(const uint8_t* r_ptr, const uint8_t* g_ptr,
+                               const uint8_t* b_ptr, const uint8_t* a_ptr,
+                               int step,        // bytes per pixel
+                               int rgb_stride,  // bytes per scanline
+                               int has_alpha, int width, int height,
+                               uint16_t* tmp_rgb, int y_stride, int uv_stride,
+                               int a_stride, uint8_t* dst_y, uint8_t* dst_u,
+                               uint8_t* dst_v, uint8_t* dst_a);
+void (*WebPImportYUVAFromRGBALastLine)(
+    const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr,
+    const uint8_t* a_ptr,
+    int step,  // bytes per pixel
+    int has_alpha, int width, uint16_t* tmp_rgb, uint8_t* dst_y, uint8_t* dst_u,
+    uint8_t* dst_v, uint8_t* dst_a);
+
 void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
                           uint8_t* WEBP_RESTRICT y, int width);
 void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
@@ -233,6 +640,9 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {

  WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;

+  WebPImportYUVAFromRGBA = ImportYUVAFromRGBA_C;
+  WebPImportYUVAFromRGBALastLine = ImportYUVAFromRGBALastLine_C;
+
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_HAVE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@@ -40,6 +40,15 @@
 #include "src/dsp/dsp.h"
 #include "src/webp/types.h"

+// Macros to give the offset of each channel in a uint32_t containing ARGB.
+#ifdef WORDS_BIGENDIAN
+// uint32_t 0xff000000 is 0xff,00,00,00 in memory
+#define CHANNEL_OFFSET(i) (i)
+#else
+// uint32_t 0xff000000 is 0x00,00,00,ff in memory
+#define CHANNEL_OFFSET(i) (3 - (i))
+#endif
+
 //------------------------------------------------------------------------------
 // YUV -> RGB conversion

@@ -221,6 +230,31 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
  return VP8ClipUV(v, rounding);
 }

+extern void (*WebPImportYUVAFromRGBA)(
+    const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr,
+    const uint8_t* a_ptr,
+    int step,        // bytes per pixel
+    int rgb_stride,  // bytes per scanline
+    int has_alpha, int width, int height, uint16_t* tmp_rgb, int y_stride,
+    int uv_stride, int a_stride, uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v,
+    uint8_t* dst_a);
+extern void (*WebPImportYUVAFromRGBALastLine)(
+    const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr,
+    const uint8_t* a_ptr,
+    int step,  // bytes per pixel
+    int has_alpha, int width, uint16_t* tmp_rgb, uint8_t* dst_y, uint8_t* dst_u,
+    uint8_t* dst_v, uint8_t* dst_a);
+
+// Internal functions of WebPImportYUVAFromRGBA* that can be reused.
+void WebPAccumulateRGBA(const uint8_t* const r_ptr, const uint8_t* const g_ptr,
+                        const uint8_t* const b_ptr, const uint8_t* const a_ptr,
+                        int rgb_stride, uint16_t* dst, int width);
+void WebPAccumulateRGB(const uint8_t* const r_ptr, const uint8_t* const g_ptr,
+                       const uint8_t* const b_ptr, int step, int rgb_stride,
+                       uint16_t* dst, int width);
+// Must be called before calling WebPAccumulateRGB*.
+void WebPInitGammaTables(void);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/src/enc/picture_csp_enc.c
+++ b/src/enc/picture_csp_enc.c
@@ -32,20 +32,6 @@
 #include <pthread.h>
 #endif

-// Uncomment to disable gamma-compression during RGB->U/V averaging
-#define USE_GAMMA_COMPRESSION
-
-// If defined, use table to compute x / alpha.
-#define USE_INVERSE_ALPHA_TABLE
-
-#ifdef WORDS_BIGENDIAN
-// uint32_t 0xff000000 is 0xff,00,00,00 in memory
-#define CHANNEL_OFFSET(i) (i)
-#else
-// uint32_t 0xff000000 is 0x00,00,00,ff in memory
-#define CHANNEL_OFFSET(i) (3 - (i))
-#endif
-
 #define ALPHA_OFFSET CHANNEL_OFFSET(0)

 //------------------------------------------------------------------------------
@@ -83,91 +69,8 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
                        picture->a_stride);
 }

-//------------------------------------------------------------------------------
-// Code for gamma correction
-
-#if defined(USE_GAMMA_COMPRESSION)
-
-// Gamma correction compensates loss of resolution during chroma subsampling.
-#define GAMMA_FIX 12     // fixed-point precision for linear values
-#define GAMMA_TAB_FIX 7  // fixed-point fractional bits precision
-#define GAMMA_TAB_SIZE (1 << (GAMMA_FIX - GAMMA_TAB_FIX))
-static const double kGamma = 0.80;
-static const int kGammaScale = ((1 << GAMMA_FIX) - 1);
-static const int kGammaTabScale = (1 << GAMMA_TAB_FIX);
-static const int kGammaTabRounder = (1 << GAMMA_TAB_FIX >> 1);
-
-static int kLinearToGammaTab[GAMMA_TAB_SIZE + 1];
-static uint16_t kGammaToLinearTab[256];
-static volatile int kGammaTablesOk = 0;
-static void InitGammaTables(void);
 extern VP8CPUInfo VP8GetCPUInfo;

-WEBP_DSP_INIT_FUNC(InitGammaTables) {
-  if (!kGammaTablesOk) {
-    int v;
-    const double scale = (double)(1 << GAMMA_TAB_FIX) / kGammaScale;
-    const double norm = 1. / 255.;
-    for (v = 0; v <= 255; ++v) {
-      kGammaToLinearTab[v] =
-          (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
-    }
-    for (v = 0; v <= GAMMA_TAB_SIZE; ++v) {
-      kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
-    }
-    kGammaTablesOk = 1;
-  }
-}
-
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
-  return kGammaToLinearTab[v];
-}
-
-static WEBP_INLINE int Interpolate(int v) {
-  const int tab_pos = v >> (GAMMA_TAB_FIX + 2);   // integer part
-  const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
-  const int v0 = kLinearToGammaTab[tab_pos];
-  const int v1 = kLinearToGammaTab[tab_pos + 1];
-  const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);  // interpolate
-  assert(tab_pos + 1 < GAMMA_TAB_SIZE + 1);
-  return y;
-}
-
-// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
-// U/V value, suitable for RGBToU/V calls.
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
-  const int y = Interpolate(base_value << shift);  // final uplifted value
-  return (y + kGammaTabRounder) >> GAMMA_TAB_FIX;  // descale
-}
-
-#else
-
-static void InitGammaTables(void) {}
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
-  return (int)(base_value << shift);
-}
-
-#endif  // USE_GAMMA_COMPRESSION
-
-//------------------------------------------------------------------------------
-// RGB -> YUV conversion
-
-static int RGBToY(int r, int g, int b, VP8Random* const rg) {
-  return (rg == NULL) ? VP8RGBToY(r, g, b, YUV_HALF)
-                      : VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
-}
-
-static int RGBToU(int r, int g, int b, VP8Random* const rg) {
-  return (rg == NULL) ? VP8RGBToU(r, g, b, YUV_HALF << 2)
-                      : VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-static int RGBToV(int r, int g, int b, VP8Random* const rg) {
-  return (rg == NULL) ? VP8RGBToV(r, g, b, YUV_HALF << 2)
-                      : VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
 //------------------------------------------------------------------------------
 // Sharp RGB->YUV conversion

@@ -190,162 +93,6 @@ static int PreprocessARGB(const uint8_t* r_ptr, const uint8_t* g_ptr,
  return ok;
 }

-//------------------------------------------------------------------------------
-// "Fast" regular RGB->YUV
-
-#define SUM4(ptr, step)                                                  \
-  LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[(step)]) + \
-                    GammaToLinear((ptr)[rgb_stride]) +                   \
-                    GammaToLinear((ptr)[rgb_stride + (step)]),           \
-                0)
-
-#define SUM2(ptr) \
-  LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
-
-#define SUM2ALPHA(ptr) ((ptr)[0] + (ptr)[rgb_stride])
-#define SUM4ALPHA(ptr) (SUM2ALPHA(ptr) + SUM2ALPHA((ptr) + 4))
-
-#if defined(USE_INVERSE_ALPHA_TABLE)
-
-static const int kAlphaFix = 19;
-// Following table is (1 << kAlphaFix) / a. The (v * kInvAlpha[a]) >> kAlphaFix
-// formula is then equal to v / a in most (99.6%) cases. Note that this table
-// and constant are adjusted very tightly to fit 32b arithmetic.
-// In particular, they use the fact that the operands for 'v / a' are actually
-// derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
-// with ai in [0..255] and pi in [0..1<<GAMMA_FIX). The constraint to avoid
-// overflow is: GAMMA_FIX + kAlphaFix <= 31.
-static const uint32_t kInvAlpha[4 * 0xff + 1] = {
-    0, /* alpha = 0 */
-    524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536, 58254, 52428,
-    47662,  43690,  40329,  37449,  34952,  32768, 30840, 29127, 27594, 26214,
-    24966,  23831,  22795,  21845,  20971,  20164, 19418, 18724, 18078, 17476,
-    16912,  16384,  15887,  15420,  14979,  14563, 14169, 13797, 13443, 13107,
-    12787,  12483,  12192,  11915,  11650,  11397, 11155, 10922, 10699, 10485,
-    10280,  10082,  9892,   9709,   9532,   9362,  9198,  9039,  8886,  8738,
-    8594,   8456,   8322,   8192,   8065,   7943,  7825,  7710,  7598,  7489,
-    7384,   7281,   7182,   7084,   6990,   6898,  6808,  6721,  6636,  6553,
-    6472,   6393,   6316,   6241,   6168,   6096,  6026,  5957,  5890,  5825,
-    5761,   5698,   5637,   5577,   5518,   5461,  5405,  5349,  5295,  5242,
-    5190,   5140,   5090,   5041,   4993,   4946,  4899,  4854,  4809,  4766,
-    4723,   4681,   4639,   4599,   4559,   4519,  4481,  4443,  4405,  4369,
-    4332,   4297,   4262,   4228,   4194,   4161,  4128,  4096,  4064,  4032,
-    4002,   3971,   3942,   3912,   3883,   3855,  3826,  3799,  3771,  3744,
-    3718,   3692,   3666,   3640,   3615,   3591,  3566,  3542,  3518,  3495,
-    3472,   3449,   3426,   3404,   3382,   3360,  3339,  3318,  3297,  3276,
-    3256,   3236,   3216,   3196,   3177,   3158,  3139,  3120,  3102,  3084,
-    3066,   3048,   3030,   3013,   2995,   2978,  2962,  2945,  2928,  2912,
-    2896,   2880,   2864,   2849,   2833,   2818,  2803,  2788,  2774,  2759,
-    2744,   2730,   2716,   2702,   2688,   2674,  2661,  2647,  2634,  2621,
-    2608,   2595,   2582,   2570,   2557,   2545,  2532,  2520,  2508,  2496,
-    2484,   2473,   2461,   2449,   2438,   2427,  2416,  2404,  2394,  2383,
-    2372,   2361,   2351,   2340,   2330,   2319,  2309,  2299,  2289,  2279,
-    2269,   2259,   2250,   2240,   2231,   2221,  2212,  2202,  2193,  2184,
-    2175,   2166,   2157,   2148,   2139,   2131,  2122,  2114,  2105,  2097,
-    2088,   2080,   2072,   2064,   2056,   2048,  2040,  2032,  2024,  2016,
-    2008,   2001,   1993,   1985,   1978,   1971,  1963,  1956,  1949,  1941,
-    1934,   1927,   1920,   1913,   1906,   1899,  1892,  1885,  1879,  1872,
-    1865,   1859,   1852,   1846,   1839,   1833,  1826,  1820,  1814,  1807,
-    1801,   1795,   1789,   1783,   1777,   1771,  1765,  1759,  1753,  1747,
-    1741,   1736,   1730,   1724,   1718,   1713,  1707,  1702,  1696,  1691,
-    1685,   1680,   1675,   1669,   1664,   1659,  1653,  1648,  1643,  1638,
-    1633,   1628,   1623,   1618,   1613,   1608,  1603,  1598,  1593,  1588,
-    1583,   1579,   1574,   1569,   1565,   1560,  1555,  1551,  1546,  1542,
-    1537,   1533,   1528,   1524,   1519,   1515,  1510,  1506,  1502,  1497,
-    1493,   1489,   1485,   1481,   1476,   1472,  1468,  1464,  1460,  1456,
-    1452,   1448,   1444,   1440,   1436,   1432,  1428,  1424,  1420,  1416,
-    1413,   1409,   1405,   1401,   1398,   1394,  1390,  1387,  1383,  1379,
-    1376,   1372,   1368,   1365,   1361,   1358,  1354,  1351,  1347,  1344,
-    1340,   1337,   1334,   1330,   1327,   1323,  1320,  1317,  1314,  1310,
-    1307,   1304,   1300,   1297,   1294,   1291,  1288,  1285,  1281,  1278,
-    1275,   1272,   1269,   1266,   1263,   1260,  1257,  1254,  1251,  1248,
-    1245,   1242,   1239,   1236,   1233,   1230,  1227,  1224,  1222,  1219,
-    1216,   1213,   1210,   1208,   1205,   1202,  1199,  1197,  1194,  1191,
-    1188,   1186,   1183,   1180,   1178,   1175,  1172,  1170,  1167,  1165,
-    1162,   1159,   1157,   1154,   1152,   1149,  1147,  1144,  1142,  1139,
-    1137,   1134,   1132,   1129,   1127,   1125,  1122,  1120,  1117,  1115,
-    1113,   1110,   1108,   1106,   1103,   1101,  1099,  1096,  1094,  1092,
-    1089,   1087,   1085,   1083,   1081,   1078,  1076,  1074,  1072,  1069,
-    1067,   1065,   1063,   1061,   1059,   1057,  1054,  1052,  1050,  1048,
-    1046,   1044,   1042,   1040,   1038,   1036,  1034,  1032,  1030,  1028,
-    1026,   1024,   1022,   1020,   1018,   1016,  1014,  1012,  1010,  1008,
-    1006,   1004,   1002,   1000,   998,    996,   994,   992,   991,   989,
-    987,    985,    983,    981,    979,    978,   976,   974,   972,   970,
-    969,    967,    965,    963,    961,    960,   958,   956,   954,   953,
-    951,    949,    948,    946,    944,    942,   941,   939,   937,   936,
-    934,    932,    931,    929,    927,    926,   924,   923,   921,   919,
-    918,    916,    914,    913,    911,    910,   908,   907,   905,   903,
-    902,    900,    899,    897,    896,    894,   893,   891,   890,   888,
-    887,    885,    884,    882,    881,    879,   878,   876,   875,   873,
-    872,    870,    869,    868,    866,    865,   863,   862,   860,   859,
-    858,    856,    855,    853,    852,    851,   849,   848,   846,   845,
-    844,    842,    841,    840,    838,    837,   836,   834,   833,   832,
-    830,    829,    828,    826,    825,    824,   823,   821,   820,   819,
-    817,    816,    815,    814,    812,    811,   810,   809,   807,   806,
-    805,    804,    802,    801,    800,    799,   798,   796,   795,   794,
-    793,    791,    790,    789,    788,    787,   786,   784,   783,   782,
-    781,    780,    779,    777,    776,    775,   774,   773,   772,   771,
-    769,    768,    767,    766,    765,    764,   763,   762,   760,   759,
-    758,    757,    756,    755,    754,    753,   752,   751,   750,   748,
-    747,    746,    745,    744,    743,    742,   741,   740,   739,   738,
-    737,    736,    735,    734,    733,    732,   731,   730,   729,   728,
-    727,    726,    725,    724,    723,    722,   721,   720,   719,   718,
-    717,    716,    715,    714,    713,    712,   711,   710,   709,   708,
-    707,    706,    705,    704,    703,    702,   701,   700,   699,   699,
-    698,    697,    696,    695,    694,    693,   692,   691,   690,   689,
-    688,    688,    687,    686,    685,    684,   683,   682,   681,   680,
-    680,    679,    678,    677,    676,    675,   674,   673,   673,   672,
-    671,    670,    669,    668,    667,    667,   666,   665,   664,   663,
-    662,    661,    661,    660,    659,    658,   657,   657,   656,   655,
-    654,    653,    652,    652,    651,    650,   649,   648,   648,   647,
-    646,    645,    644,    644,    643,    642,   641,   640,   640,   639,
-    638,    637,    637,    636,    635,    634,   633,   633,   632,   631,
-    630,    630,    629,    628,    627,    627,   626,   625,   624,   624,
-    623,    622,    621,    621,    620,    619,   618,   618,   617,   616,
-    616,    615,    614,    613,    613,    612,   611,   611,   610,   609,
-    608,    608,    607,    606,    606,    605,   604,   604,   603,   602,
-    601,    601,    600,    599,    599,    598,   597,   597,   596,   595,
-    595,    594,    593,    593,    592,    591,   591,   590,   589,   589,
-    588,    587,    587,    586,    585,    585,   584,   583,   583,   582,
-    581,    581,    580,    579,    579,    578,   578,   577,   576,   576,
-    575,    574,    574,    573,    572,    572,   571,   571,   570,   569,
-    569,    568,    568,    567,    566,    566,   565,   564,   564,   563,
-    563,    562,    561,    561,    560,    560,   559,   558,   558,   557,
-    557,    556,    555,    555,    554,    554,   553,   553,   552,   551,
-    551,    550,    550,    549,    548,    548,   547,   547,   546,   546,
-    545,    544,    544,    543,    543,    542,   542,   541,   541,   540,
-    539,    539,    538,    538,    537,    537,   536,   536,   535,   534,
-    534,    533,    533,    532,    532,    531,   531,   530,   530,   529,
-    529,    528,    527,    527,    526,    526,   525,   525,   524,   524,
-    523,    523,    522,    522,    521,    521,   520,   520,   519,   519,
-    518,    518,    517,    517,    516,    516,   515,   515,   514,   514};
-
-// Note that LinearToGamma() expects the values to be premultiplied by 4,
-// so we incorporate this factor 4 inside the DIVIDE_BY_ALPHA macro directly.
-#define DIVIDE_BY_ALPHA(sum, a) (((sum) * kInvAlpha[(a)]) >> (kAlphaFix - 2))
-
-#else
-
-#define DIVIDE_BY_ALPHA(sum, a) (4 * (sum) / (a))
-
-#endif  // USE_INVERSE_ALPHA_TABLE
-
-static WEBP_INLINE int LinearToGammaWeighted(const uint8_t* src,
-                                             const uint8_t* a_ptr,
-                                             uint32_t total_a, int step,
-                                             int rgb_stride) {
-  const uint32_t sum =
-      a_ptr[0] * GammaToLinear(src[0]) +
-      a_ptr[step] * GammaToLinear(src[step]) +
-      a_ptr[rgb_stride] * GammaToLinear(src[rgb_stride]) +
-      a_ptr[rgb_stride + step] * GammaToLinear(src[rgb_stride + step]);
-  assert(total_a > 0 && total_a <= 4 * 0xff);
-#if defined(USE_INVERSE_ALPHA_TABLE)
-  assert((uint64_t)sum * kInvAlpha[total_a] < ((uint64_t)1 << 32));
-#endif
-  return LinearToGamma(DIVIDE_BY_ALPHA(sum, total_a), 0);
-}
-
 static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr,
                                      const uint8_t* const g_ptr,
                                      const uint8_t* const b_ptr, int step,
@@ -353,78 +100,8 @@ static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr,
                                      VP8Random* const rg) {
  int i, j;
  for (i = 0, j = 0; i < width; i += 1, j += step) {
-    dst_y[i] = RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], rg);
-  }
-}
-
-static WEBP_INLINE void AccumulateRGBA(const uint8_t* const r_ptr,
-                                       const uint8_t* const g_ptr,
-                                       const uint8_t* const b_ptr,
-                                       const uint8_t* const a_ptr,
-                                       int rgb_stride, uint16_t* dst,
-                                       int width) {
-  int i, j;
-  // we loop over 2x2 blocks and produce one R/G/B/A value for each.
-  for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * 4, dst += 4) {
-    const uint32_t a = SUM4ALPHA(a_ptr + j);
-    int r, g, b;
-    if (a == 4 * 0xff || a == 0) {
-      r = SUM4(r_ptr + j, 4);
-      g = SUM4(g_ptr + j, 4);
-      b = SUM4(b_ptr + j, 4);
-    } else {
-      r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 4, rgb_stride);
-      g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride);
-      b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride);
-    }
-    dst[0] = r;
-    dst[1] = g;
-    dst[2] = b;
-    dst[3] = a;
-  }
-  if (width & 1) {
-    const uint32_t a = 2u * SUM2ALPHA(a_ptr + j);
-    int r, g, b;
-    if (a == 4 * 0xff || a == 0) {
-      r = SUM2(r_ptr + j);
-      g = SUM2(g_ptr + j);
-      b = SUM2(b_ptr + j);
-    } else {
-      r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 0, rgb_stride);
-      g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride);
-      b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride);
-    }
-    dst[0] = r;
-    dst[1] = g;
-    dst[2] = b;
-    dst[3] = a;
-  }
-}
-
-static WEBP_INLINE void AccumulateRGB(const uint8_t* const r_ptr,
-                                      const uint8_t* const g_ptr,
-                                      const uint8_t* const b_ptr, int step,
-                                      int rgb_stride, uint16_t* dst,
-                                      int width) {
-  int i, j;
-  for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * step, dst += 4) {
-    dst[0] = SUM4(r_ptr + j, step);
-    dst[1] = SUM4(g_ptr + j, step);
-    dst[2] = SUM4(b_ptr + j, step);
-    // MemorySanitizer may raise false positives with data that passes through
-    // RGBA32PackedToPlanar_16b_SSE41() due to incorrect modeling of shuffles.
-    // See https://crbug.com/webp/573.
-#ifdef WEBP_MSAN
-    dst[3] = 0;
-#endif
-  }
-  if (width & 1) {
-    dst[0] = SUM2(r_ptr + j);
-    dst[1] = SUM2(g_ptr + j);
-    dst[2] = SUM2(b_ptr + j);
-#ifdef WEBP_MSAN
-    dst[3] = 0;
-#endif
+    dst_y[i] =
+        VP8RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], VP8RandomBits(rg, YUV_FIX));
  }
 }

@@ -435,8 +112,8 @@ static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
  int i;
  for (i = 0; i < width; i += 1, rgb += 4) {
    const int r = rgb[0], g = rgb[1], b = rgb[2];
-    dst_u[i] = RGBToU(r, g, b, rg);
-    dst_v[i] = RGBToV(r, g, b, rg);
+    dst_u[i] = VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+    dst_v[i] = VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
  }
 }

@@ -452,7 +129,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, const uint8_t* g_ptr,
  const int width = picture->width;
  const int height = picture->height;
  const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
-  const int is_rgb = (r_ptr < b_ptr);  // otherwise it's bgr

  picture->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
  picture->use_argb = 0;
@@ -468,9 +144,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, const uint8_t* g_ptr,
  }
  if (has_alpha) {
    assert(step == 4);
-#if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
-    assert(kAlphaFix + GAMMA_FIX <= 31);
-#endif
  }

  if (use_iterative_conversion) {
@@ -499,85 +172,88 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, const uint8_t* g_ptr,
      rg = &base_rg;
    }
    WebPInitConvertARGBToYUV();
-    InitGammaTables();
+    WebPInitGammaTables();

    if (tmp_rgb == NULL) {
      return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
    }

-    // Downsample Y/U/V planes, two rows at a time
-    for (y = 0; y < (height >> 1); ++y) {
-      int rows_have_alpha = has_alpha;
-      if (rg == NULL) {
-        if (is_rgb) {
-          WebPConvertRGBToY(r_ptr, dst_y, width, step);
-          WebPConvertRGBToY(r_ptr + rgb_stride, dst_y + picture->y_stride,
-                            width, step);
-        } else {
-          WebPConvertBGRToY(b_ptr, dst_y, width, step);
-          WebPConvertBGRToY(b_ptr + rgb_stride, dst_y + picture->y_stride,
-                            width, step);
+    if (rg == NULL) {
+      // Downsample Y/U/V planes, two rows at a time
+      WebPImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
+                             has_alpha, width, height, tmp_rgb,
+                             picture->y_stride, picture->uv_stride,
+                             picture->a_stride, dst_y, dst_u, dst_v, dst_a);
+      if (height & 1) {
+        dst_y += (height - 1) * picture->y_stride;
+        dst_u += (height >> 1) * picture->uv_stride;
+        dst_v += (height >> 1) * picture->uv_stride;
+        r_ptr += (height - 1) * rgb_stride;
+        b_ptr += (height - 1) * rgb_stride;
+        g_ptr += (height - 1) * rgb_stride;
+        if (has_alpha) {
+          dst_a += (height - 1) * picture->a_stride;
+          a_ptr += (height - 1) * rgb_stride;
        }
-      } else {
+        WebPImportYUVAFromRGBALastLine(r_ptr, g_ptr, b_ptr, a_ptr, step,
+                                       has_alpha, width, tmp_rgb, dst_y, dst_u,
+                                       dst_v, dst_a);
+      }
+    } else {
+      // Copy of WebPImportYUVAFromRGBA/WebPImportYUVAFromRGBALastLine,
+      // but with dithering.
+      for (y = 0; y < (height >> 1); ++y) {
+        int rows_have_alpha = has_alpha;
        ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
        ConvertRowToY(r_ptr + rgb_stride, g_ptr + rgb_stride,
                      b_ptr + rgb_stride, step, dst_y + picture->y_stride,
                      width, rg);
-      }
-      dst_y += 2 * picture->y_stride;
-      if (has_alpha) {
-        rows_have_alpha &= !WebPExtractAlpha(a_ptr, rgb_stride, width, 2, dst_a,
-                                             picture->a_stride);
-        dst_a += 2 * picture->a_stride;
-      }
-      // Collect averaged R/G/B(/A)
-      if (!rows_have_alpha) {
-        AccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, width);
-      } else {
-        AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb, width);
-      }
-      // Convert to U/V
-      if (rg == NULL) {
-        WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
-      } else {
-        ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
-      }
-      dst_u += picture->uv_stride;
-      dst_v += picture->uv_stride;
-      r_ptr += 2 * rgb_stride;
-      b_ptr += 2 * rgb_stride;
-      g_ptr += 2 * rgb_stride;
-      if (has_alpha) a_ptr += 2 * rgb_stride;
-    }
-    if (height & 1) {  // extra last row
-      int row_has_alpha = has_alpha;
-      if (rg == NULL) {
-        if (is_rgb) {
-          WebPConvertRGBToY(r_ptr, dst_y, width, step);
-        } else {
-          WebPConvertBGRToY(b_ptr, dst_y, width, step);
+        dst_y += 2 * picture->y_stride;
+        if (has_alpha) {
+          rows_have_alpha &= !WebPExtractAlpha(a_ptr, rgb_stride, width, 2,
+                                               dst_a, picture->a_stride);
+          dst_a += 2 * picture->a_stride;
        }
-      } else {
-        ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
-      }
-      if (row_has_alpha) {
-        row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0);
-      }
-      // Collect averaged R/G/B(/A)
-      if (!row_has_alpha) {
-        // Collect averaged R/G/B
-        AccumulateRGB(r_ptr, g_ptr, b_ptr, step, /* rgb_stride = */ 0, tmp_rgb,
-                      width);
-      } else {
-        AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /* rgb_stride = */ 0,
-                       tmp_rgb, width);
-      }
-      if (rg == NULL) {
-        WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
-      } else {
+        // Collect averaged R/G/B(/A)
+        if (!rows_have_alpha) {
+          WebPAccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb,
+                            width);
+        } else {
+          WebPAccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb,
+                             width);
+        }
+        // Convert to U/V
        ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
+        dst_u += picture->uv_stride;
+        dst_v += picture->uv_stride;
+        r_ptr += 2 * rgb_stride;
+        b_ptr += 2 * rgb_stride;
+        g_ptr += 2 * rgb_stride;
+        if (has_alpha) a_ptr += 2 * rgb_stride;
+      }
+      if (height & 1) {  // extra last row
+        int row_has_alpha = has_alpha;
+        ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
+        if (row_has_alpha) {
+          row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0);
+        }
+        // Collect averaged R/G/B(/A)
+        if (!row_has_alpha) {
+          // Collect averaged R/G/B
+          WebPAccumulateRGB(r_ptr, g_ptr, b_ptr, step, /*rgb_stride=*/0,
+                            tmp_rgb, width);
+        } else {
+          WebPAccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /*rgb_stride=*/0,
+                             tmp_rgb, width);
+        }
+        // Convert to U/V. 'rg' is never NULL in this branch (the non-dithered
+        // case is handled by WebPImportYUVAFromRGBALastLine() above), so the
+        // dithering conversion is the only reachable path.
+        assert(rg != NULL);
+        ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
      }
    }
+
    WebPSafeFree(tmp_rgb);
  }
  return 1;