use DC error diffusion for U/V at low-quality

This fixes some color smearing due to heavy quantization. This is only enabled for q <= 30 (cf. ERROR_DIFFUSION_QUALITY). Change-Id: I07e83a4d38461357a32c9e214f7eadc6db73baa9
2025-12-24 05:56:27 +01:00 · 2017-12-11 05:07:13 -08:00
parent 1c59020b93
commit 96bf07c560
5 changed files with 106 additions and 3 deletions
--- a/src/enc/frame_enc.c
+++ b/src/enc/frame_enc.c
@@ -871,4 +871,3 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
 #endif    // DISABLE_TOKEN_BUFFER

 //------------------------------------------------------------------------------
-
--- a/src/enc/iterator_enc.c
+++ b/src/enc/iterator_enc.c
@@ -26,6 +26,9 @@ static void InitLeft(VP8EncIterator* const it) {
  memset(it->u_left_, 129, 8);
  memset(it->v_left_, 129, 8);
  it->left_nz_[8] = 0;
+  if (it->top_derr_ != NULL) {
+    memset(&it->left_derr_, 0, sizeof(it->left_derr_));
+  }
 }

 static void InitTop(VP8EncIterator* const it) {
@@ -33,6 +36,9 @@ static void InitTop(VP8EncIterator* const it) {
  const size_t top_size = enc->mb_w_ * 16;
  memset(enc->y_top_, 127, 2 * top_size);
  memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
+  if (enc->top_derr_ != NULL) {
+    memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_));
+  }
 }

 void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
@@ -76,6 +82,7 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
  it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
  it->u_left_ = it->y_left_ + 16 + 16;
  it->v_left_ = it->u_left_ + 16;
+  it->top_derr_ = enc->top_derr_;
  VP8IteratorReset(it);
 }

@@ -450,4 +457,3 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
 }

 //------------------------------------------------------------------------------
-
--- a/src/enc/quant_enc.c
+++ b/src/enc/quant_enc.c
@@ -826,6 +826,80 @@ static int ReconstructIntra4(VP8EncIterator* const it,
  return nz;
 }

+//------------------------------------------------------------------------------
+// DC-error diffusion
+
+// Diffusion weights, in 1/4th units. We under-correct a bit (only 3/4th of the
+// error is actually diffused) to avoid 'rainbow' chessboard patterns at q~=0.
+#define C1 2    // weight of error sent to the 4x4 block below (2/4th)
+#define C2 1    // weight of error sent to the 4x4 block on the right (1/4th)
+#define DSHIFT 2   // errors are right-shifted by DSHIFT, hence the 1/4th unit
+
+// Quantizes the DC value '*v' in place with entry #0 of 'mtx'. Returns the
+// signed error (input minus new '*v'), right-shifted by DSHIFT bits.
+static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+  int V = *v;
+  const int sign = (V < 0);
+  if (sign) V = -V;   // work on the magnitude; the sign is re-applied below
+  if (V > (int)mtx->zthresh_[0]) {
+    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
+    const int err = (V - qV);   // error on the magnitude
+    *v = sign ? -qV : qV;
+    return (sign ? -err : err) >> DSHIFT;
+  }
+  *v = 0;   // at or below zthresh_[0]: the whole input becomes the error
+  return (sign ? -V : V) >> DSHIFT;
+}
+
+static void CorrectDCValues(const VP8EncIterator* const it,
+                            const VP8Matrix* const mtx,
+                            int16_t tmp[][16], VP8ModeScore* const rd) {
+  //         | top[0] | top[1]
+  // --------+--------+---------
+  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
+  // left[1] | tmp[2]   tmp[3]        err2 err3
+  // Each DC gets C1 x (error above) + C2 x (error on the left) added, then is
+  // re-quantized. Final errors {err1,err2,err3} are saved in rd->derr[] and
+  // later restored as top[]/left[] on the next block (StoreDiffusionErrors).
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {   // ch = 0 / 1 covers tmp[0..3] / tmp[4..7]
+    const int16_t* const top = it->top_derr_[it->x_][ch];
+    const int16_t* const left = it->left_derr_[ch];
+    int16_t (* const c)[16] = &tmp[ch * 4];   // the four blocks of channel 'ch'
+    int err0, err1, err2, err3;
+    c[0][0] += C1 * top[0] + C2 * left[0];   // both errors come from context
+    err0 = QuantizeSingle(&c[0][0], mtx);
+    c[1][0] += C1 * top[1] + C2 * err0;      // err0 is on the left of block #1
+    err1 = QuantizeSingle(&c[1][0], mtx);
+    c[2][0] += C1 * err0 + C2 * left[1];     // err0 is above block #2
+    err2 = QuantizeSingle(&c[2][0], mtx);
+    c[3][0] += C1 * err1 + C2 * err2;        // err1 is above, err2 on the left
+    err3 = QuantizeSingle(&c[3][0], mtx);
+    rd->derr[ch][0] = err1;   // err0 is not kept: it was only used above
+    rd->derr[ch][1] = err2;
+    rd->derr[ch][2] = err3;
+  }
+}
+
+static void StoreDiffusionErrors(VP8EncIterator* const it,
+                                 const VP8ModeScore* const rd) {
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {   // save rd->derr[] as context, per u/v plane
+    int16_t* const top = it->top_derr_[it->x_][ch];   // slot of column x_
+    int16_t* const left = it->left_derr_[ch];
+    left[0] = rd->derr[ch][0];   // err1 (block #1) becomes left[0]
+    left[1] = rd->derr[ch][2];   // err3 (block #3) becomes left[1]
+    top[0]  = rd->derr[ch][1];   // err2 (block #2) becomes top[0]
+    top[1]  = rd->derr[ch][2];   // err3 is stored twice: also as top[1].
+  }
+}
+
+#undef C1
+#undef C2
+#undef DSHIFT
+
+//------------------------------------------------------------------------------
+
 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
@@ -839,6 +913,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
+  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
+
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
@@ -1101,6 +1177,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
+      if (it->top_derr_ != NULL) {
+        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
+      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
@@ -1109,6 +1188,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
+  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
+    StoreDiffusionErrors(it, rd);
+  }
 }

 //------------------------------------------------------------------------------
--- a/src/enc/vp8i_enc.h
+++ b/src/enc/vp8i_enc.h
@@ -120,6 +120,9 @@ static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
 // Uncomment the following to remove token-buffer code:
 // #define DISABLE_TOKEN_BUFFER

+// quality at or below which error-diffusion is enabled
+#define ERROR_DIFFUSION_QUALITY 30
+
 //------------------------------------------------------------------------------
 // Headers

@@ -201,6 +204,8 @@ typedef struct {
  score_t i4_penalty_;   // penalty for using Intra4
 } VP8SegmentInfo;

+typedef int16_t DError[2 /* u/v */][2 /* block 0/1 of top or left edge */];
+
 // Handy transient struct to accumulate score and info during RD-optimization
 // and mode evaluation.
 typedef struct {
@@ -213,6 +218,7 @@ typedef struct {
  uint8_t modes_i4[16];       // mode numbers for intra4 predictions
  int mode_uv;                // mode number of chroma prediction
  uint32_t nz;                // non-zero blocks
+  int16_t derr[2][3];         // DC diffusion errors for U/V for blocks #1/2/3
 } VP8ModeScore;

 // Iterator structure to iterate through macroblocks, pointing to the
@@ -242,6 +248,9 @@ typedef struct {
  int           count_down0_;      // starting counter value (for progress)
  int           percent0_;         // saved initial progress percent

+  DError        left_derr_;        // left error diffusion (u/v)
+  DError       *top_derr_;         // top diffusion error - NULL if disabled
+
  uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
  uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
  uint8_t* v_left_;    // left v samples (addressable from index -1 to 7)
@@ -401,6 +410,7 @@ struct VP8Encoder {
  uint8_t*   uv_top_;    // top u/v samples.
                         // U and V are packed into 16 bytes (8 U + 8 V)
  LFStats*   lf_stats_;  // autofilter stats (if NULL, autofilter is off)
+  DError*    top_derr_;  // diffusion error (NULL if disabled)
 };

 //------------------------------------------------------------------------------
--- a/src/enc/webp_enc.c
+++ b/src/enc/webp_enc.c
@@ -159,12 +159,15 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
      + WEBP_ALIGN_CST;                      // align all
  const size_t lf_stats_size =
      config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
+  const size_t top_derr_size = (config->quality <= ERROR_DIFFUSION_QUALITY) ?
+      mb_w * sizeof(*enc->top_derr_) : 0;
  uint8_t* mem;
  const uint64_t size = (uint64_t)sizeof(*enc)   // main struct
                      + WEBP_ALIGN_CST           // cache alignment
                      + info_size                // modes info
                      + preds_size               // prediction modes
                      + samples_size             // top/left samples
+                      + top_derr_size            // top diffusion error
                      + nz_size                  // coeff context bits
                      + lf_stats_size;           // autofilter stats

@@ -175,11 +178,12 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
         "                info: %ld\n"
         "               preds: %ld\n"
         "         top samples: %ld\n"
+         "       top diffusion: %ld\n"
         "            non-zero: %ld\n"
         "            lf-stats: %ld\n"
         "               total: %ld\n",
         sizeof(*enc) + WEBP_ALIGN_CST, info_size,
-         preds_size, samples_size, nz_size, lf_stats_size, size);
+         preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size);
  printf("Transient object sizes:\n"
         "      VP8EncIterator: %ld\n"
         "        VP8ModeScore: %ld\n"
@@ -219,6 +223,8 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  enc->y_top_ = mem;
  enc->uv_top_ = enc->y_top_ + top_stride;
  mem += 2 * top_stride;
+  enc->top_derr_ = top_derr_size ? (DError*)mem : NULL;
+  mem += top_derr_size;
  assert(mem <= (uint8_t*)enc + size);

  enc->config_ = config;