add a -dither dithering option to the decoder

Even at high quality settings, the U/V quantizer step is limited to 4, which can lead to banding on gradients. This option selectively applies some randomness to potentially flattened-out U/V blocks in order to attenuate the banding. The option is off by default in 'dwebp', but is set to -dither 50 by default in 'vwebp'. Note: depending on the number of blocks selectively dithered, decoding appears to be up to 10% slower. Change-Id: Icc2446007f33ddacb60b3a80a9e63f2d5ad162de
2025-07-19 07:20:02 +02:00 · 2013-11-26 22:59:02 +01:00
parent e812401299
commit cbdd3e6e53
11 changed files with 138 additions and 11 deletions
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -555,6 +555,8 @@ static void Help(void) {
         "  -version  .... print version number and exit.\n"
         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
         "  -nofilter .... disable in-loop filtering.\n"
+         "  -nodither .... disable dithering.\n"
+         "  -dither <d> .. dithering strength (in 0..100)\n"
         "  -mt .......... use multi-threading\n"
         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
@ -625,6 +627,10 @@ int main(int argc, const char *argv[]) {
      format = YUV;
    } else if (!strcmp(argv[c], "-mt")) {
      config.options.use_threads = 1;
+    } else if (!strcmp(argv[c], "-nodither")) {
+      config.options.dithering_strength = 0;
+    } else if (!strcmp(argv[c], "-dither") && c < argc - 1) {
+      config.options.dithering_strength = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
      config.options.use_cropping = 1;
      config.options.crop_left   = strtol(argv[++c], NULL, 0);
@ -719,7 +725,7 @@ int main(int argc, const char *argv[]) {
    if (!incremental) {
      status = WebPDecode(data, data_size, &config);
    } else {
-      WebPIDecoder* const idec = WebPINewDecoder(output_buffer);
+      WebPIDecoder* const idec = WebPIDecode(data, data_size, &config);
      if (idec == NULL) {
        fprintf(stderr, "Failed during WebPINewDecoder().\n");
        status = VP8_STATUS_OUT_OF_MEMORY;
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -376,6 +376,7 @@ static void Help(void) {
         "  -noicc ....... don't use the icc profile if present.\n"
         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
         "  -nofilter .... disable in-loop filtering.\n"
+         "  -dither <int>  dithering strength (0..100). Default=50.\n"
         "  -mt .......... use multi-threading.\n"
         "  -info ........ print info.\n"
         "  -h     ....... this help message.\n"
@ -397,6 +398,7 @@ int main(int argc, char *argv[]) {
    fprintf(stderr, "Library version mismatch!\n");
    return -1;
  }
+  config->options.dithering_strength = 50;
  kParams.use_color_profile = 1;

  for (c = 1; c < argc; ++c) {
@ -409,6 +411,8 @@ int main(int argc, char *argv[]) {
      config->options.no_fancy_upsampling = 1;
    } else if (!strcmp(argv[c], "-nofilter")) {
      config->options.bypass_filtering = 1;
+    } else if (!strcmp(argv[c], "-dither") && c + 1 < argc) {
+      config->options.dithering_strength = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-info")) {
      kParams.print_info = 1;
    } else if (!strcmp(argv[c], "-version")) {
--- a/man/dwebp.1
+++ b/man/dwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "May 10, 2013"
+.TH DWEBP 1 "November 26, 2013"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@ -55,7 +55,15 @@ edges (especially the red ones), but should be faster.
 .B \-nofilter
 Don't use the in-loop filtering process even if it is required by
 the bitstream. This may produce visible blocks on the non-compliant output,
-but will make the decoding faster.
+but it will make the decoding faster.
+.TP
+.BI \-dither " strength
+Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
+post-processing effect applied to chroma components in lossy compression.
+It helps by smoothing gradients and avoiding banding artifacts.
+.TP
+.B \-nodither
+Disable all dithering (default).
 .TP
 .B \-mt
 Use multi-threading for decoding, if possible.
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@ -148,6 +148,82 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
  }
 }

+//------------------------------------------------------------------------------
+// Dithering
+
+#define DITHER_AMP_TAB_SIZE 12   // number of uv_quant_ values covered by the table below
+static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
+  // roughly, it's dqm->uv_mat_[1]
+  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1   // in 1/8th units: VP8InitDithering() applies '>> 3'
+};
+
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec) {   // derives dec->dither_ and dec->dqm_[].dither_ from 'options'
+  assert(dec != NULL);
+  if (options != NULL) {   // NULL options: nothing is modified
+    const int d = options->dithering_strength;
+    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;   // largest fixed-point amplitude
+    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);   // strength clamped to 0..100, rescaled to 0..max_amp
+    if (f > 0) {
+      int s;
+      int all_amp = 0;   // OR of every segment's amplitude: non-zero if any segment dithers
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        VP8QuantMatrix* const dqm = &dec->dqm_[s];
+        if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
+          // TODO(skal): should we specially dither more for uv_quant_ < 0?
+          const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;   // negative quantizers use entry 0
+          dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
+        }   // otherwise dqm->dither_ is left as is (NOTE(review): assumed to be 0 at this point -- confirm)
+        all_amp |= dqm->dither_;
+      }
+      if (all_amp != 0) {   // seed the generator and enable dithering only if some segment has an amplitude
+        VP8InitRandom(&dec->dithering_rg_, 1.0f);
+        dec->dither_ = 1;
+      }
+    }
+  }
+}
+
+// minimal amp that will provide a non-zero dithering effect
+#define MIN_DITHER_AMP 4
+#define DITHER_DESCALE 4   // right-shift turning the centered random value into a sample delta
+#define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))   // rounding bias added before that shift
+#define DITHER_AMP_BITS 8   // Dither8x8() requests DITHER_AMP_BITS + 1 random bits
+#define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)   // subtracted to re-center the random value around 0
+
+static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {   // adds pseudo-random noise to the 8x8 samples at 'dst' ('bps' = row stride)
+  int i, j;
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i) {
+      // TODO: could be made faster with SSE2
+      const int bits =
+          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;   // one draw per sample, re-centered around 0
+      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
+      const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
+      const int v = (int)dst[i] + delta;
+      dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;   // clamp to [0, 255]
+    }
+    dst += bps;   // move to the next row of the block
+  }
+}
+
+static void DitherRow(VP8Decoder* const dec) {   // dithers the cached U and V samples of each macroblock in [tl_mb_x_, br_mb_x_)
+  int mb_x;
+  assert(dec->dither_);   // callers must check dec->dither_ first
+  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
+    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+    const VP8MBData* const data = ctx->mb_data_ + mb_x;
+    const int cache_id = ctx->id_;
+    const int uv_bps = dec->cache_uv_stride_;
+    if (data->dither_ >= MIN_DITHER_AMP) {   // skip macroblocks whose amplitude is below the useful minimum
+      uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;   // 8x8 U block of this macroblock in the cache
+      uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;   // matching 8x8 V block
+      Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);
+      Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // This function is called after a row of macroblocks is finished decoding.
 // It also takes into account the following restrictions:
@ -186,6 +262,10 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
    FilterRow(dec);
  }

+  if (dec->dither_) {
+    DitherRow(dec);
+  }
+
  if (io->put != NULL) {
    int y_start = MACROBLOCK_VPOS(mb_y);
    int y_end = MACROBLOCK_VPOS(mb_y + 1);
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@ -423,6 +423,7 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
  // This change must be done before calling VP8InitFrame()
  dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
                                       io->width, io->height);
+  VP8InitDithering(params->options, dec);
  if (!CopyParts0Data(idec)) {
    return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
  }
--- a/src/dec/quant.c
+++ b/src/dec/quant.c
@ -104,6 +104,8 @@ void VP8ParseQuant(VP8Decoder* const dec) {

      m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
      m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
+
+      m->uv_quant_ = q + dquv_ac;   // for dithering strength evaluation
    }
  }
 }
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -561,6 +561,12 @@ static int ParseResiduals(VP8Decoder* const dec,

  block->non_zero_y_ = non_zero_y;
  block->non_zero_uv_ = non_zero_uv;
+
+  // We look at the mode-code of each U/V block: dithering is kept only when
+  // every block has less than three non-zero coeffs (code < 2), and is turned
+  // off for the macroblock as soon as one block has more than that.
+  block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
+
  return !(non_zero_y | non_zero_uv);  // will be used for further optimization
 }

--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -17,6 +17,7 @@
 #include <string.h>     // for memcpy()
 #include "./vp8li.h"
 #include "../utils/bit_reader.h"
+#include "../utils/random.h"
 #include "../utils/thread.h"
 #include "../dsp/dsp.h"

@ -173,6 +174,9 @@ typedef struct {  // Top/Left Contexts used for syntax-parsing
 typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
  quant_t y1_mat_, y2_mat_, uv_mat_;
+
+  int uv_quant_;   // U/V quantizer value
+  int dither_;     // dithering amplitude (0 = off, max=255)
 } VP8QuantMatrix;

 // Data needed to reconstruct a macroblock
@ -190,6 +194,7 @@ typedef struct {
  // This allows to call specialized transform functions.
  uint32_t non_zero_y_;
  uint32_t non_zero_uv_;
+  uint8_t dither_;      // local dithering strength (deduced from non_zero_*)
 } VP8MBData;

 // Persistent information needed by the parallel processing
@ -244,6 +249,10 @@ struct VP8Decoder {
  // per-partition boolean decoders.
  VP8BitReader parts_[MAX_NUM_PARTITIONS];

+  // Dithering strength, deduced from decoding options
+  int dither_;                // whether to use dithering or not
+  VP8Random dithering_rg_;    // random generator for dithering
+
  // dequantization (one set of DC/AC dequant factor per segment)
  VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];

@ -324,7 +333,10 @@ int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
 int VP8GetThreadMethod(const WebPDecoderOptions* const options,
                       const WebPHeaderStructure* const headers,
                       int width, int height);
-// Process the last decoded row (filtering + output)
+// Initialize dithering post-process if needed.
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec);
+// Process the last decoded row (filtering + output).
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@ -474,6 +474,7 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
        // This change must be done before calling VP8Decode()
        dec->mt_method_ = VP8GetThreadMethod(params->options, &headers,
                                             io.width, io.height);
+        VP8InitDithering(params->options, dec);
        if (!VP8Decode(dec, &io)) {
          status = dec->status_;
        }
--- a/src/utils/random.h
+++ b/src/utils/random.h
@ -34,8 +34,10 @@ typedef struct {
 void VP8InitRandom(VP8Random* const rg, float dithering);

 // Returns a centered pseudo-random number with 'num_bits' amplitude.
-// (uses D.Knuth's Difference-based random generator)
-static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
+// (uses D.Knuth's Difference-based random generator).
+// 'amp' is in VP8_RANDOM_DITHER_FIX fixed-point precision.
+static WEBP_INLINE int VP8RandomBits2(VP8Random* const rg, int num_bits,
+                                      int amp) {
  int diff;
  assert(num_bits + VP8_RANDOM_DITHER_FIX <= 31);
  diff = rg->tab_[rg->index1_] - rg->tab_[rg->index2_];
@ -43,12 +45,16 @@ static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
  rg->tab_[rg->index1_] = diff;
  if (++rg->index1_ == VP8_RANDOM_TABLE_SIZE) rg->index1_ = 0;
  if (++rg->index2_ == VP8_RANDOM_TABLE_SIZE) rg->index2_ = 0;
-  diff = (diff << 1) >> (32 - num_bits);    // sign-extend, 0-center
-  diff = (diff * rg->amp_) >> VP8_RANDOM_DITHER_FIX;   // restrict range
-  diff += 1 << (num_bits - 1);              // shift back to 0.5-center
+  diff = (diff << 1) >> (32 - num_bits);         // sign-extend, 0-center
+  diff = (diff * amp) >> VP8_RANDOM_DITHER_FIX;  // restrict range
+  diff += 1 << (num_bits - 1);                   // shift back to 0.5-center
  return diff;
 }

+static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {   // VP8RandomBits2() using the generator's own amplitude rg->amp_
+  return VP8RandomBits2(rg, num_bits, rg->amp_);
+}
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/webp/decode.h
+++ b/src/webp/decode.h
@ -20,7 +20,7 @@
 extern "C" {
 #endif

-#define WEBP_DECODER_ABI_VERSION 0x0202    // MAJOR(8b) + MINOR(8b)
+#define WEBP_DECODER_ABI_VERSION 0x0203    // MAJOR(8b) + MINOR(8b)

 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@ -441,11 +441,12 @@ struct WebPDecoderOptions {
  int use_scaling;                    // if true, scaling is applied _afterward_
  int scaled_width, scaled_height;    // final resolution
  int use_threads;                    // if true, use multi-threaded decoding
+  int dithering_strength;             // dithering strength (0=Off, 100=full)

  // Unused for now:
  int force_rotation;                 // forced rotation (to be applied _last_)
  int no_enhancement;                 // if true, discard enhancement layer
-  uint32_t pad[6];                    // padding for later use
+  uint32_t pad[5];                    // padding for later use
 };

 // Main object storing the configuration for advanced decoding.