From bbe32df1e333183a8e44f5d0619d83812fb231e2 Mon Sep 17 00:00:00 2001 From: skal Date: Sat, 14 Jun 2014 00:06:16 +0200 Subject: [PATCH] add alpha dithering for lossy new options: dwebp -alpha_dither vwebp -noalphadither When the source was marked as quantized, we use a threshold-averaging filter to smooth the decoded alpha plane. Note: this option forces the decoding of alpha data in one pass, and might slow the decoding a bit. The new field in WebPDecoderOptions struct is 'alpha_dithering_strength' (0 by default, means: off). Max strength value is '100'. Change-Id: I218e21af96360d4781587fede95f8ea4e2b7287a --- README | 2 + examples/dwebp.c | 3 + examples/vwebp.c | 4 + man/dwebp.1 | 7 +- man/vwebp.1 | 6 +- src/dec/alpha.c | 16 ++- src/dec/frame.c | 7 + src/dec/vp8i.h | 3 +- src/utils/quant_levels_dec.c | 267 ++++++++++++++++++++++++++++++++++- src/utils/quant_levels_dec.h | 11 +- src/webp/decode.h | 3 +- 11 files changed, 308 insertions(+), 21 deletions(-) diff --git a/README b/README index 434d6f94..63da6a2f 100644 --- a/README +++ b/README @@ -270,6 +270,7 @@ Use following options to convert into alternate image formats: -nofilter .... disable in-loop filtering -nodither .... disable dithering -dither .. dithering strength (in 0..100) + -alpha_dither use alpha-plane dithering if needed. -mt .......... use multi-threading -crop ... crop output with the given rectangle -scale .......... scale the output (*after* any cropping) @@ -297,6 +298,7 @@ Options are: -nofancy ..... don't use the fancy YUV420 upscaler. -nofilter .... disable in-loop filtering. -dither dithering strength (0..100). Default=50. + -noalphadither disable alpha plane dithering. -mt .......... use multi-threading. -info ........ print info. -h ....... this help message. diff --git a/examples/dwebp.c b/examples/dwebp.c index 9bde6ec2..0fc8be06 100644 --- a/examples/dwebp.c +++ b/examples/dwebp.c @@ -557,6 +557,7 @@ static void Help(void) { " -nofilter .... disable in-loop filtering\n" " -nodither .... disable dithering\n" " -dither .. dithering strength (in 0..100)\n" + " -alpha_dither use alpha-plane dithering if needed.\n" " -mt .......... use multi-threading\n" " -crop ... crop output with the given rectangle\n" " -scale .......... scale the output (*after* any cropping)\n" @@ -623,6 +624,8 @@ int main(int argc, const char *argv[]) { format = YUV; } else if (!strcmp(argv[c], "-mt")) { config.options.use_threads = 1; + } else if (!strcmp(argv[c], "-alpha_dither")) { + config.options.alpha_dithering_strength = 100; } else if (!strcmp(argv[c], "-nodither")) { config.options.dithering_strength = 0; } else if (!strcmp(argv[c], "-dither") && c < argc - 1) { diff --git a/examples/vwebp.c b/examples/vwebp.c index 64c0f632..459a4f99 100644 --- a/examples/vwebp.c +++ b/examples/vwebp.c @@ -377,6 +377,7 @@ static void Help(void) { " -nofancy ..... don't use the fancy YUV420 upscaler.\n" " -nofilter .... disable in-loop filtering.\n" " -dither dithering strength (0..100). Default=50.\n" + " -noalphadither disable alpha plane dithering.\n" " -mt .......... use multi-threading.\n" " -info ........ print info.\n" " -h ....... 
this help message.\n" @@ -399,6 +400,7 @@ int main(int argc, char *argv[]) { return -1; } config->options.dithering_strength = 50; + config->options.alpha_dithering_strength = 100; kParams.use_color_profile = 1; for (c = 1; c < argc; ++c) { @@ -411,6 +413,8 @@ int main(int argc, char *argv[]) { config->options.no_fancy_upsampling = 1; } else if (!strcmp(argv[c], "-nofilter")) { config->options.bypass_filtering = 1; + } else if (!strcmp(argv[c], "-noalphadither")) { + config->options.alpha_dithering_strength = 0; } else if (!strcmp(argv[c], "-dither") && c + 1 < argc) { config->options.dithering_strength = strtol(argv[++c], NULL, 0); } else if (!strcmp(argv[c], "-info")) { diff --git a/man/dwebp.1 b/man/dwebp.1 index 6ef505fa..5741be6a 100644 --- a/man/dwebp.1 +++ b/man/dwebp.1 @@ -1,5 +1,5 @@ .\" Hey, EMACS: -*- nroff -*- -.TH DWEBP 1 "March 7, 2014" +.TH DWEBP 1 "June 13, 2014" .SH NAME dwebp \- decompress a WebP file to an image file .SH SYNOPSIS @@ -68,6 +68,11 @@ Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a post-processing effect applied to chroma components in lossy compression. It helps by smoothing gradients and avoiding banding artifacts. .TP +.BI \-alpha_dither +If the compressed file contains a transparency plane that was quantized +during compression, this flag will allow dithering the reconstructed plane +in order to generate smoother transparency gradients. +.TP .B \-nodither Disable all dithering (default). .TP diff --git a/man/vwebp.1 b/man/vwebp.1 index 9115009b..a842f6ef 100644 --- a/man/vwebp.1 +++ b/man/vwebp.1 @@ -1,5 +1,5 @@ .\" Hey, EMACS: -*- nroff -*- -.TH VWEBP 1 "March 7, 2014" +.TH VWEBP 1 "June 13, 2014" .SH NAME vwebp \- decompress a WebP file and display it in a window .SH SYNOPSIS @@ -34,6 +34,10 @@ Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a post-processing effect applied to chroma components in lossy compression. It helps by smoothing gradients and avoiding banding artifacts. Default: 50. .TP +.BI \-noalphadither +By default, quantized transparency planes are dithered during decompression, +to smooth the gradients. This flag will prevent this dithering. +.TP .B \-mt Use multi-threading for decoding, if possible. 
.TP diff --git a/src/dec/alpha.c b/src/dec/alpha.c index 608f7c42..f23ba7d6 100644 --- a/src/dec/alpha.c +++ b/src/dec/alpha.c @@ -108,12 +108,6 @@ static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) { unfilter_func(width, height, width, row, num_rows, output); } - if (alph_dec->pre_processing_ == ALPHA_PREPROCESSED_LEVELS) { - if (!DequantizeLevels(output, width, height, row, num_rows)) { - return 0; - } - } - if (row + num_rows == dec->pic_hdr_.height_) { dec->is_alpha_decoded_ = 1; } @@ -143,12 +137,22 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec, dec->alph_dec_ = NULL; return NULL; } + // if we allowed use of alpha dithering, check whether it's needed at all + if (dec->alph_dec_->pre_processing_ != ALPHA_PREPROCESSED_LEVELS) { + dec->alpha_dithering_ = 0; // disable dithering + } else { + num_rows = height; // decode everything in one pass + } } if (!dec->is_alpha_decoded_) { int ok = 0; assert(dec->alph_dec_ != NULL); ok = ALPHDecode(dec, row, num_rows); + if (ok && dec->alpha_dithering_ > 0) { + ok = WebPDequantizeLevels(dec->alpha_plane_, width, height, + dec->alpha_dithering_); + } if (!ok || dec->is_alpha_decoded_) { ALPHDelete(dec->alph_dec_); dec->alph_dec_ = NULL; diff --git a/src/dec/frame.c b/src/dec/frame.c index 01d33d3e..efe886b1 100644 --- a/src/dec/frame.c +++ b/src/dec/frame.c @@ -177,6 +177,13 @@ void VP8InitDithering(const WebPDecoderOptions* const options, dec->dither_ = 1; } } + // potentially allow alpha dithering + dec->alpha_dithering_ = options->alpha_dithering_strength; + if (dec->alpha_dithering_ > 100) { + dec->alpha_dithering_ = 100; + } else if (dec->alpha_dithering_ < 0) { + dec->alpha_dithering_ = 0; + } } } diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h index 35a757b7..d5b67660 100644 --- a/src/dec/vp8i.h +++ b/src/dec/vp8i.h @@ -296,7 +296,8 @@ struct VP8Decoder { const uint8_t* alpha_data_; // compressed alpha data (if present) size_t alpha_data_size_; int is_alpha_decoded_; // true if alpha_data_ is decoded in alpha_plane_ - uint8_t* alpha_plane_; // output. Persistent, contains the whole data. + uint8_t* alpha_plane_; // output. Persistent, contains the whole data. + int alpha_dithering_; // derived from decoding options (0=off, 100=full). }; //------------------------------------------------------------------------------ diff --git a/src/utils/quant_levels_dec.c b/src/utils/quant_levels_dec.c index 8489705a..c599e40a 100644 --- a/src/utils/quant_levels_dec.c +++ b/src/utils/quant_levels_dec.c @@ -7,18 +7,273 @@ // be found in the AUTHORS file in the root of the source tree. // ----------------------------------------------------------------------------- // -// TODO(skal): implement gradient smoothing. +// Implement gradient smoothing: we replace a current alpha value by its +// surrounding average if it's close enough (that is: the change will be less +// than the minimum distance between two quantized level). +// We use sliding window for computing the 2d moving average. 
// // Author: Skal (pascal.massimino@gmail.com) #include "./quant_levels_dec.h" -int DequantizeLevels(uint8_t* const data, int width, int height, - int row, int num_rows) { - if (data == NULL || width <= 0 || height <= 0 || row < 0 || num_rows < 0 || - row + num_rows > height) { - return 0; +#include // for memset + +#include "./utils.h" + +// #define USE_DITHERING // uncomment to enable ordered dithering (not vital) + +#define FIX 16 // fix-point precision for averaging +#define LFIX 2 // extra precision for look-up table +#define LUT_SIZE ((1 << (8 + LFIX)) - 1) // look-up table size + +#if defined(USE_DITHERING) + +#define DFIX 4 // extra precision for ordered dithering +#define DSIZE 4 // dithering size (must be a power of two) +// cf. http://en.wikipedia.org/wiki/Ordered_dithering +static const uint8_t kOrderedDither[DSIZE][DSIZE] = { + { 0, 8, 2, 10 }, // coefficients are in DFIX fixed-point precision + { 12, 4, 14, 6 }, + { 3, 11, 1, 9 }, + { 15, 7, 13, 5 } +}; + +#else +#define DFIX 0 +#endif + +typedef struct { + int width_, height_; // dimension + int row_; // current input row being processed + uint8_t* src_; // input pointer + uint8_t* dst_; // output pointer + + int radius_; // filter radius (=delay) + int scale_; // normalization factor, in FIX bits precision + + void* mem_; // all memory + + // various scratch buffers + uint16_t* start_; + uint16_t* cur_; + uint16_t* end_; + uint16_t* top_; + uint16_t* average_; + + // input levels distribution + int num_levels_; // number of quantized levels + int min_, max_; // min and max level values + int min_level_dist_; // smallest distance between two consecutive levels + + int16_t* correction_; // size = 1 + 2*LUT_SIZE -> ~4k memory +} SmoothParams; + +//------------------------------------------------------------------------------ + +#define CLIP_MASK (int)(~0U << (8 + DFIX)) +static WEBP_INLINE uint8_t clip_8b(int v) { + return (!(v & CLIP_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u; +} + +// vertical accumulation +static void VFilter(SmoothParams* const p) { + const uint8_t* src = p->src_; + const int w = p->width_; + uint16_t* const cur = p->cur_; + const uint16_t* const top = p->top_; + uint16_t* const out = p->end_; + uint16_t sum = 0; // all arithmetic is modulo 16bit + int x; + + for (x = 0; x < w; ++x) { + uint16_t new_value; + sum += src[x]; + new_value = top[x] + sum; + out[x] = new_value - cur[x]; // vertical sum of 'r' pixels. + cur[x] = new_value; } + // move input pointers one row down + p->top_ = p->cur_; + p->cur_ += w; + if (p->cur_ == p->end_) p->cur_ = p->start_; // roll-over + // We replicate edges, as it's somewhat easier as a boundary condition. + // That's why we don't update the 'src' pointer on top/bottom area: + if (p->row_ >= 0 && p->row_ < p->height_ - 1) { + p->src_ += p->width_; + } +} + +// horizontal accumulation. We use mirror replication of missing pixels, as it's +// a little easier to implement (surprisingly). 
+static void HFilter(SmoothParams* const p) { + const uint16_t* const in = p->end_; + uint16_t* const out = p->average_; + const uint32_t scale = p->scale_; + const int w = p->width_; + const int r = p->radius_; + + int x; + for (x = 0; x <= r; ++x) { // left mirroring + const uint16_t delta = in[x + r - 1] + in[r - x]; + out[x] = (delta * scale) >> FIX; + } + for (; x < w - r; ++x) { // bulk middle run + const uint16_t delta = in[x + r] - in[x - r - 1]; + out[x] = (delta * scale) >> FIX; + } + for (; x < w; ++x) { // right mirroring + const uint16_t delta = + 2 * in[w - 1] - in[2 * w - 2 - r - x] - in[x - r - 1]; + out[x] = (delta * scale) >> FIX; + } +} + +// emit one filtered output row +static void ApplyFilter(SmoothParams* const p) { + const uint16_t* const average = p->average_; + const int w = p->width_; + const int16_t* const correction = p->correction_; +#if defined(USE_DITHERING) + const uint8_t* const dither = kOrderedDither[p->row_ % DSIZE]; +#endif + uint8_t* const dst = p->dst_; + int x; + for (x = 0; x < w; ++x) { + const int v = dst[x]; + if (v < p->max_ && v > p->min_) { + const int c = (v << DFIX) + correction[average[x] - (v << LFIX)]; +#if defined(USE_DITHERING) + dst[x] = clip_8b(c + dither[x % DSIZE]); +#else + dst[x] = clip_8b(c); +#endif + } + } + p->dst_ += w; // advance output pointer +} + +//------------------------------------------------------------------------------ +// Initialize correction table + +static void InitCorrectionLUT(int16_t* const lut, int min_dist) { + // The correction curve is: + // f(x) = x for x <= threshold2 + // f(x) = 0 for x >= threshold1 + // and a linear interpolation for range x=[threshold2, threshold1] + // (along with f(-x) = -f(x) symmetry). + // Note that: threshold2 = 3/4 * threshold1 + const int threshold1 = min_dist << LFIX; + const int threshold2 = (3 * threshold1) >> 2; + const int max_threshold = threshold2 << DFIX; + const int delta = threshold1 - threshold2; + int i; + for (i = 1; i <= LUT_SIZE; ++i) { + int c = (i <= threshold2) ? (i << DFIX) + : (i < threshold1) ? max_threshold * (threshold1 - i) / delta + : 0; + c >>= LFIX; + lut[+i] = +c; + lut[-i] = -c; + } + lut[0] = 0; +} + +static void CountLevels(const uint8_t* const data, int size, + SmoothParams* const p) { + int i, last_level; + uint8_t used_levels[256] = { 0 }; + p->min_ = 255; + p->max_ = 0; + for (i = 0; i < size; ++i) { + const int v = data[i]; + if (v < p->min_) p->min_ = v; + if (v > p->max_) p->max_ = v; + used_levels[v] = 1; + } + // Compute the mininum distance between two non-zero levels. + p->min_level_dist_ = p->max_ - p->min_; + last_level = -1; + for (i = 0; i < 256; ++i) { + if (used_levels[i]) { + ++p->num_levels_; + if (last_level >= 0) { + const int level_dist = i - last_level; + if (level_dist < p->min_level_dist_) { + p->min_level_dist_ = level_dist; + } + } + last_level = i; + } + } +} + +// Initialize all params. 
+static int InitParams(uint8_t* const data, int width, int height, + int radius, SmoothParams* const p) { + const int R = 2 * radius + 1; // total size of the kernel + + const size_t size_scratch_m = (R + 1) * width * sizeof(*p->start_); + const size_t size_m = width * sizeof(*p->average_); + const size_t size_lut = (1 + 2 * LUT_SIZE) * sizeof(*p->correction_); + const size_t total_size = size_scratch_m + size_m + size_lut; + uint8_t* mem = (uint8_t*)WebPSafeMalloc(1U, total_size); + + if (mem == NULL) return 0; + p->mem_ = (void*)mem; + + p->start_ = (uint16_t*)mem; + p->cur_ = p->start_; + p->end_ = p->start_ + R * width; + p->top_ = p->end_ - width; + memset(p->top_, 0, width * sizeof(*p->top_)); + mem += size_scratch_m; + + p->average_ = (uint16_t*)mem; + mem += size_m; + + p->width_ = width; + p->height_ = height; + p->src_ = data; + p->dst_ = data; + p->radius_ = radius; + p->scale_ = (1 << (FIX + LFIX)) / (R * R); // normalization constant + p->row_ = -radius; + + // analyze the input distribution so we can best-fit the threshold + CountLevels(data, width * height, p); + + // correction table + p->correction_ = ((int16_t*)mem) + LUT_SIZE; + InitCorrectionLUT(p->correction_, p->min_level_dist_); + return 1; } +static void CleanupParams(SmoothParams* const p) { + WebPSafeFree(p->mem_); +} + +int WebPDequantizeLevels(uint8_t* const data, int width, int height, + int strength) { + const int radius = 4 * strength / 100; + if (strength < 0 || strength > 100) return 0; + if (data == NULL || width <= 0 || height <= 0) return 0; // bad params + if (radius > 0) { + SmoothParams p; + memset(&p, 0, sizeof(p)); + if (!InitParams(data, width, height, radius, &p)) return 0; + if (p.num_levels_ > 2) { + for (; p.row_ < p.height_; ++p.row_) { + VFilter(&p); // accumulate average of input + // Need to wait few rows in order to prime the filter, + // before emitting some output. + if (p.row_ >= p.radius_) { + HFilter(&p); + ApplyFilter(&p); + } + } + } + CleanupParams(&p); + } + return 1; +} diff --git a/src/utils/quant_levels_dec.h b/src/utils/quant_levels_dec.h index 0288383a..9aab0680 100644 --- a/src/utils/quant_levels_dec.h +++ b/src/utils/quant_levels_dec.h @@ -21,11 +21,12 @@ extern "C" { #endif // Apply post-processing to input 'data' of size 'width'x'height' assuming that -// the source was quantized to a reduced number of levels. The post-processing -// will be applied to 'num_rows' rows of 'data' starting from 'row'. -// Returns false in case of error (data is NULL, invalid parameters, ...). -int DequantizeLevels(uint8_t* const data, int width, int height, - int row, int num_rows); +// the source was quantized to a reduced number of levels. +// Strength is in [0..100] and controls the amount of dithering applied. +// Returns false in case of error (data is NULL, invalid parameters, +// malloc failure, ...). 
+int WebPDequantizeLevels(uint8_t* const data, int width, int height, + int strength); #ifdef __cplusplus } // extern "C" diff --git a/src/webp/decode.h b/src/webp/decode.h index 580274c8..e0dd5703 100644 --- a/src/webp/decode.h +++ b/src/webp/decode.h @@ -443,11 +443,12 @@ struct WebPDecoderOptions { int use_threads; // if true, use multi-threaded decoding int dithering_strength; // dithering strength (0=Off, 100=full) int flip; // flip output vertically + int alpha_dithering_strength; // alpha dithering strength in [0..100] // Unused for now: int force_rotation; // forced rotation (to be applied _last_) int no_enhancement; // if true, discard enhancement layer - uint32_t pad[4]; // padding for later use + uint32_t pad[3]; // padding for later use }; // Main object storing the configuration for advanced decoding.
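Usage note (illustrative, not part of the patch): with the new 'alpha_dithering_strength' field added to WebPDecoderOptions, an application using the advanced decoding API can request alpha dithering as sketched below. The helper name is hypothetical and error handling is abbreviated; 'data'/'data_size' are assumed to hold a complete WebP bitstream.

#include <stddef.h>        // for size_t
#include "webp/decode.h"

static int DecodeRGBAWithAlphaDithering(const uint8_t* data, size_t data_size) {
  WebPDecoderConfig config;
  if (!WebPInitDecoderConfig(&config)) return 0;   // ABI version mismatch
  config.options.alpha_dithering_strength = 100;   // new field: 0 = off (default), 100 = full
  config.output.colorspace = MODE_RGBA;            // decode to RGBA so alpha is kept
  if (WebPDecode(data, data_size, &config) != VP8_STATUS_OK) return 0;
  // ... use config.output.u.RGBA.rgba with stride config.output.u.RGBA.stride ...
  WebPFreeDecBuffer(&config.output);
  return 1;
}

This mirrors what dwebp -alpha_dither does: it simply sets the strength to the maximum of 100, and the decoder silently disables it whenever the alpha plane was not marked as quantized (ALPHA_PREPROCESSED_LEVELS).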
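Algorithm note (illustrative): the smoothing is the threshold-averaging filter described in the quant_levels_dec.c header comment above. A sample is pulled towards the average of its surrounding window only when the difference is smaller than the minimum distance between two quantization levels actually present in the plane, so genuine level transitions survive. Stripped of the fixed-point arithmetic, the look-up table and the optional ordered dithering, the per-sample correction reduces to something like:

// 'v' is the decoded alpha sample, 'avg' the average over its surrounding
// window, 'min_dist' the smallest gap between two used alpha levels
// (>= 1 whenever the plane contains at least two levels).
static int CorrectSample(int v, int avg, int min_dist) {
  const int t1 = min_dist;             // at or beyond t1: keep the original value
  const int t2 = (3 * min_dist) / 4;   // up to t2: snap fully to the average
  const int diff = avg - v;
  int mag = (diff < 0) ? -diff : diff;
  if (mag >= t1) return v;                          // genuine level transition
  if (mag > t2) mag = t2 * (t1 - mag) / (t1 - t2);  // linear ramp from t2 down to 0
  return v + ((diff < 0) ? -mag : mag);
}

At the strength of 100 used by -alpha_dither (and by vwebp's default), the window radius is 4 * 100 / 100 = 4, i.e. a 9x9 averaging window. Samples equal to the plane's minimum or maximum level are never modified, and planes with two or fewer distinct levels are left untouched.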
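Implementation note (illustrative): the "sliding window" mentioned in the header comment is what keeps this pass cheap. VFilter and HFilter maintain running sums so that, once the window is primed, each output sample of the 2-D box average costs only a couple of additions and subtractions, independent of the radius. The same trick in one dimension, with edge samples simply clamped here (the patch replicates rows vertically and mirrors horizontally), looks like:

#include <stdint.h>

static int Clamp(int x, int max_value) {
  return (x < 0) ? 0 : (x > max_value) ? max_value : x;
}

// out[x] receives the sum of in[] over the window [x - radius, x + radius];
// dividing by (2 * radius + 1) gives the moving average. The patch folds the
// equivalent division into its fixed-point 'scale_' factor instead.
static void BoxSum1D(const uint16_t* in, int len, int radius, uint32_t* out) {
  uint32_t sum = 0;
  int x;
  for (x = -radius; x <= radius; ++x) sum += in[Clamp(x, len - 1)];  // prime window
  out[0] = sum;
  for (x = 1; x < len; ++x) {
    sum += in[Clamp(x + radius, len - 1)];       // sample entering the window
    sum -= in[Clamp(x - radius - 1, len - 1)];   // sample leaving the window
    out[x] = sum;
  }
}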