From 498d4dd634f052f9e7cde7f328ee96278a577c66 Mon Sep 17 00:00:00 2001
From: Vikas Arora <vikasa@google.com>
Date: Wed, 8 May 2013 17:19:04 -0700
Subject: [PATCH] WebP-Lossless encoding improvements.

Lossy (with Alpha) image compression gets 2.3X speedup.
Compressing lossless images is 20%-40% faster now.

Change-Id: I41f0225838b48ae5c60b1effd1b0de72fecb3ae6
(cherry picked from commit 8eae188a62bdd2dfef2266cfd34617a421c3266f)
---
 src/dsp/lossless.c            | 21 +++++++++
 src/dsp/lossless.h            |  3 ++
 src/enc/alpha.c               | 65 +++++++++++++++++++++------
 src/enc/backward_references.c | 76 +++++++++++++++----------------
 src/enc/vp8l.c                | 84 ++++++++++++++++++-----------------
 src/utils/filters.c           |  4 +-
 6 files changed, 159 insertions(+), 94 deletions(-)

diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c
index 080b3e63..c015b7ad 100644
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -1325,6 +1325,27 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
   }
 }
 
+// Bundles (1 << xbits) = 1, 2, 4 or 8 'row' entries into each 'dst' pixel.
+void VP8LBundleColorMap(const uint8_t* const row, int width,
+                        int xbits, uint32_t* const dst) {
+  int x;
+  if (xbits > 0) {
+    const int bit_depth = 1 << (3 - xbits);  // shift step between entries
+    const int mask = (1 << xbits) - 1;       // x & mask: slot inside a pixel
+    uint32_t code = 0xff000000;              // top byte is always 0xff
+    for (x = 0; x < width; ++x) {
+      const int xsub = x & mask;
+      if (xsub == 0) {  // first entry of a new output pixel: start over
+        code = 0xff000000;
+      }
+      code |= row[x] << (8 + bit_depth * xsub);  // entries start at bit 8
+      dst[x >> xbits] = code;  // stored on every entry, so a short tail is kept
+    }
+  } else {  // one entry per pixel, placed in bits 8..15
+    for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
+  }
+}
+
 //------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h
index 0ac4ecb8..6742bcc8 100644
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@@ -83,6 +83,9 @@ static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
   return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
 }
 
+void VP8LBundleColorMap(const uint8_t* const row, int width,
+                        int xbits, uint32_t* const dst);
+
 //------------------------------------------------------------------------------
 
 #if defined(__cplusplus) || defined(c_plusplus)
diff --git a/src/enc/alpha.c b/src/enc/alpha.c
index aadf88fe..60391c11 100644
--- a/src/enc/alpha.c
+++ b/src/enc/alpha.c
@@ -80,7 +80,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   config.lossless = 1;
   config.method = effort_level;  // impact is very small
   // Set a moderate default quality setting for alpha.
-  config.quality = 5.f * effort_level;
+  config.quality = 10.f * effort_level;
   assert(config.quality >= 0 && config.quality <= 100.f);
 
   ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
@@ -156,6 +156,25 @@ static void CopyPlane(const uint8_t* src, int src_stride,
   }
 }
 
+static int GetNumColors(const uint8_t* data, int width, int height,
+                        int stride) {  // Returns the count of distinct values.
+  int j;
+  int colors = 0;
+  uint8_t color[256] = { 0 };  // color[v] is set to 1 once value v is seen.
+
+  for (j = 0; j < height; ++j) {
+    int i;
+    const uint8_t* const p = data + j * stride;  // start of row j
+    for (i = 0; i < width; ++i) {
+      color[p[i]] = 1;
+    }
+  }
+  for (j = 0; j < 256; ++j) {  // tally the values that were marked
+    if (color[j] > 0) ++colors;
+  }
+  return colors;
+}
+
 static int EncodeAlpha(VP8Encoder* const enc,
                        int quality, int method, int filter,
                        int effort_level,
@@ -207,18 +226,32 @@ static int EncodeAlpha(VP8Encoder* const enc,
     VP8BitWriter bw;
     int test_filter;
     uint8_t* filtered_alpha = NULL;
+    int try_filter_none = (effort_level > 3);
 
-    // We always test WEBP_FILTER_NONE first.
-    ok = EncodeAlphaInternal(quant_alpha, width, height,
-                             method, WEBP_FILTER_NONE, reduce_levels,
-                             effort_level, NULL, &bw, pic->stats);
-    if (!ok) {
-      VP8BitWriterWipeOut(&bw);
-      goto End;
+    if (filter == WEBP_FILTER_FAST) {  // Quick estimate of the best candidate.
+      const int kMinColorsForFilterNone = 16;
+      const int kMaxColorsForFilterNone = 192;
+      const int num_colors = GetNumColors(quant_alpha, width, height, width);
+      // For low number of colors, NONE yields better compression.
+      filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
+               EstimateBestFilter(quant_alpha, width, height, width);
+      // For large number of colors, try FILTER_NONE in addition to the best
+      // filter as well.
+      if (num_colors > kMaxColorsForFilterNone) {
+        try_filter_none = 1;
+      }
     }
 
-    if (filter == WEBP_FILTER_FAST) {  // Quick estimate of a second candidate?
-      filter = EstimateBestFilter(quant_alpha, width, height, width);
+    // Test for WEBP_FILTER_NONE for higher effort levels.
+    if (try_filter_none || filter == WEBP_FILTER_NONE) {
+      ok = EncodeAlphaInternal(quant_alpha, width, height,
+                               method, WEBP_FILTER_NONE, reduce_levels,
+                               effort_level, NULL, &bw, pic->stats);
+
+      if (!ok) {
+        VP8BitWriterWipeOut(&bw);
+        goto End;
+      }
     }
     // Stop?
     if (filter == WEBP_FILTER_NONE) {
@@ -234,11 +267,14 @@ static int EncodeAlpha(VP8Encoder* const enc,
     // Try the other mode(s).
     {
       WebPAuxStats best_stats;
-      size_t best_score = VP8BitWriterSize(&bw);
+      size_t best_score = try_filter_none ?
+                          VP8BitWriterSize(&bw) : (size_t)~0U;
+      int wipe_tmp_bw = try_filter_none;
 
       memset(&best_stats, 0, sizeof(best_stats));  // prevent spurious warning
       if (pic->stats != NULL) best_stats = *pic->stats;
-      for (test_filter = WEBP_FILTER_HORIZONTAL;
+      for (test_filter =
+           try_filter_none ? WEBP_FILTER_HORIZONTAL : WEBP_FILTER_NONE;
            ok && (test_filter <= WEBP_FILTER_GRADIENT);
            ++test_filter) {
         VP8BitWriter tmp_bw;
@@ -262,7 +298,10 @@ static int EncodeAlpha(VP8Encoder* const enc,
         } else {
           VP8BitWriterWipeOut(&bw);
         }
-        VP8BitWriterWipeOut(&tmp_bw);
+        if (wipe_tmp_bw) {
+          VP8BitWriterWipeOut(&tmp_bw);
+        }
+        wipe_tmp_bw = 1;  // For next filter trial for WEBP_FILTER_BEST.
       }
       if (pic->stats != NULL) *pic->stats = best_stats;
     }
diff --git a/src/enc/backward_references.c b/src/enc/backward_references.c
index cf027875..67dd7e94 100644
--- a/src/enc/backward_references.c
+++ b/src/enc/backward_references.c
@@ -142,9 +142,10 @@ static void HashChainInsert(HashChain* const p,
 }
 
 static void GetParamsForHashChainFindCopy(int quality, int xsize,
-                                          int* window_size, int* iter_pos,
-                                          int* iter_limit) {
+                                          int cache_bits, int* window_size,
+                                          int* iter_pos, int* iter_limit) {
   const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
+  const int iter_neg = -iter_mult * (quality >> 1);
   // Limit the backward-ref window size for lower qualities.
   const int max_window_size = (quality > 50) ? WINDOW_SIZE
                             : (quality > 25) ? (xsize << 8)
@@ -152,77 +153,74 @@ static void GetParamsForHashChainFindCopy(int quality, int xsize,
   assert(xsize > 0);
   *window_size = (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE
                : max_window_size;
-  *iter_pos = 5 + (quality >> 3);
-  *iter_limit = -quality * iter_mult;
+  *iter_pos = 8 + (quality >> 3);
+  // For lower entropy images, the rigorous search loop in HashChainFindCopy
+  // can be relaxed.
+  *iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;
 }
 
 static int HashChainFindCopy(const HashChain* const p,
-                             int base_position, int xsize,
+                             int base_position, int xsize_signed,
                              const uint32_t* const argb, int maxlen,
                              int window_size, int iter_pos, int iter_limit,
                              int* const distance_ptr,
                              int* const length_ptr) {
-  const uint64_t hash_code = GetPixPairHash64(&argb[base_position]);
-  int prev_length = 0;
-  int64_t best_val = 0;
-  int best_length = 0;
-  int best_distance = 0;
   const uint32_t* const argb_start = argb + base_position;
+  uint64_t best_val = 0;
+  uint32_t best_length = 1;  // 1 so the quick reject below tests argb[pos]
+  uint32_t best_distance = 0;
+  const uint32_t xsize = (uint32_t)xsize_signed;  // unsigned for distance math
   const int min_pos =
       (base_position > window_size) ? base_position - window_size : 0;
   int pos;
-
   assert(xsize > 0);
-  for (pos = p->hash_to_first_index_[hash_code];
+  for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
        pos >= min_pos;
        pos = p->chain_[pos]) {
-    int64_t val;
-    int curr_length;
+    uint64_t val;
+    uint32_t curr_length;
+    uint32_t distance;
     if (iter_pos < 0) {
       if (iter_pos < iter_limit || best_val >= 0xff0000) {
         break;
       }
     }
     --iter_pos;
-    if (best_length != 0 &&
-        argb[pos + best_length - 1] != argb_start[best_length - 1]) {
+    if (argb[pos + best_length - 1] != argb_start[best_length - 1]) {
       continue;
     }
     curr_length = FindMatchLength(argb + pos, argb_start, maxlen);
-    if (curr_length < prev_length) {
+    if (curr_length < best_length) {  // shorter than the best match so far
       continue;
     }
-    val = 65536 * curr_length;
+    distance = (uint32_t)(base_position - pos);
+    val = curr_length << 16;  // length dominates; low bits carry the 2d bonus
     // Favoring 2d locality here gives savings for certain images.
-    if (base_position - pos < 9 * xsize) {
-      const int y = (base_position - pos) / xsize;
-      int x = (base_position - pos) % xsize;
-      if (x > xsize / 2) {
+    if (distance < 9 * xsize) {
+      const uint32_t y = distance / xsize;
+      uint32_t x = distance % xsize;
+      if (x > (xsize >> 1)) {
         x = xsize - x;
       }
-      if (x <= 7 && x >= -8) {
+      if (x <= 7) {  // x is unsigned, so no lower-bound test is needed
+        val += 9 * 9 + 9 * 9;  // bonus; y*y + x*x below is at most 8*8 + 7*7
         val -= y * y + x * x;
-      } else {
-        val -= 9 * 9 + 9 * 9;
       }
-    } else {
-      val -= 9 * 9 + 9 * 9;
     }
     if (best_val < val) {
-      prev_length = curr_length;
       best_val = val;
       best_length = curr_length;
-      best_distance = base_position - pos;
+      best_distance = distance;
       if (curr_length >= MAX_LENGTH) {
         break;
       }
-      if ((best_distance == 1 || best_distance == xsize) &&
+      if ((best_distance == 1 || distance == xsize) &&
           best_length >= 128) {
         break;
       }
     }
   }
-  *distance_ptr = best_distance;
+  *distance_ptr = (int)best_distance;
   *length_ptr = best_length;
   return (best_length >= MIN_LENGTH);
 }
@@ -284,8 +282,8 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
   if (!HashChainInit(hash_chain, pix_count)) goto Error;
 
   refs->size = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, &window_size, &iter_pos,
-                                &iter_limit);
+  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
+                                &window_size, &iter_pos, &iter_limit);
   for (i = 0; i < pix_count; ) {
     // Alternative#1: Code the pixels starting at 'i' using backward reference.
     int offset = 0;
@@ -510,8 +508,8 @@ static int BackwardReferencesHashChainDistanceOnly(
   // We loop one pixel at a time, but store all currently best points to
   // non-processed locations from this point.
   dist_array[0] = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, &window_size, &iter_pos,
-                                &iter_limit);
+  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
+                                &window_size, &iter_pos, &iter_limit);
   for (i = 0; i < pix_count; ++i) {
     double prev_cost = 0.0;
     int shortmax;
@@ -645,8 +643,8 @@ static int BackwardReferencesHashChainFollowChosenPath(
   }
 
   refs->size = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, &window_size, &iter_pos,
-                                &iter_limit);
+  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
+                                &window_size, &iter_pos, &iter_limit);
   for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
     int offset = 0;
     int len = 0;
@@ -785,7 +783,9 @@ int VP8LGetBackwardReferences(int width, int height,
     *best = refs_lz77;   // default guess: lz77 is better
     VP8LClearBackwardRefs(&refs_rle);
     if (try_lz77_trace_backwards) {
-      const int recursion_level = (num_pix < 320 * 200) ? 1 : 0;
+      // Use recursion level 1 only for small images that use a color cache.
+      const int recursion_level =
+          (num_pix < 320 * 200) && (cache_bits > 0) ? 1 : 0;
       VP8LBackwardRefs refs_trace;
       if (!VP8LBackwardRefsAlloc(&refs_trace, num_pix)) {
         goto End;
diff --git a/src/enc/vp8l.c b/src/enc/vp8l.c
index 5077167b..8af544ff 100644
--- a/src/enc/vp8l.c
+++ b/src/enc/vp8l.c
@@ -811,27 +811,6 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
   return err;
 }
 
-// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
-static void BundleColorMap(const uint8_t* const row, int width,
-                           int xbits, uint32_t* const dst) {
-  int x;
-  if (xbits > 0) {
-    const int bit_depth = 1 << (3 - xbits);
-    const int mask = (1 << xbits) - 1;
-    uint32_t code = 0xff000000;
-    for (x = 0; x < width; ++x) {
-      const int xsub = x & mask;
-      if (xsub == 0) {
-        code = 0xff000000;
-      }
-      code |= row[x] << (8 + bit_depth * xsub);
-      dst[x >> xbits] = code;
-    }
-  } else {
-    for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
-  }
-}
-
 // Note: Expects "enc->palette_" to be set properly.
 // Also, "enc->palette_" will be modified after this call and should not be used
 // later.
@@ -848,6 +827,7 @@ static WebPEncodingError ApplyPalette(VP8LBitWriter* const bw,
   const int palette_size = enc->palette_size_;
   uint8_t* row = NULL;
   int xbits;
+  int is_alpha = 1;
 
   // Replace each input pixel by corresponding palette index.
   // This is done line by line.
@@ -864,19 +844,43 @@ static WebPEncodingError ApplyPalette(VP8LBitWriter* const bw,
   row = WebPSafeMalloc((uint64_t)width, sizeof(*row));
   if (row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
 
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      const uint32_t pix = src[x];
-      for (i = 0; i < palette_size; ++i) {
-        if (pix == palette[i]) {
-          row[x] = i;
-          break;
+  // The green-only lookup below is exact only if A is shared and R = B = 0.
+  for (i = 0; is_alpha && i < palette_size; ++i) {
+    if ((palette[i] & 0xffff00ffu) != (palette[0] & 0xff000000u)) {
+      is_alpha = 0;
+    }
+  }
+
+  if (is_alpha) {
+    int inv_palette[MAX_PALETTE_SIZE] = { 0 };
+    for (i = 0; i < palette_size; ++i) {
+      const int color = (palette[i] >> 8) & 0xff;
+      inv_palette[color] = i;
+    }
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        const int color = (src[x] >> 8) & 0xff;
+        row[x] = inv_palette[color];
+      }
+      VP8LBundleColorMap(row, width, xbits, dst);
+      src += pic->argb_stride;
+      dst += enc->current_width_;
+    }
+  } else {
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        const uint32_t pix = src[x];
+        for (i = 0; i < palette_size; ++i) {
+          if (pix == palette[i]) {
+            row[x] = i;
+            break;
+          }
         }
       }
+      VP8LBundleColorMap(row, width, xbits, dst);
+      src += pic->argb_stride;
+      dst += enc->current_width_;
     }
-    BundleColorMap(row, width, xbits, dst);
-    src += pic->argb_stride;
-    dst += enc->current_width_;
   }
 
   // Save palette to bitstream.
@@ -899,13 +903,10 @@ static WebPEncodingError ApplyPalette(VP8LBitWriter* const bw,
 
 // -----------------------------------------------------------------------------
 
-static int GetHistoBits(const WebPConfig* const config,
-                        const WebPPicture* const pic) {
-  const int width = pic->width;
-  const int height = pic->height;
+static int GetHistoBits(int method, int use_palette, int width, int height) {
   const uint64_t hist_size = sizeof(VP8LHistogram);
   // Make tile size a function of encoding method (Range: 0 to 6).
-  int histo_bits = 7 - config->method;
+  int histo_bits = (use_palette ? 9 : 7) - method;
   while (1) {
     const uint64_t huff_image_size = VP8LSubSampleSize(width, histo_bits) *
                                      VP8LSubSampleSize(height, histo_bits) *
@@ -917,13 +918,14 @@ static int GetHistoBits(const WebPConfig* const config,
          (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
 }
 
-static void InitEncParams(VP8LEncoder* const enc) {
+static void FinishEncParams(VP8LEncoder* const enc) {
   const WebPConfig* const config = enc->config_;
-  const WebPPicture* const picture = enc->pic_;
+  const WebPPicture* const pic = enc->pic_;
   const int method = config->method;
   const float quality = config->quality;
+  const int use_palette = enc->use_palette_;
   enc->transform_bits_ = (method < 4) ? 5 : (method > 4) ? 3 : 4;
-  enc->histo_bits_ = GetHistoBits(config, picture);
+  enc->histo_bits_ = GetHistoBits(method, use_palette, pic->width, pic->height);
   enc->cache_bits_ = (quality <= 25.f) ? 0 : 7;
 }
 
@@ -965,8 +967,6 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     goto Error;
   }
 
-  InitEncParams(enc);
-
   // ---------------------------------------------------------------------------
   // Analyze image (entropy, num_palettes etc)
 
@@ -975,6 +975,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     goto Error;
   }
 
+  FinishEncParams(enc);
+
   if (enc->use_palette_) {
     err = ApplyPalette(bw, enc, quality);
     if (err != VP8_ENC_OK) goto Error;
diff --git a/src/utils/filters.c b/src/utils/filters.c
index 9486355a..ad847746 100644
--- a/src/utils/filters.c
+++ b/src/utils/filters.c
@@ -154,8 +154,7 @@ static void GradientUnfilter(int width, int height, int stride, uint8_t* data) {
 #undef SANITY_CHECK
 
 // -----------------------------------------------------------------------------
-// Quick estimate of a potentially interesting filter mode to try, in addition
-// to the default NONE.
+// Quick estimate of a potentially interesting filter mode to try.
 
 #define SMAX 16
 #define SDIFF(a, b) (abs((a) - (b)) >> 4)   // Scoring diff, in [0..SMAX)
@@ -165,6 +164,7 @@ WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
   int i, j;
   int bins[WEBP_FILTER_LAST][SMAX];
   memset(bins, 0, sizeof(bins));
+
   // We only sample every other pixels. That's enough.
   for (j = 2; j < height - 1; j += 2) {
     const uint8_t* const p = data + j * stride;