Add Alpha encode/decode code.

Add code for Alpha encoding & decoding. The alpha compression is done via backward reference counts encoded with Arithmetic encoder (TCoder). Also provided is lossy Alpha pre-processing option via level-quantizations using kNN heuristic. Change-Id: Ib6b13530c1a4ab6493edcb586ad29fe242bc1766
2025-08-29 15:22:12 +02:00 · 2011-11-30 17:46:45 +05:30
parent afc4c5d695
commit e1947a9299
9 changed files with 1284 additions and 15 deletions
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -175,8 +175,11 @@ X_OBJS= \
    $(DIROBJ)\enc\tree.obj \
    $(DIROBJ)\enc\webpenc.obj \
    $(DIROBJ)\mux\mux.obj \
    $(DIROBJ)\utils\alpha.obj \
    $(DIROBJ)\utils\bit_reader.obj \
    $(DIROBJ)\utils\bit_writer.obj \
    $(DIROBJ)\utils\quant_levels.obj \
    $(DIROBJ)\utils\tcoder.obj \
    $(DIROBJ)\utils\thread.obj \
    $(RESOURCE) \
--- a/makefile.unix
+++ b/makefile.unix
@@ -54,6 +54,8 @@ ARFLAGS = r
 CC = gcc -Isrc/ -Iexamples/ -Wall
 CFLAGS = -O3 -DNDEBUG $(EXTRA_FLAGS)
 INSTALL = install
 GROFF = /usr/bin/groff
 COL = /usr/bin/col
 LDFLAGS = $(EXTRA_LIBS) -lm
 DEC_OBJS = src/dec/frame.o src/dec/webp.o src/dec/quant.o src/dec/tree.o \
@@ -68,7 +70,8 @@ DSP_OBJS = src/dsp/cpu.o src/dsp/enc.o \
           src/dsp/enc_sse2.o src/dsp/dec.o src/dsp/dec_sse2.o \
           src/dsp/dec_neon.o src/dsp/upsampling.o src/dsp/upsampling_sse2.o \
           src/dsp/yuv.o
-UTILS_OBJS = src/utils/bit_reader.o src/utils/bit_writer.o src/utils/thread.o
+UTILS_OBJS = src/utils/alpha.o src/utils/bit_reader.o src/utils/bit_writer.o \
             src/utils/quant_levels.o src/utils/thread.o src/utils/tcoder.o
 OBJS = $(DEC_OBJS) $(ENC_OBJS) $(DSP_OBJS) $(UTILS_OBJS)
@@ -77,10 +80,13 @@ MUX_OBJS = src/mux/mux.o
 HDRS = src/webp/encode.h src/enc/vp8enci.h src/enc/cost.h src/webp/mux.h \
       src/dec/vp8i.h  \
       src/dsp/yuv.h src/dsp/dsp.h \
-       src/utils/bit_writer.h src/utils/bit_reader.h src/utils/thread.h
+       src/utils/alpha.h src/utils/bit_writer.h src/utils/bit_reader.h \
       src/utils/thread.h src/utils/tcoder.h
-OUTPUT = examples/cwebp examples/dwebp examples/webpmux \
+OUT_LIBS = src/libwebp.a src/mux/libwebpmux.a
-	 src/libwebp.a src/mux/libwebpmux.a
+OUT_EXAMPLES = examples/cwebp examples/dwebp examples/webpmux
 OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
 all:ex
@@ -93,29 +99,29 @@ src/libwebp.a:  $(OBJS)
 src/mux/libwebpmux.a:  $(MUX_OBJS)
 	$(AR) $(ARFLAGS) $@ $^
-ex: examples/cwebp examples/dwebp examples/webpmux
+ex: $(OUT_EXAMPLES)
 examples/cwebp: examples/cwebp.o src/libwebp.a
 examples/dwebp: examples/dwebp.o src/libwebp.a
 examples/webpmux: examples/webpmux.o src/mux/libwebpmux.a src/libwebp.a
-examples/cwebp examples/dwebp examples/webpmux:
+
 $(OUT_EXAMPLES):
 	$(CC) -o $@ $^ $(LDFLAGS)
 dist: DESTDIR := dist
 dist: all
 	$(INSTALL) -m755 -d $(DESTDIR)/include/webp \
-	    $(DESTDIR)/doc $(DESTDIR)/lib
+             $(DESTDIR)/doc $(DESTDIR)/lib
-	$(INSTALL) -m755 -s examples/cwebp examples/dwebp examples/webpmux \
+	$(INSTALL) -m755 -s $(OUT_EXAMPLES) $(DESTDIR)
 	    $(DESTDIR)
 	$(INSTALL) -m644 src/webp/*.h $(DESTDIR)/include/webp
 	$(INSTALL) -m644 src/libwebp.a $(DESTDIR)/lib
 	umask 022; \
 	for m in man/[cd]webp.1; do \
 	  basenam=$$(basename $$m .1); \
-	  /usr/bin/groff -t -e -man -T utf8 $$m \
+	  $(GROFF) -t -e -man -T utf8 $$m \
-	    | col -bx >$(DESTDIR)/doc/$${basenam}.txt; \
+	    | $(COL) -bx >$(DESTDIR)/doc/$${basenam}.txt; \
-	  /usr/bin/groff -t -e -man -T html $$m \
+	  $(GROFF) -t -e -man -T html $$m \
-	    | col -bx >$(DESTDIR)/doc/$${basenam}.html; \
+	    | $(COL) -bx >$(DESTDIR)/doc/$${basenam}.html; \
 	done
 clean:
--- a/src/utils/Makefile.am
+++ b/src/utils/Makefile.am
@@ -1,10 +1,12 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src
-libwebputils_la_SOURCES = bit_reader.h bit_reader.c \
+libwebputils_la_SOURCES = alpha.h alpha.c \
                          bit_reader.h bit_reader.c \
                          bit_writer.h bit_writer.c \
                          quant_levels.c \
                          tcoder.h tcoderi.h tcoder.c \
                          thread.h thread.c
 libwebputils_la_LDFLAGS = -version-info 0:0:0
 libwebputils_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
 libwebputilsinclude_HEADERS = ../webp/types.h
 libwebputilsincludedir = $(includedir)/webp
--- a/src/utils/alpha.c
+++ b/src/utils/alpha.c
@@ -0,0 +1,432 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Alpha plane encoding and decoding library.
 //
 // Author: vikasa@google.com (Vikas Arora)
 #include <string.h>  // for memcpy()
 #include "./alpha.h"
 #include "./bit_reader.h"
 #include "./bit_writer.h"
 #include "./tcoder.h"
 #define MAX_SYMBOLS      255
 #define ALPHA_HEADER_LEN 2
 // -----------------------------------------------------------------------------
 // Alpha Encode.
 // Method 0 ("no compression") encoder: hands back a heap copy of the
 // 'width' x 'height' plane 'data'.
 // On success '*output' points to the copy (to be released with free()),
 // '*output_size' holds its byte count and 1 is returned. Returns 0, leaving
 // both outputs untouched, when 'data' is NULL or the allocation fails.
 static int EncodeIdent(const uint8_t* data, int width, int height,
                        uint8_t** output, size_t* output_size) {
   const size_t num_bytes = height * width;
   uint8_t* copy;
   assert((output != NULL) && (output_size != NULL));
   if (data == NULL) return 0;
   copy = (uint8_t*)malloc(num_bytes);
   if (copy == NULL) return 0;
   memcpy(copy, data, num_bytes);
   *output = copy;
   *output_size = num_bytes;
   return 1;
 }
 // -----------------------------------------------------------------------------
 // Zlib-like encoding using TCoder
 // One element of the parsed message: either a single literal byte
 // (dist == 0, value in 'literal', len == 1) or a backward reference that
 // copies 'len' bytes starting 'dist' bytes before the current position.
 typedef struct {
  int dist;        // backward distance (=0 means: literal)
  int literal;     // literal value (if dist = 0)
  size_t len;      // length of matched string for non-literal
 } Token;
 // Shortest match coded as a backward reference. Match lengths are coded as
 // 'len - MIN_LEN'.
 #define MIN_LEN 2
 #define DEFER_SKIP 1      // for deferred evaluation (0 = off)
 // Cost of coding literal 'c' with 'coder', memoized in the local array
 // 'cost_cache[]' (an entry equal to 0. means "not computed yet").
 // The expansion reads 'cost_cache' and 'lit_mode_cost' from the enclosing
 // scope, so both must be declared where the macro is used.
 #define CACHED_COST(coder, c) ((cost_cache[(c)] == 0.) ?             \
  (cost_cache[(c)] = lit_mode_cost + TCoderSymbolCost((coder), (c))) \
  : cost_cache[(c)])
 // Record symbol
 // Feeds TOKEN to the coders with a NULL bit-writer: the distance always goes
 // to 'coderd', then either the literal goes to 'coder' (dist == 0) or
 // 'len - MIN_LEN' goes to 'coderl'. The three coders are taken from the
 // enclosing scope. The macro expands to a bare { } block, which is why the
 // call sites do not put a ';' after it.
 #define RECORD(TOKEN) {                                       \
  TCoderEncode(coderd, (TOKEN)->dist, NULL);                  \
  if ((TOKEN)->dist == 0) {                                   \
    TCoderEncode(coder, (TOKEN)->literal, NULL);              \
  } else {                                                    \
    TCoderEncode(coderl, (TOKEN)->len - MIN_LEN, NULL);       \
  }                                                           \
 }
 // Returns how many leading bytes of 'data' and 'ref' are identical, looking
 // at no more than 'max_len' bytes of each.
 static size_t GetLongestMatch(const uint8_t* const data,
                               const uint8_t* const ref, size_t max_len) {
   size_t matched = 0;
   while (matched < max_len && data[matched] == ref[matched]) {
     ++matched;
   }
   return matched;
 }
 // Method 1 encoder: zlib-like coding of the 'width' x 'height' plane 'data'
 // as a sequence of literals and backward references (see Token).
 // Three adaptive coders are used: 'coder' for literal values, 'coderd' for
 // distances (0 flags a literal) and 'coderl' for match lengths minus MIN_LEN.
 // The work is done in two passes:
 //   1. parse 'data' into msg[], choosing between literal and match by
 //      comparing TCoderSymbolCost() estimates; every finalized token is
 //      passed through RECORD() (NULL bit-writer) so that later estimates see
 //      the symbols coded so far;
 //   2. re-initialize the coders and encode msg[] into a VP8BitWriter.
 // On success returns 1 with '*output' / '*output_size' set to the bit-writer
 // buffer and size (EncodeAlpha releases that buffer with free()). Returns 0
 // if any allocation made here fails.
 static int EncodeZlibTCoder(uint8_t* data, int width, int height,
                            uint8_t** output, size_t* output_size) {
  int ok = 0;
  const size_t data_size = width * height;
  // Search window and longest codable match, both proportional to the width.
  const size_t MAX_DIST = 3 * width;
  const size_t MAX_LEN = 2 * width;
  // Worst case is one token per input byte.
  Token* const msg = (Token*)malloc(data_size * sizeof(*msg));
  int num_tokens;
  TCoder* const coder = TCoderNew(MAX_SYMBOLS);
  TCoder* const coderd = TCoderNew(MAX_DIST);
  TCoder* const coderl = TCoderNew(MAX_LEN - MIN_LEN);
  if (coder == NULL || coderd == NULL || coderl == NULL) {
    goto End;
  }
  if (msg == NULL) {
    goto End;
  }
  {
    // Non-zero while the last token in msg[] is still provisional.
    int deferred_eval = 0;
    size_t n = 0;
    num_tokens = 0;
    while (n < data_size) {
      const double lit_mode_cost = TCoderSymbolCost(coderd, 0);
      // Rebuilt for every position: costs change each time a token is
      // recorded.
      double cost_cache[MAX_SYMBOLS + 1] = { 0. };
      Token best;
      size_t dist = 0;
      double best_cost = CACHED_COST(coder, data[n]);
      size_t max_len = MAX_LEN;
      if (max_len > data_size - n) {
        max_len = data_size - n;
      }
      // Default proposal: code data[n] as a literal.
      best.dist = 0;
      best.literal = data[n];
      best.len = 1;
      for (dist = 1; dist <= MAX_DIST && dist <= n; ++dist) {
        const int pos = n - dist;
        const size_t min_len = best.len - 1;
        size_t len;
        // Early out: we probe at two locations for a quick match check
        if (data[pos] != data[n] ||
            data[pos + min_len] != data[n + min_len]) {
          continue;
        }
        len = GetLongestMatch(data + pos, data + n, max_len);
        if (len >= MIN_LEN && len >= best.len) {
          // This is the cost of the coding proposal
          const double cost = TCoderSymbolCost(coderl, len - MIN_LEN)
                            + TCoderSymbolCost(coderd, dist);
          // We're gaining an extra len-best.len coded message over the last
          // known best. Compute how this would have cost if coded all literal.
          // (TODO: we should fully re-evaluate at position best.len and not
          // assume all is going be coded as literals. But it's at least an
          // upper-bound (worst-case coding). Deferred evaluation used below
          // partially addresses this.
          double lit_cost = 0;
          size_t i;
          for (i = best.len; i < len; ++i) {
            lit_cost += CACHED_COST(coder, data[n + i]);
          }
          // So, is it worth ?
          if (best_cost + lit_cost >= cost) {
            best_cost = cost;
            best.len = len;
            best.dist = dist;
          }
        }
        if (len >= MAX_LEN) {
          break;  // No need to search further. We already got a max-long match
        }
      }
      // Deferred evaluation: before finalizing a choice we try to find
      // best cost at position n + 1 and see if we get a longer
      // match than current best. If so, we transform the current match
      // into a literal, go to position n + 1, and try again.
      {
        Token* cur = &msg[num_tokens];
        int forget = 0;
        if (deferred_eval) {
          // Step back to the provisional token stored at the previous position.
          --cur;
          // If the next match isn't longer, keep previous match
          if (best.len <= cur->len) {
            deferred_eval = 0;
            // 'n' was already advanced by DEFER_SKIP when the match was
            // deferred; skip the remainder of it.
            n += cur->len - DEFER_SKIP;
            forget = 1;   // forget the new match
            RECORD(cur)
          } else {   // else transform previous match into a shorter one
            cur->len = DEFER_SKIP;
            if (DEFER_SKIP == 1) {
              cur->dist = 0;    // literal
            }
            // TODO(later): RECORD() macro should be changed to take an extra
            // "is_final" param, so that we could write the bitstream at once.
            RECORD(cur)
            ++cur;
          }
        }
        if (!forget) {
          *cur = best;
          ++num_tokens;
          if (DEFER_SKIP > 0) {
            // Only mid-sized matches are worth a second look.
            deferred_eval = (cur->len > 2) && (cur->len < MAX_LEN / 2);
          }
          if (deferred_eval) {
            // will probe at a later position before finalizing.
            n += DEFER_SKIP;
          } else {
            // Keep the current choice.
            n += cur->len;
            RECORD(cur)
          }
        }
      }
    }
  }
  // Final bitstream assembly.
  {
    int n;
    VP8BitWriter bw;
    VP8BitWriterInit(&bw, 0);
    // Restart the coders so that the tokens are coded from the same initial
    // state the decoder starts with.
    TCoderInit(coder);
    TCoderInit(coderd);
    TCoderInit(coderl);
    for (n = 0; n < num_tokens; ++n) {
      const Token* const t = &msg[n];
      const int is_literal = (t->dist == 0);
      TCoderEncode(coderd, t->dist, &bw);
      if (is_literal) {  // literal
        TCoderEncode(coder, t->literal, &bw);
      } else {
        TCoderEncode(coderl, t->len - MIN_LEN, &bw);
      }
    }
    // clean up
    // NOTE(review): the bit-writer's error state is not inspected here --
    // confirm whether VP8BitWriterBuf() can return NULL after a failed resize.
    VP8BitWriterFinish(&bw);
    *output = VP8BitWriterBuf(&bw);
    *output_size = VP8BitWriterSize(&bw);
    ok = 1;
  }
 End:
  if (coder) TCoderDelete(coder);
  if (coderl) TCoderDelete(coderl);
  if (coderd) TCoderDelete(coderd);
  free(msg);
  return ok;
 }
 // -----------------------------------------------------------------------------
 // Compresses the alpha plane 'data' ('height' rows of 'width' samples, rows
 // 'stride' bytes apart) into a newly allocated buffer.
 //   quality: [0, 100]; values below 100 first quantize the plane (lossy).
 //   method:  0 = raw copy, 1 = zlib-like coding with TCoder.
 // The result starts with ALPHA_HEADER_LEN bytes (method, reserved 0) followed
 // by the compressed payload. On success returns 1 and stores the buffer in
 // '*output' (release it with free()) and its length in '*output_size'.
 // Returns 0, leaving the outputs untouched, on NULL pointers, dimensions
 // outside [1, 16383], stride < width, out-of-range quality/method, or when
 // quantization, compression or an allocation fails.
 int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
                 int quality, int method,
                 uint8_t** output, size_t* output_size) {
   const int kMaxImageDim = (1 << 14) - 1;
   uint8_t* compressed_alpha = NULL;
   uint8_t* quant_alpha = NULL;
   uint8_t* out = NULL;
   size_t compressed_size = 0;
   size_t data_size;
   float mse = 0.0;
   int ok = 0;
   int h;
   if ((data == NULL) || (output == NULL) || (output_size == NULL)) {
     return 0;
   }
   if (width <= 0 || width > kMaxImageDim ||
       height <= 0 || height > kMaxImageDim || stride < width) {
     return 0;
   }
   if (quality < 0 || quality > 100) {
     return 0;
   }
   if (method < 0 || method > 1) {
     return 0;
   }
   // Only computed once the dimensions are known to be sane, so the product
   // cannot overflow.
   data_size = (size_t)width * height;
   quant_alpha = (uint8_t*)malloc(data_size);
   if (quant_alpha == NULL) {
     return 0;
   }
   // Extract the alpha data (WidthXHeight) from raw_data (StrideXHeight).
   // Offsets are computed as size_t: 'h * stride' can exceed INT_MAX when the
   // caller uses a large stride.
   for (h = 0; h < height; ++h) {
     memcpy(quant_alpha + (size_t)h * width, data + (size_t)h * stride,
            width * sizeof(*data));
   }
   if (quality < 100) {  // No Quantization required for 'quality = 100'.
     // 16 Alpha levels gives quite a low MSE w.r.t Original Alpha plane hence
     // mapped to moderate quality 70. Hence Quality:[0, 70] -> Levels:[2, 16]
     // and Quality:]70, 100] -> Levels:]16, 256].
     const int alpha_levels = (quality <= 70) ?
                              2 + quality / 5 :
                              16 + (quality - 70) * 8;
     ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, &mse);
     if (!ok) {
       free(quant_alpha);
       return 0;
     }
   }
   if (method == 0) {
     ok = EncodeIdent(quant_alpha, width, height,
                      &compressed_alpha, &compressed_size);
   } else {
     ok = EncodeZlibTCoder(quant_alpha, width, height,
                           &compressed_alpha, &compressed_size);
   }
   free(quant_alpha);
   // A sub-encoder that reports success without a buffer is treated as a
   // failure rather than handing NULL to memcpy() below.
   if (!ok || compressed_alpha == NULL) {
     free(compressed_alpha);
     return 0;
   }
   out = (uint8_t*)malloc(compressed_size + ALPHA_HEADER_LEN);
   if (out == NULL) {
     free(compressed_alpha);
     return 0;
   }
   // Alpha bit-stream Header:
   // Byte0: Compression Method.
   // Byte1: Reserved for later extension.
   out[0] = method & 0xff;
   out[1] = 0;  // Reserved Byte.
   memcpy(out + ALPHA_HEADER_LEN, compressed_alpha, compressed_size);
   free(compressed_alpha);
   *output = out;
   *output_size = ALPHA_HEADER_LEN + compressed_size;
   return 1;
 }
 // -----------------------------------------------------------------------------
 // Alpha Decode.
 // Method 0 decoder: the payload is the plane itself, so 'data_size' bytes
 // are copied verbatim from 'data' to 'output'. The caller must guarantee
 // that 'output' has room for 'data_size' bytes. Always returns 1.
 static int DecodeIdent(const uint8_t* data, size_t data_size,
                        uint8_t* output) {
   assert(data != NULL);
   assert(output != NULL);
   memcpy(output, data, data_size);
   return 1;
 }
 // Method 1 decoder: rebuilds 'output_size' bytes into 'output' from the
 // TCoder bit-stream 'data' / 'data_size'. The stream is a sequence of
 // distances; distance 0 is followed by a literal byte, any other distance
 // by a length (coded minus MIN_LEN) of bytes to copy from 'dist' bytes back.
 // 'width' fixes the alphabet sizes of the distance and length coders and
 // must match the value used by the encoder; 'height' is unused.
 // Returns 1 when the whole output was reconstructed, 0 if a coder could not
 // be allocated or the stream holds a reference that starts before the
 // beginning of 'output' or runs past its end.
 static int DecompressZlibTCoder(const uint8_t* data, size_t data_size,
                                 int width, int height,
                                 uint8_t* output, size_t output_size) {
   int ok = 0;   // pessimistic: only set once decoding ran to completion
   const size_t MAX_DIST = 3 * width;
   const size_t MAX_LEN = 2 * width;
   TCoder* const coder = TCoderNew(MAX_SYMBOLS);
   TCoder* const coderd = TCoderNew(MAX_DIST);
   TCoder* const coderl = TCoderNew(MAX_LEN - MIN_LEN);
   if (coder == NULL || coderd == NULL || coderl == NULL) {
     goto End;
   }
   (void)height;     // unused parameter
   {
     size_t pos = 0;
     VP8BitReader br;
     VP8InitBitReader(&br, data, data + data_size);
     while (pos < output_size) {
       const int dist = TCoderDecode(coderd, &br);
       if (dist == 0) {
         const int literal = TCoderDecode(coder, &br);
         output[pos] = literal;
         ++pos;
       } else {
         const int len = MIN_LEN + TCoderDecode(coderl, &br);
         int k;
         // The copy source must lie inside what has been decoded so far,
         // otherwise we would read before the start of 'output'.
         if (dist < 0 || (size_t)dist > pos) goto End;
         // 'pos < output_size' holds here, so the subtraction cannot wrap.
         if (len < MIN_LEN || (size_t)len > output_size - pos) goto End;
         // Byte-by-byte on purpose: source and destination overlap when
         // dist < len.
         for (k = 0; k < len; ++k) {
           output[pos + k] = output[pos + k - dist];
         }
         pos += len;
       }
     }
   }
   ok = 1;
  End:
   if (coder) TCoderDelete(coder);
   if (coderl) TCoderDelete(coderl);
   if (coderd) TCoderDelete(coderd);
   return ok;
 }
 // -----------------------------------------------------------------------------
 // Decodes the compressed alpha chunk 'data' / 'data_size' into 'output',
 // a caller-provided buffer of 'height' rows spaced 'stride' bytes apart, of
 // which the first 'width' bytes of each row are written.
 // The chunk starts with ALPHA_HEADER_LEN bytes: byte 0 is the compression
 // method (0 = raw, 1 = zlib-like TCoder), byte 1 is reserved.
 // Returns 1 on success. Returns 0 without touching 'output' when a pointer
 // is NULL, the chunk is not longer than its header, the dimensions are
 // invalid, the method is unknown, a raw payload is shorter than the plane,
 // an allocation fails or the payload cannot be decoded.
 int DecodeAlpha(const uint8_t* data, size_t data_size,
                 int width, int height, int stride,
                 uint8_t* output) {
   uint8_t* decoded_data = NULL;
   size_t decoded_size;
   int ok = 0;
   int method;
   if (data == NULL || output == NULL) {
     return 0;
   }
   if (data_size <= ALPHA_HEADER_LEN) {
     return 0;
   }
   if (width <= 0 || height <= 0 || stride < width) {
     return 0;
   }
   // Refuse planes whose byte count does not fit in a size_t.
   if ((size_t)height > ((size_t)-1) / ((size_t)width)) {
     return 0;
   }
   decoded_size = (size_t)width * (size_t)height;
   method = data[0];
   if (method > 1) {
     return 0;
   }
   data_size -= ALPHA_HEADER_LEN;
   data += ALPHA_HEADER_LEN;
   // A raw payload must cover the whole plane; otherwise part of the output
   // would be filled with uninitialized memory.
   if (method == 0 && data_size < decoded_size) {
     return 0;
   }
   decoded_data = (uint8_t*)malloc(decoded_size);
   if (decoded_data == NULL) {
     return 0;
   }
   if (method == 0) {
     // Copy exactly one plane: never more than 'decoded_data' can hold, even
     // if the chunk carries trailing bytes.
     ok = DecodeIdent(data, decoded_size, decoded_data);
   } else {
     ok = DecompressZlibTCoder(data, data_size, width, height,
                               decoded_data, decoded_size);
   }
   if (ok) {
     // Construct raw_data (HeightXStride) from the alpha data (HeightXWidth).
     // Offsets use size_t since 'h * stride' may not fit in an int.
     int h;
     for (h = 0; h < height; ++h) {
       memcpy(output + (size_t)h * stride, decoded_data + (size_t)h * width,
              width * sizeof(*data));
     }
   }
   free(decoded_data);
   return ok;
 }
--- a/src/utils/alpha.h
+++ b/src/utils/alpha.h
@@ -0,0 +1,68 @@
 // Copyright 2011 Google Inc.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Alpha plane encoding and decoding library.
 //
 // Author: vikasa@google.com (Vikas Arora)
 #ifndef WEBP_UTILS_ALPHA_H_
 #define WEBP_UTILS_ALPHA_H_
 #include <stdlib.h>
 #include "../webp/types.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 // Encodes the given Alpha data 'data' of size 'stride'x'height' via specified
 // compression method 'method'. The pre-processing (Quantization) is
 // performed if 'quality' is less than 100. For such cases, the encoding is
 // lossy. Valid ranges for 'quality' is [0, 100] and 'method' is [0, 1]:
 //   'method = 0' - No compression;
 //   'method = 1' - zlib-like backward references coded with TCoder;
 // 'width' and 'height' must both be in [1, 16383] and 'stride' must not be
 // smaller than 'width'.
 // 'output' corresponds to the buffer containing compressed Alpha data.
 //          This buffer is allocated by this method and caller should call
 //          free(*output) when done. It starts with a 2-byte header
 //          (compression method, reserved byte).
 // 'output_size' corresponds to size of this compressed Alpha buffer.
 //
 // Returns 1 on successfully encoding the Alpha and
 //         0 if either:
 //           data, output or output_size is NULL, or
 //           inappropriate width, height or stride, or
 //           invalid quality or method, or
 //           quantization or compression fails, or
 //           Memory allocation for the compressed data fails.
 int EncodeAlpha(const uint8_t* data, int width, int height, int stride,
                 int quality, int method,
                 uint8_t** output, size_t* output_size);
 // Decodes the compressed data 'data' of size 'data_size' into the 'output'.
 // The 'output' buffer should be pre-allocated and must be of the same
 // dimension 'height'x'stride', as that of the image.
 //
 // Returns 1 on successfully decoding the compressed Alpha and
 //         0 if either:
 //           data or output is NULL, or
 //           data_size does not exceed the 2-byte header, or
 //           inappropriate width, height or stride, or
 //           Error in bit-stream header (invalid compression method), or
 //           Memory allocation for the decoded plane fails, or
 //           Error returned by appropriate compression method.
 int DecodeAlpha(const uint8_t* data, size_t data_size,
                 int width, int height, int stride, uint8_t* output);
 // Replace the input 'data' of size 'width'x'height' with 'num_levels'
 // quantized values. If not NULL, 'mse' will contain the square root of the
 // mean-squared quantization error (0 when the data already uses no more than
 // 'num_levels' distinct values and is left untouched).
 // Valid range for 'num_levels' is [2, 256].
 // Returns 0 in case of error (data is NULL, or parameters are invalid) and
 // 1 otherwise.
 int QuantizeLevels(uint8_t* data, int width, int height, int num_levels,
                    float* mse);
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 #endif  /* WEBP_UTILS_ALPHA_H_ */
--- a/src/utils/quant_levels.c
+++ b/src/utils/quant_levels.c
@@ -0,0 +1,143 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Quantize levels for specified number of quantization-levels ([2, 256]).
 // Min and max values are preserved (usual 0 and 255 for alpha plane).
 //
 // Author: skal@google.com (Pascal Massimino)
 #include <assert.h>
 #include <math.h>    // for sqrt()
 #include "./alpha.h"
 #define NUM_SYMBOLS     256
 #define MAX_ITER  6             // Maximum number of convergence steps.
 #define ERROR_THRESHOLD 1e-4    // MSE stopping criterion.
 // -----------------------------------------------------------------------------
 // Quantize levels.
 int QuantizeLevels(uint8_t* data, int width, int height,
                   int num_levels, float* mse) {
  int freq[NUM_SYMBOLS] = { 0 };
  int q_level[NUM_SYMBOLS] = { 0 };
  double inv_q_level[NUM_SYMBOLS] = { 0 };
  int min_s = 255, max_s = 0;
  const size_t data_size = height * width;
  size_t n = 0;
  int s, num_levels_in, iter;
  double last_err = 1.e38, err = 0.;
  if (data == NULL) {
    return 0;
  }
  if (width <= 0 || height <= 0) {
    return 0;
  }
  if (num_levels < 2 || num_levels > 256) {
    return 0;
  }
  num_levels_in = 0;
  for (n = 0; n < data_size; ++n) {
    num_levels_in += (freq[data[n]] == 0);
    if (min_s > data[n]) min_s = data[n];
    if (max_s < data[n]) max_s = data[n];
    ++freq[data[n]];
  }
  if (num_levels_in <= num_levels) {
    if (mse) *mse = 0.;
    return 1;   // nothing to do !
  }
  // Start with uniformly spread centroids.
  for (s = 0; s < num_levels; ++s) {
    inv_q_level[s] = min_s + (double)(max_s - min_s) * s / (num_levels - 1);
  }
  // Fixed values. Won't be changed.
  q_level[min_s] = 0;
  q_level[max_s] = num_levels - 1;
  assert(inv_q_level[0] == min_s);
  assert(inv_q_level[num_levels - 1] == max_s);
  // k-Means iterations.
  for (iter = 0; iter < MAX_ITER; ++iter) {
    double err_count;
    double q_sum[NUM_SYMBOLS] = { 0 };
    double q_count[NUM_SYMBOLS] = { 0 };
    int slot = 0;
    // Assign classes to representatives.
    for (s = min_s; s <= max_s; ++s) {
      // Keep track of the nearest neighbour 'slot'
      while (slot < num_levels - 1 &&
             2 * s > inv_q_level[slot] + inv_q_level[slot + 1]) {
        ++slot;
      }
      if (freq[s] > 0) {
        q_sum[slot] += s * freq[s];
        q_count[slot] += freq[s];
      }
      q_level[s] = slot;
    }
    // Assign new representatives to classes.
    if (num_levels > 2) {
      for (slot = 1; slot < num_levels - 1; ++slot) {
        const double count = q_count[slot];
        if (count > 0.) {
          inv_q_level[slot] = q_sum[slot] / count;
        }
      }
    }
    // Compute convergence error.
    err = 0.;
    err_count = 0.;
    for (s = min_s; s <= max_s; ++s) {
      const double error = s - inv_q_level[q_level[s]];
      err += freq[s] * error * error;
      err_count += freq[s];
    }
    if (err_count > 0.) err /= err_count;
    // Check for convergence: we stop as soon as the error is no
    // longer improving.
    if (last_err - err < ERROR_THRESHOLD) break;
    last_err = err;
  }
  // Remap the alpha plane to quantized values.
  {
    // double->int rounding operation can be costly, so we do it
    // once for all before remaping. We also perform the data[] -> slot
    // mapping, while at it (avoid one indirection in the final loop).
    uint8_t map[NUM_SYMBOLS];
    int s;
    for (s = min_s; s <= max_s; ++s) {
      const int slot = q_level[s];
      map[s] = (uint8_t)(inv_q_level[slot] + .5);
    }
    // Final pass.
    for (n = 0; n < data_size; ++n) {
      data[n] = map[data[n]];
    }
  }
  // Compute final mean squared error if needed.
  if (mse) {
    *mse = sqrt(err);
  }
  return 1;
 }
--- a/src/utils/tcoder.c
+++ b/src/utils/tcoder.c
@@ -0,0 +1,460 @@
 // Copyright 2011 Google Inc.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Tree-coder using VP8's boolean coder
 //
 // Author: Skal (pascal.massimino@gmail.com)
 //
 // Rationale:
 //   We extend the boolean (binary) coder to handle arbitrary-sized alphabets,
 // and not just binary ones.
 // We dynamically maintain the population count and use the locally-optimal
 // probability distribution for coding every symbol. Every symbol can be
 // coded using _any_ binary tree. The boolean coder would traverse it and
 // branch each nodes left and right with the accumulated probability.
 //
 // E.g. with 3 symbols A, B, C already coded 30, 50 and 120 times respectively:
 //
 /*  Root Node #0 (count=30+50+120=200)
    |  \
    |   A (count=30)
   Inner-Node #1 (count=50+120=170)
    | \
    |  C (count=120)
    B (count=50)
 */
 // If the next symbol to code is "C", we'll first code '0' with probability
 // p0 = 170/200 (which is the probability of taking the left branch at the
 // Root Node #0) and then code '1' with a probability p1 = 120/170 (which
 // is the probability of taking the right branch at the Inner-Node #1). The
 // total probability p0 * p1  = 120 / 200 is the correct one for symbol 'C'
 // (up to small rounding differences in the boolean coder).
 // The alphabet could be coded with _any_ tree, provided the count at the
 // inner nodes are updated appropriately. Put otherwise, the binary tree
 // is only used to efficiently update the frequency counts in O(ln(N)) time
 // instead of O(N).
 // For instance, we could use the equivalent tree:
 /*  Root (count=200)
     | \
     |  C (count=120)
    Inner (count=50+30=80)
     |  \
     |   B (count=50)
     A (count=30)
 */
 // The frequency distribution would still be respected when coding the symbols.
 // But! There's a noticeable difference: it only takes _one_ call to VP8PutBit()
 // when coding the letter 'C' (with probability 120/200), which is the most
 // frequent symbol. This has an impact on speed, considering that each call
 // to VP8PutBit/VP8GetBit is costly. Hence, in order to minimize the number
 // of binary coding, the frequent symbol should be up in the tree.
 // Using Huffman tree is a solution, but the management and updating can be
 // quite complicated. Here we opt for a simpler option: we use _ternary_
 // tree instead, where each inner node can be associated with a symbol, in
 // addition to the regular left/right branches. When we traverse down
 // the tree, a stop bit is used to signal whether the traversal is finished
 // or not. Its probability is proportional to the frequency with which the
 // node's symbol has been seen (see probaS_). If the traversal is not
 // finished, we keep branching right or left according with a probability
 // proportional to each branch's use count (see probaL_).
 // When a symbol is seen more frequently than its parent, we simply
 // exchange the two symbols without changing the tree structure or the
 // left/right branches.
 // Hence, both tree examples above can be coded using this ternary tree:
 /*       Root #0 (count=200)
         / | \
        /  C  \
    Node #1   Node #2
    / | \     / | \
   x  A  x   x  B  x        <- where 'x' means un-assigned branches.
 */
 // Here, if the symbol 'A' becomes more frequent afterward, we'll just swap it
 // with 'C' (cf ExchangeSymbol()) without reorganizing the tree.
 //
 // Using this simple maintenance, we observed a typical 10-20% reduction
 // in the number of calls to VP8PutBit(), leading to 3-5% speed gain.
 //
 #include "./tcoderi.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 #ifdef _MSC_VER
 // Replacement for log2(), compiled only when _MSC_VER is defined
 // (presumably because that toolchain's math library did not provide it).
 // Computes the base-2 logarithm as log(d) * (1 / ln 2).
 static double log2(double d) {
  const double kLog2Reciprocal = 1.442695040888963;
  return log(d) * kLog2Reciprocal;
 }
 #endif
 // Returns the zero-based index of the highest set bit of 'code', i.e. the
 // number of right shifts needed to bring it down to 1. Returns 0 for
 // code <= 1.
 static WEBP_INLINE int CodeLength(int code) {
   int shifts = 0;
   int remaining;
   for (remaining = code; remaining > 1; remaining >>= 1) {
     ++shifts;
   }
   return shifts;
 }
 // -----------------------------------------------------------------------------
 // Allocates a coder for symbols in [0, max_symbol] and resets it with
 // TCoderInit(). The TCoder struct, its node array and its symbol map live in
 // a single malloc'ed block, so TCoderDelete() releases everything at once.
 // Returns NULL if 'max_symbol' is negative, if the requested alphabet is too
 // large to size without overflow, or if the allocation fails.
 TCoder* TCoderNew(int max_symbol) {
   TCoder* c;
   uint8_t* memory;
   size_t num_nodes;
   size_t per_node_size;
   // '~0u >> 1' is the largest int on the usual representations: refuse it so
   // that 'max_symbol + 1' stays representable as an int.
   if (max_symbol < 0 || max_symbol >= (int)(~0u >> 1)) return NULL;
   num_nodes = (size_t)max_symbol + 1;
   per_node_size = sizeof(*c->nodes_) + sizeof(*c->symbols_);
   // Make sure the total byte count below cannot wrap around.
   if (num_nodes > (((size_t)-1) - sizeof(*c)) / per_node_size) return NULL;
   memory = (uint8_t*)malloc(sizeof(*c) + num_nodes * per_node_size);
   if (memory == NULL) return NULL;
   c = (TCoder*)memory;
   memory += sizeof(*c);
   // Nodes are addressed from index 1, hence the one-element bias.
   c->nodes_ = (Node*)memory - 1;
   memory += num_nodes * sizeof(*c->nodes_);
   c->symbols_ = (int*)memory;
   c->num_nodes_ = (int)num_nodes;
   c->frozen_ = 0;
   TCoderInit(c);
   return c;
 }
 // Puts 'node' back in its pristine state: it carries 'symbol', both counters
 // are zero and both probabilities are HALF_PROBA.
 static WEBP_INLINE void ResetNode(Node* const node, Symbol_t symbol) {
   assert(node);
   node->symbol_ = symbol;
   node->count_  = (Count_t)0;
   node->countS_ = (Count_t)0;
   node->probaL_ = HALF_PROBA;
   node->probaS_ = HALF_PROBA;
 }
 // Empties the tree: every node slot [1, num_nodes_] is reset to carry
 // INVALID_SYMBOL, and the coder-wide counters and probabilities go back to
 // their initial values.
 static void ResetTree(TCoder* const c) {
   int slot = 1;
   assert(c);
   c->num_symbols_ = 0;
   c->total_coded_ = 0;
   c->probaN_ = HALF_PROBA;
   while (slot <= c->num_nodes_) {
     ResetNode(&c->nodes_[slot], INVALID_SYMBOL);
     ++slot;
   }
   c->fixed_symbols_ = 0;
   c->symbol_bit_cost_ = 5 + CodeLength(c->num_nodes_);
 }
 // Forgets where every symbol lives: all num_nodes_ entries of the symbol map
 // are set to INVALID_POS, and the symbol count and new-symbol probability
 // are reset.
 static void ResetSymbolMap(TCoder* const c) {
   Symbol_t sym = 0;
   assert(c);
   c->num_symbols_ = 0;
   c->probaN_ = HALF_PROBA;
   while (sym < c->num_nodes_) {
     c->symbols_[sym] = INVALID_POS;
     ++sym;
   }
 }
 // Brings the coder back to its initial, empty state. A frozen coder is left
 // exactly as it is.
 void TCoderInit(TCoder* const c) {
   assert(c);
   if (c->frozen_) return;
   // Reset counters
   ResetTree(c);
   ResetSymbolMap(c);
 }
 // Releases a coder created by TCoderNew(). The nodes and the symbol map are
 // part of the same allocation, so a single free() is enough. Passing NULL is
 // harmless.
 void TCoderDelete(TCoder* const c) {
  free(c);
 }
 // -----------------------------------------------------------------------------
 // Tree utils around nodes
 // Number of times node 'n' was visited: the hits on its own symbol plus the
 // count accumulated for everything below it.
 static WEBP_INLINE Count_t TotalCount(const Node* const n) {
   const Count_t own_hits = n->countS_;
   const Count_t below_hits = n->count_;
   return own_hits + below_hits;
 }
 // Returns true if the node at 'pos' has no child, i.e. its first child slot
 // (2 * pos) lies beyond the slots currently in use.
 static WEBP_INLINE int IsLeaf(const TCoder* const c, int pos) {
   const int first_child = 2 * pos;
   return (first_child > c->num_symbols_);
 }
 // Returns true if the node at 'pos' has exactly one child: slot 2 * pos is
 // the last slot in use, so slot 2 * pos + 1 does not exist yet.
 // NOTE(review): UpdateNodeProbas() treats slot 2 * pos as the *left* child,
 // so the 'Right' in this function's name looks inverted -- confirm.
 static WEBP_INLINE int HasOnlyRightChild(const TCoder* const c, int pos) {
   const int first_child = 2 * pos;
   return (first_child == c->num_symbols_);
 }
 // -----------------------------------------------------------------------------
 // Node management
 // Gives the never-seen symbol 's' a node and returns that node's position.
 // The coder must still have a free slot and 's' must be a valid index into
 // the symbol map; both conditions are asserted before anything is touched.
 static int NewNode(TCoder* const c, int s) {
   int pos;
   assert(c);
   assert(s >= 0 && s < c->num_nodes_);
   assert(c->num_symbols_ < c->num_nodes_);
   // For an initial new symbol position, we pick the slot that is the
   // closest to the top of the tree. It shortens the paths' length.
   pos = 1 + c->num_symbols_;
   c->symbols_[s] = pos;
   ResetNode(&c->nodes_[pos], s);
   ++c->num_symbols_;
   return pos;
 }
 // trivial method, mainly for debug
 // Returns the position of the node that currently carries symbol 's'.
 // 's' is validated before it is used as an index; the remaining asserts
 // check that the symbol map and the node array agree with each other.
 static WEBP_INLINE int SymbolToNode(const TCoder* const c, int s) {
   int pos;
   assert(c);
   assert(s >= 0 && s < c->num_nodes_ && s != INVALID_SYMBOL);
   pos = c->symbols_[s];
   assert(pos != INVALID_POS);
   assert(c->nodes_[pos].symbol_ == s);
   return pos;
 }
 // Make child symbol bubble up one level: the symbols (and their own hit
 // counts) of the node at 'pos' and of its parent at 'pos >> 1' trade places.
 // The tree shape is untouched; only the symbol map and the parent's
 // below-count are adjusted to stay consistent.
 static void ExchangeSymbol(const TCoder* const c, const int pos) {
   const int parent_pos = pos >> 1;
   Node* const upper = &c->nodes_[parent_pos];   // parent node
   Node* const lower = &c->nodes_[pos];          // child node
   const Symbol_t upper_symbol = upper->symbol_;
   const Symbol_t lower_symbol = lower->symbol_;
   const Count_t upper_hits = upper->countS_;
   const Count_t lower_hits = lower->countS_;
   c->symbols_[lower_symbol] = parent_pos;
   c->symbols_[upper_symbol] = pos;
   assert(lower_hits >= upper_hits);
   // The child keeps fewer hits after the exchange, so the total hanging
   // below the parent shrinks by the difference.
   upper->count_ -= (lower_hits - upper_hits);
   assert(upper->count_ > 0);
   upper->countS_ = lower_hits;
   lower->countS_ = upper_hits;
   upper->symbol_ = lower_symbol;
   lower->symbol_ = upper_symbol;
   // Note: probaL_ and probaS_ are recomputed. No need to exchange them.
 }
 // -----------------------------------------------------------------------------
 // probability computation
 // Scales the ratio num / total to the [0, max_proba] range ('round' being
 // added to the numerator before the division) and returns the complement of
 // the result with respect to MAX_PROBA. 'total' must be non-zero.
 // NOTE(review): the product is evaluated in Count_t arithmetic, so it can
 // wrap around for very large counts -- to be confirmed together with the
 // overflow TODO attached to Count_t.
 static WEBP_INLINE int CalcProba(Count_t num, Count_t total,
                                  int max_proba, int round) {
  int scaled;
  assert(total > 0);
  scaled = (num * max_proba + round) / total;
  assert(scaled >= 0 && scaled <= MAX_PROBA);
  return MAX_PROBA - scaled;
 }
 // Refreshes the cached probabilities (probaS_, and probaL_ when the node has
 // a child) of the node at 'pos' from its current counters.
 static WEBP_INLINE void UpdateNodeProbas(TCoder* const c, int pos) {
  Node* const cur = &c->nodes_[pos];
  const Count_t visits = TotalCount(cur);
  cur->probaS_ = CalcProba(cur->countS_, visits, MAX_PROBA, 0);
  if (!IsLeaf(c, pos)) {
    const Node* const first_child = &c->nodes_[2 * pos];
    // CalcProba() returns a complement: undo it to store the direct ratio.
    const int complement =
        CalcProba(TotalCount(first_child), cur->count_, MAX_PROBA, 0);
    cur->probaL_ = MAX_PROBA - complement;
  }
 }
 // Refreshes the cached probabilities of every node on the path going from
 // 'pos' up to the root (position 1), then the cached new-symbol probability.
 static void UpdateProbas(TCoder* const c, int pos) {
  while (pos >= 1) {
    UpdateNodeProbas(c, pos);
    pos >>= 1;
  }
  c->probaN_ = CalcProba(c->num_symbols_, c->total_coded_, HALF_PROBA - 1, 0);
 }
 // -----------------------------------------------------------------------------
 // Records 'incr' more occurrences of the symbol stored at node 'pos'.
 // The counters are updated from that node up to the root, a node being
 // exchanged with its parent whenever its symbol has become the more frequent
 // of the two. The probabilities along the modified path are refreshed last.
 // When the coder is frozen, only a symbol that was never counted before is
 // recorded.
 static void UpdateTree(TCoder* const c, int pos, Count_t incr) {
  Node* node;
  int is_fresh_new_symbol;
  // Validate the arguments before they are used to address a node.
  assert(c != NULL);
  assert(pos >= 1 && pos <= c->num_nodes_);
  node = &c->nodes_[pos];
  assert(node->symbol_ != INVALID_SYMBOL);
  is_fresh_new_symbol = (node->countS_ == 0);
  if (!c->frozen_ || is_fresh_new_symbol) {
    const int starting_pos = pos;   // save for later
    // Update the counters up the tree, possibly exchanging some nodes
    node->countS_ += incr;
    while (pos > 1) {
      Node* const parent = &c->nodes_[pos >> 1];
      parent->count_ += incr;
      if (parent->countS_ < node->countS_) {
        ExchangeSymbol(c, pos);
      }
      pos >>= 1;
      node = parent;
    }
    c->total_coded_ += incr;
    UpdateProbas(c, starting_pos);  // Update the probas along the modified path
  }
 }
 // -----------------------------------------------------------------------------
 // Fixed-length symbol coding
 // Note: the symbol will be coded exactly once at most, so using a fixed length
 // code is better than Golomb-code (e.g.) on average.
 // We use the exact bit-distribution probability considering the upper-bound
 // supplied:
 //  Written in binary, a symbol 's' has a probability of having its k-th bit
 // set to 1 which is given by:
 //  If the k-th bit of max_value is 0:
 //    P0(k) = [(max_value >> (k + 1)) << k] / max_value
 //  If the k-th bit of max_value is 1:
 //    P1(k) = P0(k) + [max_value & ((1 << k) - 1)] / max_value
 // Writes 's' as a fixed-length code, least significant bit first. One bit is
 // emitted for every power of two that is strictly below 'max_value', each
 // with a probability derived from 'max_value'.
 static WEBP_INLINE void CodeSymbol(VP8BitWriter* const bw, int s,
                                    int max_value) {
  int bit_pos = 0;
  int bit_mask = 1;
  assert(bw);
  while (bit_mask < max_value) {
    const int low_bits = bit_mask - 1;
    const int base = (max_value >> 1) & ~low_bits;
    const int den =
        (max_value & bit_mask) ? (base | (max_value & low_bits)) : base;
    const int proba = MAX_PROBA - MAX_PROBA * den / max_value;
    VP8PutBit(bw, (s >> bit_pos) & 1, proba);
    bit_mask <<= 1;
    ++bit_pos;
  }
 }
 // Reads a fixed-length coded value, least significant bit first. One bit is
 // read for every power of two that is strictly below 'max_value', each with a
 // probability derived from 'max_value'.
 // Note: nothing here clamps the result, so it is only bounded by the number
 // of bits read.
 static WEBP_INLINE int DecodeSymbol(VP8BitReader* const br, int max_value) {
  int value = 0;
  int bit_pos = 0;
  int bit_mask;
  assert(br);
  for (bit_mask = 1; bit_mask < max_value; bit_mask <<= 1) {
    const int low_bits = bit_mask - 1;
    const int base = (max_value >> 1) & ~low_bits;
    const int den =
        (max_value & bit_mask) ? (base | (max_value & low_bits)) : base;
    const int proba = MAX_PROBA - MAX_PROBA * den / max_value;
    value |= VP8GetBit(br, proba) << bit_pos;
    ++bit_pos;
  }
  return value;
 }
 // -----------------------------------------------------------------------------
 // Encoding
 // Codes symbol 's' and records it in the frequency tree.
 // A symbol seen for the first time is written as a raw fixed-length value,
 // preceded by a 'new symbol' flag whenever both outcomes are possible. A
 // symbol already in the tree is written as the path leading from the root to
 // its node. If 'bw' is NULL nothing is written: only the tree is updated.
 void TCoderEncode(TCoder* const c, int s, VP8BitWriter* const bw) {
  int pos;
  int is_new_symbol;
  // Validate the arguments before 's' is used as an index into symbols_[].
  assert(c != NULL);
  assert(s >= 0 && s < c->num_nodes_);
  is_new_symbol = (c->symbols_[s] == INVALID_POS);
  if (!c->fixed_symbols_ && c->num_symbols_ < c->num_nodes_) {
    if (c->num_symbols_ > 0) {
      if (bw != NULL) {
        VP8PutBit(bw, is_new_symbol, c->probaN_);
      }
    } else {
      // The tree is empty: the symbol is necessarily new, no flag is coded.
      assert(is_new_symbol);
    }
  } else {
    // No symbol can be added any more: 's' must be a known one.
    assert(!is_new_symbol);
  }
  if (is_new_symbol) {
    if (bw != NULL) {
      CodeSymbol(bw, s, c->num_nodes_);
    }
    pos = NewNode(c, s);
  } else {
    pos = SymbolToNode(c, s);
    if (bw != NULL) {
      const int length = CodeLength(pos);
      int parent = 1;   // the walk starts from the root
      int i;
      // Walk down from the root. No bit is coded once a leaf is reached.
      for (i = 0; !IsLeaf(c, parent); ++i) {
        const Node* const node = &c->nodes_[parent];
        const int symbol_proba = node->probaS_;
        const int is_stop = (i == length);
        if (VP8PutBit(bw, is_stop, symbol_proba)) {
          break;
        } else if (!HasOnlyRightChild(c, parent)) {
          const int left_proba = node->probaL_;
          const int is_right = (pos >> (length - 1 - i)) & 1;  // extract bits #i
          VP8PutBit(bw, is_right, left_proba);
          parent = (parent << 1) | is_right;
        } else {
          // Single child: the branch is implicit, no direction bit is coded.
          parent <<= 1;
          break;
        }
      }
      assert(parent == pos);
    }
  }
  UpdateTree(c, pos, 1);
 }
 // -----------------------------------------------------------------------------
 // Decoding
 // Decodes and returns the next symbol, recording it in the frequency tree.
 // This mirrors TCoderEncode(): a new symbol is read as a raw fixed-length
 // value, a known one is found by walking down from the root.
 // If the raw value read for a new symbol is not a valid symbol, the input is
 // considered corrupted: the reader's eof_ flag is set and 0 is returned,
 // leaving the tree untouched.
 int TCoderDecode(TCoder* const c, VP8BitReader* const br) {
  int s;
  int pos;
  int is_new_symbol = 0;
  assert(c);
  assert(br);
  // Check if we need to transmit the new symbol's value
  if (!c->fixed_symbols_ && c->num_symbols_ < c->num_nodes_) {
    if (c->num_symbols_ > 0) {
      is_new_symbol = VP8GetBit(br, c->probaN_);
    } else {
      is_new_symbol = 1;
    }
  }
  // Code either the raw value, or the path downward to its node.
  if (is_new_symbol) {
    s = DecodeSymbol(br, c->num_nodes_);
    // DecodeSymbol() reads whole bits, so bad input can yield a value past the
    // last valid symbol. It must not be used as an index into symbols_[].
    if (s < 0 || s >= c->num_nodes_) {
      br->eof_ = 1;   // flag the error for the caller
      return 0;
    }
    pos = NewNode(c, s);
  } else {
    pos = 1;
    while (!IsLeaf(c, pos)) {
      const Node* const node = &c->nodes_[pos];
      // Did we reach the stopping node?
      const int symbol_proba = node->probaS_;
      const int is_stop = VP8GetBit(br, symbol_proba);
      if (is_stop) {
        break;  // reached the stopping node for the coded symbol.
      } else {
        // Not yet done, keep traversing and branching.
        if (!HasOnlyRightChild(c, pos)) {
          const int left_proba = node->probaL_;
          const int is_right = VP8GetBit(br, left_proba);
          pos = (pos << 1) | is_right;
        } else {
          // Single child: the branch is implicit, no direction bit is read.
          pos <<= 1;
          break;
        }
        assert(pos <= c->num_nodes_);
      }
    }
    s = c->nodes_[pos].symbol_;
    assert(pos == SymbolToNode(c, s));
  }
  assert(pos <= c->num_nodes_);
  UpdateTree(c, pos, 1);
  return s;
 }
 // -----------------------------------------------------------------------------
 // Returns the theoretical number of bits needed to code 'symbol' in the
 // current state: -log2() of its estimated frequency if the symbol is already
 // in the tree, the cached cost of a new symbol (symbol_bit_cost_) otherwise.
 double TCoderSymbolCost(const TCoder* const c, int symbol) {
  int pos;
  // Validate the arguments before 'symbol' is used to index symbols_[].
  assert(c != NULL);
  assert(symbol >= 0 && symbol < c->num_nodes_);
  pos = c->symbols_[symbol];
  if (pos != INVALID_POS) {
    const Node* const node = &c->nodes_[pos];
    const Count_t count = node->countS_;
    assert(count > 0);
    assert(c->total_coded_ > 0);
    // Note: we use 1 + total_coded_ as denominator because we most probably
    // intend to code an extra symbol afterward.
    // TODO(skal): is log2() too slow ?
    return -log2(count / (1. + c->total_coded_));
  }
  return c->symbol_bit_cost_;
 }
 // -----------------------------------------------------------------------------
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
--- a/src/utils/tcoder.h
+++ b/src/utils/tcoder.h
@@ -0,0 +1,84 @@
 // Copyright 2011 Google Inc.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Tree-coder using VP8's boolean coder
 //
 // Symbols are stored as nodes of a tree that records their frequencies and
 // is dynamically updated.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 //
 // Encoding example:
 /*
 static int Compress(const uint8_t* src, int src_length,
                    uint8_t** output, size_t* output_size) {
  int i;
  TCoder* coder = TCoderNew(255);
  VP8BitWriter bw;
  VP8BitWriterInit(&bw, 0);
  for (i = 0; i < src_length; ++i)
    TCoderEncode(coder, src[i], &bw);
  TCoderDelete(coder);
  VP8BitWriterFinish(&bw);
  *output = VP8BitWriterBuf(&bw);
  *output_size = VP8BitWriterSize(&bw);
  return !bw.error_;
 }
 */
 //
 // Decoding example:
 /*
 static int Decompress(const uint8_t* src, size_t src_size,
                      uint8_t* dst, int dst_length) {
  int i;
  TCoder* coder = TCoderNew(255);
  VP8BitReader br;
  VP8InitBitReader(&br, src, src + src_size);
  for (i = 0; i < dst_length; ++i)
    dst[i] = TCoderDecode(coder, &br);
  TCoderDelete(coder);
  return !br.eof_;
 }
 */
 #ifndef WEBP_UTILS_TCODER_H_
 #define WEBP_UTILS_TCODER_H_
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 struct VP8BitReader;
 struct VP8BitWriter;
 typedef struct TCoder TCoder;
 // Creates a tree-coder capable of coding symbols in
 // the [0, max_symbol] range. Returns NULL in case of memory error.
 TCoder* TCoderNew(int max_symbol);
 // Re-initialize an existing object, make it ready for a new encoding or
 // decoding cycle.
 void TCoderInit(TCoder* const c);
 // Destroys the tree-coder object and frees memory.
 void TCoderDelete(TCoder* const c);
 // Code next symbol 's'. If the bit-writer 'bw' is NULL, the function will
 // just record the symbol, and update the internal frequency counters.
 void TCoderEncode(TCoder* const c, int s, struct VP8BitWriter* const bw);
 // Decode and return next symbol. The internal frequency counters are updated
 // the same way as during encoding.
 int TCoderDecode(TCoder* const c, struct VP8BitReader* const br);
 // Theoretical number of bits needed to code 'symbol' in the current state.
 // The coder is left unmodified.
 double TCoderSymbolCost(const TCoder* const c, int symbol);
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 #endif  // WEBP_UTILS_TCODER_H_
--- a/src/utils/tcoderi.h
+++ b/src/utils/tcoderi.h
@@ -0,0 +1,71 @@
 // Copyright 2011 Google Inc.
 //
 // This code is licensed under the same terms as WebM:
 //  Software License Agreement:  http://www.webmproject.org/license/software/
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
 // Internal header for tree-coder
 //
 // Author: Skal (pascal.massimino@gmail.com)
 //
 #ifndef WEBP_UTILS_TCODERI_H_
 #define WEBP_UTILS_TCODERI_H_
 #include "./tcoder.h"
 #include <assert.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "../utils/bit_reader.h"
 #include "../utils/bit_writer.h"
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 // Type of a symbol value. Symbols are also used as indexes into symbols_[].
 typedef int Symbol_t;
 // Type of the frequency counters.
 typedef uint32_t Count_t;  // TODO(skal): check overflow during coding.
 // Value marking the absence of a symbol.
 #define INVALID_SYMBOL ((Symbol_t)(-1))
 // Value marking the absence of a node: nodes are 1-based, so position 0 is
 // never a real one.
 #define INVALID_POS    0
 // Upper end of the scale used for the probabilities given to the bit coder.
 #define MAX_PROBA 255
 // HALF_PROBA - 1 is the scale used for the new-symbol probability.
 #define HALF_PROBA 128
 // Node of the coding tree. Coding a symbol through a node has three possible
 // outcomes: stop on the node's own symbol, or continue into one of its two
 // children.
 typedef struct {        // ternary node.
  Symbol_t symbol_;   // symbol stored in this node
  // Note: theoretically, one of these three fields is redundant and could be
  // omitted, but it'd make the code quite complicated (having to look-up the
  // parent's total count in order to deduce the missing field). Better not.
  Count_t countS_;    // count for symbol
  Count_t count_;     // count for non-symbol (derived from sub-tree)
  // cached left proba = MAX_PROBA * TotalCount(left) / count_
  int probaL_;
  // cached approximate proba of NOT stopping on this node's symbol
  //   = MAX_PROBA - MAX_PROBA * countS_ / TotalCount
  int probaS_;
 } Node;
 // State of a tree-coder: the frequency tree itself plus the look-up table
 // giving, for each symbol, the position of its node.
 struct TCoder {
  // dynamic fields:
  int num_symbols_;       // number of symbols actually used
  Count_t total_coded_;   // total number of coded symbols
  // if true, frequencies are not updated (except for the very first
  // occurrence of a symbol, which is always recorded)
  int frozen_;
  // if true, symbols are not updated: no new symbol can be introduced
  int fixed_symbols_;
  int probaN_;            // cached new-symbol probability
  // constants:
  int num_nodes_;            // max number of symbols or nodes. Constant, > 0.
  double symbol_bit_cost_;   // latest evaluation of the bit-cost per new symbol
  Node* nodes_;              // nodes (1-based indexed)
  // for each symbol, location of its node (INVALID_POS if it has none yet)
  int* symbols_;
 };
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 #endif  // WEBP_UTILS_TCODERI_H_