diff --git a/examples/cwebp.c b/examples/cwebp.c index df14b83f..a04264db 100644 --- a/examples/cwebp.c +++ b/examples/cwebp.c @@ -838,6 +838,11 @@ int main(int argc, const char *argv[]) { } } else if (!strcmp(argv[c], "-noalpha")) { keep_alpha = 0; +#ifdef USE_LOSSLESS_ENCODER + } else if (!strcmp(argv[c], "-lossless")) { + config.lossless = 1; + picture.use_argb_input = 1; +#endif } else if (!strcmp(argv[c], "-size") && c < argc - 1) { config.target_size = strtol(argv[++c], NULL, 0); } else if (!strcmp(argv[c], "-psnr") && c < argc - 1) { diff --git a/src/dec/vp8l.c b/src/dec/vp8l.c index 92e62d22..d73ffe27 100644 --- a/src/dec/vp8l.c +++ b/src/dec/vp8l.c @@ -31,6 +31,7 @@ static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 }; #define NUM_LENGTH_CODES 24 #define NUM_DISTANCE_CODES 40 #define DEFAULT_CODE_LENGTH 8 +#define MAX_CACHE_BITS 11 // ----------------------------------------------------------------------------- // Five Huffman codes are used at each meta code: @@ -171,7 +172,7 @@ static WEBP_INLINE int ReadSymbol(const HuffmanTree* tree, static int ReadHuffmanCodeLengths( VP8LDecoder* const dec, const int* const code_length_code_lengths, - int num_codes, int num_symbols, int* const code_lengths) { + int num_symbols, int* const code_lengths) { int ok = 0; VP8LBitReader* const br = &dec->br_; int symbol; @@ -179,7 +180,8 @@ static int ReadHuffmanCodeLengths( int prev_code_len = DEFAULT_CODE_LENGTH; HuffmanTree tree; - if (!HuffmanTreeBuildImplicit(&tree, code_length_code_lengths, num_codes)) { + if (!HuffmanTreeBuildImplicit(&tree, code_length_code_lengths, + NUM_CODE_LENGTH_CODES)) { dec->status_ = VP8_STATUS_BITSTREAM_ERROR; return 0; } @@ -236,25 +238,17 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec, int symbols[2]; int codes[2]; int code_lengths[2]; - const int nbits = VP8LReadBits(br, 3); - const int num_symbols = 1 + ((nbits == 0) ? 0 : VP8LReadBits(br, 1)); - - if (nbits == 0) { - symbols[0] = 0; - codes[0] = 0; - code_lengths[0] = 0; - } else { - const int num_bits = (nbits - 1) * 2 + 4; - int i; - for (i = 0; i < num_symbols; ++i) { - symbols[i] = VP8LReadBits(br, num_bits); - if (symbols[i] >= alphabet_size) { - dec->status_ = VP8_STATUS_BITSTREAM_ERROR; - return 0; - } - codes[i] = i; - code_lengths[i] = num_symbols - 1; - } + const int num_symbols = VP8LReadBits(br, 1) + 1; + const int first_symbol_len_code = VP8LReadBits(br, 1); + // The first code is either 1 bit or 8 bit code. + symbols[0] = VP8LReadBits(br, (first_symbol_len_code == 0) ? 1 : 8); + codes[0] = 0; + code_lengths[0] = num_symbols - 1; + // The second code (if present), is always 8 bit long. 
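+    // code_lengths[] is set to num_symbols - 1: 0 when there is a single
+    // symbol and 1 when there are two, so each symbol of a two-symbol code
+    // costs exactly one bit.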
+ if (num_symbols == 2) { + symbols[1] = VP8LReadBits(br, 8); + codes[1] = 1; + code_lengths[1] = num_symbols - 1; } ok = HuffmanTreeBuildExplicit(tree, code_lengths, codes, symbols, num_symbols); @@ -277,9 +271,8 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec, for (i = 0; i < num_codes; ++i) { code_length_code_lengths[kCodeLengthCodeOrder[i]] = VP8LReadBits(br, 3); } - ok = ReadHuffmanCodeLengths(dec, code_length_code_lengths, - NUM_CODE_LENGTH_CODES, - alphabet_size, code_lengths); + ok = ReadHuffmanCodeLengths(dec, code_length_code_lengths, alphabet_size, + code_lengths); if (ok) { ok = HuffmanTreeBuildImplicit(tree, code_lengths, alphabet_size); } @@ -293,11 +286,23 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec, return 1; } +static void DeleteHtreeGroups(HTreeGroup* htree_groups, int num_htree_groups) { + if (htree_groups != NULL) { + int i, j; + for (i = 0; i < num_htree_groups; ++i) { + HuffmanTree* const htrees = htree_groups[i].htrees_; + for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { + HuffmanTreeRelease(&htrees[j]); + } + } + free(htree_groups); + } +} + static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, - int* const color_cache_bits_ptr) { + int color_cache_bits) { int ok = 0; int i, j; - int color_cache_size; VP8LBitReader* const br = &dec->br_; VP8LMetadata* const hdr = &dec->hdr_; uint32_t* huffman_image = NULL; @@ -305,11 +310,11 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, int num_htree_groups = 1; if (VP8LReadBits(br, 1)) { // use meta Huffman codes - int meta_codes_nbits; const int huffman_precision = VP8LReadBits(br, 4); const int huffman_xsize = VP8LSubSampleSize(xsize, huffman_precision); const int huffman_ysize = VP8LSubSampleSize(ysize, huffman_precision); const int huffman_pixs = huffman_xsize * huffman_ysize; + if (!DecodeImageStream(huffman_xsize, huffman_ysize, 0, dec, &huffman_image)) { dec->status_ = VP8_STATUS_BITSTREAM_ERROR; @@ -318,19 +323,12 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, hdr->huffman_subsample_bits_ = huffman_precision; for (i = 0; i < huffman_pixs; ++i) { // The huffman data is stored in red and green bytes. 
- huffman_image[i] = (huffman_image[i] >> 8) & 0xffff; + const int index = (huffman_image[i] >> 8) & 0xffff; + huffman_image[i] = index; + if (index >= num_htree_groups) { + num_htree_groups = index + 1; + } } - - meta_codes_nbits = VP8LReadBits(br, 4); - num_htree_groups = 2 + VP8LReadBits(br, meta_codes_nbits); - } - - if (VP8LReadBits(br, 1)) { // use color cache - *color_cache_bits_ptr = VP8LReadBits(br, 4); - color_cache_size = 1 << *color_cache_bits_ptr; - } else { - *color_cache_bits_ptr = 0; - color_cache_size = 0; } htree_groups = (HTreeGroup*)calloc(num_htree_groups, sizeof(*htree_groups)); @@ -341,12 +339,13 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, ok = !br->error_; for (i = 0; ok && i < num_htree_groups; ++i) { + HuffmanTree* const htrees = htree_groups[i].htrees_; for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { int alphabet_size = kAlphabetSize[j]; - if (j == 0) { - alphabet_size += color_cache_size; + if (j == 0 && color_cache_bits > 0) { + alphabet_size += 1 << color_cache_bits; } - ok = ReadHuffmanCode(alphabet_size, dec, &htree_groups[i].htrees_[j]); + ok = ReadHuffmanCode(alphabet_size, dec, htrees + j); ok = ok && !br->error_; } } @@ -360,14 +359,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, Error: free(huffman_image); - if (htree_groups != NULL) { - for (i = 0; i < num_htree_groups; ++i) { - for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { - HuffmanTreeRelease(&htree_groups[i].htrees_[j]); - } - } - free(htree_groups); - } + DeleteHtreeGroups(htree_groups, num_htree_groups); return 0; } @@ -498,6 +490,7 @@ static WEBP_INLINE HTreeGroup* GetHtreeGroupForPos(VP8LMetadata* const hdr, int x, int y) { const int meta_index = GetMetaIndex(hdr->huffman_image_, hdr->huffman_xsize_, hdr->huffman_subsample_bits_, x, y); + assert(meta_index < hdr->num_htree_groups_); return hdr->htree_groups_ + meta_index; } @@ -770,16 +763,7 @@ static void ClearMetadata(VP8LMetadata* const hdr) { assert(hdr); free(hdr->huffman_image_); - if (hdr->htree_groups_ != NULL) { - int i, j; - for (i = 0; i < hdr->num_htree_groups_; ++i) { - for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { - HuffmanTreeRelease(&hdr->htree_groups_[i].htrees_[j]); - } - } - free(hdr->htree_groups_); - } - + DeleteHtreeGroups(hdr->htree_groups_, hdr->num_htree_groups_); VP8LColorCacheDelete(hdr->color_cache_); InitMetadata(hdr); } @@ -836,29 +820,38 @@ static int DecodeImageStream(int xsize, int ysize, int ok = 1; int transform_xsize = xsize; int transform_ysize = ysize; + VP8LBitReader* const br = &dec->br_; VP8LMetadata* const hdr = &dec->hdr_; uint32_t* data = NULL; + const int transform_start_idx = dec->next_transform_; int color_cache_bits = 0; - VP8LBitReader* const br = &dec->br_; - int transform_start_idx = dec->next_transform_; - - // Step#1: Read the transforms (may recurse). + // Read the transforms (may recurse). if (is_level0) { while (ok && VP8LReadBits(br, 1)) { ok = ReadTransform(&transform_xsize, &transform_ysize, dec); } } - // Step#2: Read the Huffman codes (may recurse). - ok = ok && ReadHuffmanCodes(dec, transform_xsize, transform_ysize, - &color_cache_bits); + // Color cache + if (ok && VP8LReadBits(br, 1)) { + color_cache_bits = VP8LReadBits(br, 4); + ok = (color_cache_bits >= 1 && color_cache_bits <= MAX_CACHE_BITS); + if (!ok) { + dec->status_ = VP8_STATUS_BITSTREAM_ERROR; + goto End; + } + } + // Read the Huffman codes (may recurse). 
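+  // The color cache entries extend the first (green) Huffman alphabet by
+  // (1 << color_cache_bits) symbols, which is why the cache bits must be
+  // known before the trees are read.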
+ ok = ok && ReadHuffmanCodes(dec, transform_xsize, transform_ysize, + color_cache_bits); if (!ok) { dec->status_ = VP8_STATUS_BITSTREAM_ERROR; goto End; } + // Finish setting up the color-cache if (color_cache_bits > 0) { hdr->color_cache_size_ = 1 << color_cache_bits; hdr->color_cache_ = (VP8LColorCache*)malloc(sizeof(*hdr->color_cache_)); @@ -869,7 +862,6 @@ static int DecodeImageStream(int xsize, int ysize, goto End; } } - UpdateDecoder(dec, transform_xsize, transform_ysize); if (is_level0) { // level 0 complete @@ -884,11 +876,11 @@ static int DecodeImageStream(int xsize, int ysize, goto End; } - // Step#3: Use the Huffman trees to decode the LZ77 encoded data. + // Use the Huffman trees to decode the LZ77 encoded data. ok = DecodeImageData(dec, data, transform_xsize, transform_ysize, 0); ok = ok && !br->error_; - // Step#4: Apply transforms on the decoded data. + // Apply transforms on the decoded data. if (ok) ApplyInverseTransforms(dec, transform_start_idx, data); End: diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c index 9a960c41..f0ae075c 100644 --- a/src/dsp/lossless.c +++ b/src/dsp/lossless.c @@ -15,12 +15,128 @@ extern "C" { #endif +#include #include #include "./lossless.h" #include "../dec/vp8li.h" +#ifdef USE_LOSSLESS_ENCODER + +#include "../enc/histogram.h" + +// A lookup table for small values of log(int) to be used in entropy +// computation. +// +// ", ".join(["%.16ff" % x for x in [0.0]+[log(x) for x in range(1, 256)]]) +#define LOG_LOOKUP_IDX_MAX 256 +static const float kLogTable[LOG_LOOKUP_IDX_MAX] = { + 0.0000000000000000f, 0.0000000000000000f, 0.6931471805599453f, + 1.0986122886681098f, 1.3862943611198906f, 1.6094379124341003f, + 1.7917594692280550f, 1.9459101490553132f, 2.0794415416798357f, + 2.1972245773362196f, 2.3025850929940459f, 2.3978952727983707f, + 2.4849066497880004f, 2.5649493574615367f, 2.6390573296152584f, + 2.7080502011022101f, 2.7725887222397811f, 2.8332133440562162f, + 2.8903717578961645f, 2.9444389791664403f, 2.9957322735539909f, + 3.0445224377234230f, 3.0910424533583161f, 3.1354942159291497f, + 3.1780538303479458f, 3.2188758248682006f, 3.2580965380214821f, + 3.2958368660043291f, 3.3322045101752038f, 3.3672958299864741f, + 3.4011973816621555f, 3.4339872044851463f, 3.4657359027997265f, + 3.4965075614664802f, 3.5263605246161616f, 3.5553480614894135f, + 3.5835189384561099f, 3.6109179126442243f, 3.6375861597263857f, + 3.6635616461296463f, 3.6888794541139363f, 3.7135720667043080f, + 3.7376696182833684f, 3.7612001156935624f, 3.7841896339182610f, + 3.8066624897703196f, 3.8286413964890951f, 3.8501476017100584f, + 3.8712010109078911f, 3.8918202981106265f, 3.9120230054281460f, + 3.9318256327243257f, 3.9512437185814275f, 3.9702919135521220f, + 3.9889840465642745f, 4.0073331852324712f, 4.0253516907351496f, + 4.0430512678345503f, 4.0604430105464191f, 4.0775374439057197f, + 4.0943445622221004f, 4.1108738641733114f, 4.1271343850450917f, + 4.1431347263915326f, 4.1588830833596715f, 4.1743872698956368f, + 4.1896547420264252f, 4.2046926193909657f, 4.2195077051761070f, + 4.2341065045972597f, 4.2484952420493594f, 4.2626798770413155f, + 4.2766661190160553f, 4.2904594411483910f, 4.3040650932041702f, + 4.3174881135363101f, 4.3307333402863311f, 4.3438054218536841f, + 4.3567088266895917f, 4.3694478524670215f, 4.3820266346738812f, + 4.3944491546724391f, 4.4067192472642533f, 4.4188406077965983f, + 4.4308167988433134f, 4.4426512564903167f, 4.4543472962535073f, + 4.4659081186545837f, 4.4773368144782069f, 4.4886363697321396f, + 4.4998096703302650f, 4.5108595065168497f, 
4.5217885770490405f, + 4.5325994931532563f, 4.5432947822700038f, 4.5538768916005408f, + 4.5643481914678361f, 4.5747109785033828f, 4.5849674786705723f, + 4.5951198501345898f, 4.6051701859880918f, 4.6151205168412597f, + 4.6249728132842707f, 4.6347289882296359f, 4.6443908991413725f, + 4.6539603501575231f, 4.6634390941120669f, 4.6728288344619058f, + 4.6821312271242199f, 4.6913478822291435f, 4.7004803657924166f, + 4.7095302013123339f, 4.7184988712950942f, 4.7273878187123408f, + 4.7361984483944957f, 4.7449321283632502f, 4.7535901911063645f, + 4.7621739347977563f, 4.7706846244656651f, 4.7791234931115296f, + 4.7874917427820458f, 4.7957905455967413f, 4.8040210447332568f, + 4.8121843553724171f, 4.8202815656050371f, 4.8283137373023015f, + 4.8362819069514780f, 4.8441870864585912f, 4.8520302639196169f, + 4.8598124043616719f, 4.8675344504555822f, 4.8751973232011512f, + 4.8828019225863706f, 4.8903491282217537f, 4.8978397999509111f, + 4.9052747784384296f, 4.9126548857360524f, 4.9199809258281251f, + 4.9272536851572051f, 4.9344739331306915f, 4.9416424226093039f, + 4.9487598903781684f, 4.9558270576012609f, 4.9628446302599070f, + 4.9698132995760007f, 4.9767337424205742f, 4.9836066217083363f, + 4.9904325867787360f, 4.9972122737641147f, 5.0039463059454592f, + 5.0106352940962555f, 5.0172798368149243f, 5.0238805208462765f, + 5.0304379213924353f, 5.0369526024136295f, 5.0434251169192468f, + 5.0498560072495371f, 5.0562458053483077f, 5.0625950330269669f, + 5.0689042022202315f, 5.0751738152338266f, 5.0814043649844631f, + 5.0875963352323836f, 5.0937502008067623f, 5.0998664278241987f, + 5.1059454739005803f, 5.1119877883565437f, 5.1179938124167554f, + 5.1239639794032588f, 5.1298987149230735f, 5.1357984370502621f, + 5.1416635565026603f, 5.1474944768134527f, 5.1532915944977793f, + 5.1590552992145291f, 5.1647859739235145f, 5.1704839950381514f, + 5.1761497325738288f, 5.1817835502920850f, 5.1873858058407549f, + 5.1929568508902104f, 5.1984970312658261f, 5.2040066870767951f, + 5.2094861528414214f, 5.2149357576089859f, 5.2203558250783244f, + 5.2257466737132017f, 5.2311086168545868f, 5.2364419628299492f, + 5.2417470150596426f, 5.2470240721604862f, 5.2522734280466299f, + 5.2574953720277815f, 5.2626901889048856f, 5.2678581590633282f, + 5.2729995585637468f, 5.2781146592305168f, 5.2832037287379885f, + 5.2882670306945352f, 5.2933048247244923f, 5.2983173665480363f, + 5.3033049080590757f, 5.3082676974012051f, 5.3132059790417872f, + 5.3181199938442161f, 5.3230099791384085f, 5.3278761687895813f, + 5.3327187932653688f, 5.3375380797013179f, 5.3423342519648109f, + 5.3471075307174685f, 5.3518581334760666f, 5.3565862746720123f, + 5.3612921657094255f, 5.3659760150218512f, 5.3706380281276624f, + 5.3752784076841653f, 5.3798973535404597f, 5.3844950627890888f, + 5.3890717298165010f, 5.3936275463523620f, 5.3981627015177525f, + 5.4026773818722793f, 5.4071717714601188f, 5.4116460518550396f, + 5.4161004022044201f, 5.4205349992722862f, 5.4249500174814029f, + 5.4293456289544411f, 5.4337220035542400f, 5.4380793089231956f, + 5.4424177105217932f, 5.4467373716663099f, 5.4510384535657002f, + 5.4553211153577017f, 5.4595855141441589f, 5.4638318050256105f, + 5.4680601411351315f, 5.4722706736714750f, 5.4764635519315110f, + 5.4806389233419912f, 5.4847969334906548f, 5.4889377261566867f, + 5.4930614433405482f, 5.4971682252932021f, 5.5012582105447274f, + 5.5053315359323625f, 5.5093883366279774f, 5.5134287461649825f, + 5.5174528964647074f, 5.5214609178622460f, 5.5254529391317835f, + 5.5294290875114234f, 5.5333894887275203f, 5.5373342670185366f, + 5.5412635451584258f 
+}; + +#define APPROX_LOG_MAX 4096 +#define LOG_2_BASE_E 0.6931471805599453f + +float VP8LFastLog(int v) { + if (v < APPROX_LOG_MAX) { + int log_cnt = 0; + while (v >= LOG_LOOKUP_IDX_MAX) { + ++log_cnt; + v = v >> 1; + } + return kLogTable[v] + (log_cnt * LOG_2_BASE_E); + } + return log(v); +} + +#endif + //------------------------------------------------------------------------------ -// Inverse image transforms. +// Image transforms. // In-place sum of each component with mod 256. static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) { @@ -101,61 +217,67 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { //------------------------------------------------------------------------------ // Predictors -static void Predictor0(uint32_t* src, const uint32_t* top) { +static uint32_t Predictor0(uint32_t left, const uint32_t* const top) { (void)top; - AddPixelsEq(src, ARGB_BLACK); + (void)left; + return ARGB_BLACK; } -static void Predictor1(uint32_t* src, const uint32_t* top) { +static uint32_t Predictor1(uint32_t left, const uint32_t* const top) { (void)top; - AddPixelsEq(src, src[-1]); // left + return left; } -static void Predictor2(uint32_t* src, const uint32_t* top) { - AddPixelsEq(src, top[0]); +static uint32_t Predictor2(uint32_t left, const uint32_t* const top) { + (void)left; + return top[0]; } -static void Predictor3(uint32_t* src, const uint32_t* top) { - AddPixelsEq(src, top[1]); +static uint32_t Predictor3(uint32_t left, const uint32_t* const top) { + (void)left; + return top[1]; } -static void Predictor4(uint32_t* src, const uint32_t* top) { - AddPixelsEq(src, top[-1]); +static uint32_t Predictor4(uint32_t left, const uint32_t* const top) { + (void)left; + return top[-1]; } -static void Predictor5(uint32_t* src, const uint32_t* top) { - const uint32_t pred = Average3(src[-1], top[0], top[1]); - AddPixelsEq(src, pred); +static uint32_t Predictor5(uint32_t left, const uint32_t* const top) { + const uint32_t pred = Average3(left, top[0], top[1]); + return pred; } -static void Predictor6(uint32_t* src, const uint32_t* top) { - const uint32_t pred = Average2(src[-1], top[-1]); - AddPixelsEq(src, pred); +static uint32_t Predictor6(uint32_t left, const uint32_t* const top) { + const uint32_t pred = Average2(left, top[-1]); + return pred; } -static void Predictor7(uint32_t* src, const uint32_t* top) { - const uint32_t pred = Average2(src[-1], top[0]); - AddPixelsEq(src, pred); +static uint32_t Predictor7(uint32_t left, const uint32_t* const top) { + const uint32_t pred = Average2(left, top[0]); + return pred; } -static void Predictor8(uint32_t* src, const uint32_t* top) { +static uint32_t Predictor8(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(top[-1], top[0]); - AddPixelsEq(src, pred); + (void)left; + return pred; } -static void Predictor9(uint32_t* src, const uint32_t* top) { +static uint32_t Predictor9(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(top[0], top[1]); - AddPixelsEq(src, pred); + (void)left; + return pred; } -static void Predictor10(uint32_t* src, const uint32_t* top) { - const uint32_t pred = Average4(src[-1], top[-1], top[0], top[1]); - AddPixelsEq(src, pred); +static uint32_t Predictor10(uint32_t left, const uint32_t* const top) { + const uint32_t pred = Average4(left, top[-1], top[0], top[1]); + return pred; } -static void Predictor11(uint32_t* src, const uint32_t* top) { - const uint32_t pred = Select(top[0], src[-1], top[-1]); - AddPixelsEq(src, pred); +static uint32_t 
Predictor11(uint32_t left, const uint32_t* const top) { + const uint32_t pred = Select(top[0], left, top[-1]); + return pred; } -static void Predictor12(uint32_t* src, const uint32_t* top) { - const uint32_t pred = ClampedAddSubtractFull(src[-1], top[0], top[-1]); - AddPixelsEq(src, pred); +static uint32_t Predictor12(uint32_t left, const uint32_t* const top) { + const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); + return pred; } -static void Predictor13(uint32_t* src, const uint32_t* top) { - const uint32_t pred = ClampedAddSubtractHalf(src[-1], top[0], top[-1]); - AddPixelsEq(src, pred); +static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { + const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); + return pred; } -typedef void (*PredictorFunc)(uint32_t* src, const uint32_t* top); +typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top); static const PredictorFunc kPredictors[16] = { Predictor0, Predictor1, Predictor2, Predictor3, Predictor4, Predictor5, Predictor6, Predictor7, @@ -164,15 +286,215 @@ static const PredictorFunc kPredictors[16] = { Predictor0, Predictor0 // <- padding security sentinels }; +#ifdef USE_LOSSLESS_ENCODER +// TODO(vikasa): Replace 256 etc with defines. +static double PredictionCostSpatial(const int* counts, + int weight_0, double exp_val) { + const int significant_symbols = 16; + const double exp_decay_factor = 0.6; + double bits = weight_0 * counts[0]; + int i; + for (i = 1; i < significant_symbols; ++i) { + bits += exp_val * (counts[i] + counts[256 - i]); + exp_val *= exp_decay_factor; + } + return -0.1 * bits; +} + +// Compute the Shanon's entropy: Sum(p*log2(p)) +static double ShannonEntropy(const int* const array, int n) { + int i; + double retval = 0; + int sum = 0; + for (i = 0; i < n; ++i) { + if (array[i] != 0) { + sum += array[i]; + retval += array[i] * VP8LFastLog(array[i]); + } + } + retval -= sum * VP8LFastLog(sum); + retval *= -1.4426950408889634; // 1.0 / -FastLog(2); + return retval; +} + +static double PredictionCostSpatialHistogram(int accumulated[4][256], + int tile[4][256]) { + int i; + int k; + int combo[256]; + double retval = 0; + for (i = 0; i < 4; ++i) { + const double exp_val = 0.94; + retval += PredictionCostSpatial(&tile[i][0], 1, exp_val); + retval += ShannonEntropy(&tile[i][0], 256); + for (k = 0; k < 256; ++k) { + combo[k] = accumulated[i][k] + tile[i][k]; + } + retval += ShannonEntropy(&combo[0], 256); + } + return retval; +} + +static int GetBestPredictorForTile(int width, int height, + int tile_x, int tile_y, int bits, + int accumulated[4][256], + const uint32_t* const argb_scratch) { + const int kNumPredModes = 14; + const int col_start = tile_x << bits; + const int row_start = tile_y << bits; + const int tile_size = 1 << bits; + const int ymax = (tile_size <= height - row_start) ? + tile_size : height - row_start; + const int xmax = (tile_size <= width - col_start) ? 
+ tile_size : width - col_start; + int histo[4][256]; + double best_diff = 1e99; + int best_mode = 0; + + int mode; + for (mode = 0; mode < kNumPredModes; ++mode) { + const uint32_t* current_row = argb_scratch; + const PredictorFunc pred_func = kPredictors[mode]; + double cur_diff; + int y; + memset(&histo[0][0], 0, sizeof(histo)); + for (y = 0; y < ymax; ++y) { + int x; + const int row = row_start + y; + const uint32_t* const upper_row = current_row; + current_row = upper_row + width; + for (x = 0; x < xmax; ++x) { + const int col = col_start + x; + uint32_t predict; + uint32_t predict_diff; + if (row == 0) { + predict = (col == 0) ? ARGB_BLACK : current_row[col - 1]; // Left. + } else if (col == 0) { + predict = upper_row[col]; // Top. + } else { + predict = pred_func(current_row[col - 1], upper_row + col); + } + predict_diff = VP8LSubPixels(current_row[col], predict); + ++histo[0][predict_diff >> 24]; + ++histo[1][((predict_diff >> 16) & 0xff)]; + ++histo[2][((predict_diff >> 8) & 0xff)]; + ++histo[3][(predict_diff & 0xff)]; + } + } + cur_diff = PredictionCostSpatialHistogram(accumulated, histo); + if (cur_diff < best_diff) { + best_diff = cur_diff; + best_mode = mode; + } + } + + return best_mode; +} + +static void CopyTileWithPrediction(int width, int height, + int tile_x, int tile_y, int bits, int mode, + const uint32_t* const argb_scratch, + uint32_t* const argb) { + const int col_start = tile_x << bits; + const int row_start = tile_y << bits; + const int tile_size = 1 << bits; + const int ymax = (tile_size <= height - row_start) ? + tile_size : height - row_start; + const int xmax = (tile_size <= width - col_start) ? + tile_size : width - col_start; + const PredictorFunc pred_func = kPredictors[mode]; + const uint32_t* current_row = argb_scratch; + + int y; + for (y = 0; y < ymax; ++y) { + int x; + const int row = row_start + y; + const uint32_t* const upper_row = current_row; + current_row = upper_row + width; + for (x = 0; x < xmax; ++x) { + const int col = col_start + x; + const int pix = row * width + col; + uint32_t predict; + if (row == 0) { + predict = (col == 0) ? ARGB_BLACK : current_row[col - 1]; // Left. + } else if (col == 0) { + predict = upper_row[col]; // Top. + } else { + predict = pred_func(current_row[col - 1], upper_row + col); + } + argb[pix] = VP8LSubPixels(current_row[col], predict); + } + } +} + +void VP8LResidualImage(int width, int height, int bits, + uint32_t* const argb, uint32_t* const argb_scratch, + uint32_t* const image) { + const int max_tile_size = 1 << bits; + const int tiles_per_row = VP8LSubSampleSize(width, bits); + const int tiles_per_col = VP8LSubSampleSize(height, bits); + uint32_t* const upper_row = argb_scratch; + uint32_t* const current_tile_rows = argb_scratch + width; + int tile_y; + int histo[4][256]; + memset(histo, 0, sizeof(histo)); + for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) { + const int tile_y_offset = tile_y * max_tile_size; + const int this_tile_height = + (tile_y < tiles_per_col - 1) ? 
max_tile_size : height - tile_y_offset; + int tile_x; + if (tile_y > 0) { + memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width, + width * sizeof(*upper_row)); + } + memcpy(current_tile_rows, &argb[tile_y_offset * width], + this_tile_height * width * sizeof(*current_tile_rows)); + for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) { + int pred; + int y; + const int tile_x_offset = tile_x * max_tile_size; + int all_x_max = tile_x_offset + max_tile_size; + if (all_x_max > width) { + all_x_max = width; + } + pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits, histo, + argb_scratch); + image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8); + CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred, + argb_scratch, argb); + for (y = 0; y < max_tile_size; ++y) { + int ix; + int all_x; + int all_y = tile_y_offset + y; + if (all_y >= height) { + break; + } + ix = all_y * width + tile_x_offset; + for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) { + const uint32_t a = argb[ix]; + ++histo[0][a >> 24]; + ++histo[1][((a >> 16) & 0xff)]; + ++histo[2][((a >> 8) & 0xff)]; + ++histo[3][(a & 0xff)]; + } + } + } + } +} + +#endif + // Inverse prediction. static void PredictorInverseTransform(const VP8LTransform* const transform, int y_start, int y_end, uint32_t* data) { const int width = transform->xsize_; if (y_start == 0) { // First Row follows the L (mode=1) mode. int x; - Predictor0(data, NULL); + const uint32_t pred = Predictor0(data[-1], NULL); + AddPixelsEq(data, pred); for (x = 1; x < width; ++x) { - Predictor1(data + x, NULL); + const uint32_t pred = Predictor1(data[x - 1], NULL); + AddPixelsEq(data + x, pred); } data += width; ++y_start; @@ -186,20 +508,24 @@ static void PredictorInverseTransform(const VP8LTransform* const transform, transform->data_ + (y >> transform->bits_) * tiles_per_row; while (y < y_end) { + int x; + uint32_t pred; const uint32_t* pred_mode_src = pred_mode_base; PredictorFunc pred_func; - int x; // First pixel follows the T (mode=2) mode. - Predictor2(data, data - width); + pred = Predictor2(data[-1], data - width); + AddPixelsEq(data, pred); // .. the rest: pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf]; for (x = 1; x < width; ++x) { + uint32_t pred; if ((x & mask) == 0) { // start of tile. Read predictor function. pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf]; } - pred_func(data + x, data + x - width); + pred = pred_func(data[x - 1], data + x - width); + AddPixelsEq(data + x, pred); } data += width; ++y; @@ -210,8 +536,21 @@ static void PredictorInverseTransform(const VP8LTransform* const transform, } } -// Add Green to Blue and Red channels (i.e. perform the inverse transform of -// 'Subtract Green'). +#ifdef USE_LOSSLESS_ENCODER +void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) { + int i; + for (i = 0; i < num_pixs; ++i) { + const uint32_t argb = argb_data[i]; + const uint32_t green = (argb >> 8) & 0xff; + const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff; + const uint32_t new_b = ((argb & 0xff) - green) & 0xff; + argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b; + } +} +#endif + +// Add green to blue and red channels (i.e. perform the inverse transform of +// 'subtract green'). 
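+// Example: subtract-green turns 0xff405060 into 0xfff05010 (red 0x40 - green
+// 0x50 = 0xf0 mod 256, blue 0x60 - 0x50 = 0x10); adding the green channel
+// back here restores the original pixel.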
static void AddGreenToBlueAndRed(const VP8LTransform* const transform, int y_start, int y_end, uint32_t* data) { const int width = transform->xsize_; @@ -228,13 +567,21 @@ static void AddGreenToBlueAndRed(const VP8LTransform* const transform, } typedef struct { - int green_to_red_; - int green_to_blue_; - int red_to_blue_; + // Note: the members are uint8_t, so that any negative values are + // automatically converted to "mod 256" values. + uint8_t green_to_red_; + uint8_t green_to_blue_; + uint8_t red_to_blue_; } Multipliers; +static WEBP_INLINE void MultipliersClear(Multipliers* m) { + m->green_to_red_ = 0; + m->green_to_blue_ = 0; + m->red_to_blue_ = 0; +} + static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred, - int8_t color) { + int8_t color) { return (uint32_t)((int)(color_pred) * color) >> 5; } @@ -245,21 +592,277 @@ static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code, m->red_to_blue_ = (color_code >> 16) & 0xff; } -static WEBP_INLINE void TransformColor(const Multipliers* const m, - uint32_t* const argb) { - const uint32_t green = *argb >> 8; - const uint32_t red = *argb >> 16; - uint32_t new_red = red; - uint32_t new_blue = *argb; - - new_red += ColorTransformDelta(m->green_to_red_, green); - new_red &= 0xff; - new_blue += ColorTransformDelta(m->green_to_blue_, green); - new_blue += ColorTransformDelta(m->red_to_blue_, new_red); - new_blue &= 0xff; - *argb = (*argb & 0xff00ff00u) | (new_red << 16) | (new_blue); +static WEBP_INLINE uint32_t MultipliersToColorCode(Multipliers* const m) { + return 0xff000000u | + ((uint32_t)(m->red_to_blue_) << 16) | + ((uint32_t)(m->green_to_blue_) << 8) | + m->green_to_red_; } +static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m, + uint32_t argb, int inverse) { + const uint32_t green = argb >> 8; + const uint32_t red = argb >> 16; + uint32_t new_red = red; + uint32_t new_blue = argb; + + if (inverse) { + new_red += ColorTransformDelta(m->green_to_red_, green); + new_red &= 0xff; + new_blue += ColorTransformDelta(m->green_to_blue_, green); + new_blue += ColorTransformDelta(m->red_to_blue_, new_red); + new_blue &= 0xff; + } else { + new_red -= ColorTransformDelta(m->green_to_red_, green); + new_red &= 0xff; + new_blue -= ColorTransformDelta(m->green_to_blue_, green); + new_blue -= ColorTransformDelta(m->red_to_blue_, red); + new_blue &= 0xff; + } + return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue); +} + +#ifdef USE_LOSSLESS_ENCODER +static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb, + int ix, int xsize) { + const uint32_t v = argb[ix]; + if (ix >= xsize + 3) { + if (v == argb[ix - xsize] && + argb[ix - 1] == argb[ix - xsize - 1] && + argb[ix - 2] == argb[ix - xsize - 2] && + argb[ix - 3] == argb[ix - xsize - 3]) { + return 1; + } + return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1]; + } else if (ix >= 3) { + return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1]; + } + return 0; +} + +static double PredictionCostCrossColor(const int accumulated[256], + const int counts[256]) { + // Favor low entropy, locally and globally. + int i; + int combo[256]; + for (i = 0; i < 256; ++i) { + combo[i] = accumulated[i] + counts[i]; + } + return ShannonEntropy(combo, 256) + + ShannonEntropy(counts, 256) + + PredictionCostSpatial(counts, 3, 2.4); // Favor small absolute values. 
+} + +static Multipliers GetBestColorTransformForTile( + int tile_x, int tile_y, int bits, + Multipliers prevX, + Multipliers prevY, + int step, int xsize, int ysize, + int* accumulated_red_histo, + int* accumulated_blue_histo, + const uint32_t* const argb) { + double best_diff = 1e99; + double cur_diff; + const int halfstep = step / 2; + const int max_tile_size = 1 << bits; + const int tile_y_offset = tile_y * max_tile_size; + const int tile_x_offset = tile_x * max_tile_size; + int green_to_red; + int green_to_blue; + int red_to_blue; + int all_x_max = tile_x_offset + max_tile_size; + int all_y_max = tile_y_offset + max_tile_size; + Multipliers best_tx; + MultipliersClear(&best_tx); + if (all_x_max > xsize) { + all_x_max = xsize; + } + if (all_y_max > ysize) { + all_y_max = ysize; + } + for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) { + int histo[256] = { 0 }; + int all_y; + Multipliers tx; + MultipliersClear(&tx); + tx.green_to_red_ = green_to_red & 0xff; + + for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) { + uint32_t predict; + int ix = all_y * xsize + tile_x_offset; + int all_x; + for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) { + if (SkipRepeatedPixels(argb, ix, xsize)) { + continue; + } + predict = TransformColor(&tx, argb[ix], 0); + ++histo[(predict >> 16) & 0xff]; // red. + } + } + cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]); + if (tx.green_to_red_ == prevX.green_to_red_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if (tx.green_to_red_ == prevY.green_to_red_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if (tx.green_to_red_ == 0) { + cur_diff -= 3; + } + if (cur_diff < best_diff) { + best_diff = cur_diff; + best_tx = tx; + } + } + best_diff = 1e99; + green_to_red = best_tx.green_to_red_; + for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) { + for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) { + int all_y; + int histo[256] = { 0 }; + Multipliers tx; + tx.green_to_red_ = green_to_red; + tx.green_to_blue_ = green_to_blue; + tx.red_to_blue_ = red_to_blue; + for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) { + uint32_t predict; + int all_x; + int ix = all_y * xsize + tile_x_offset; + for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) { + if (SkipRepeatedPixels(argb, ix, xsize)) { + continue; + } + predict = TransformColor(&tx, argb[ix], 0); + ++histo[predict & 0xff]; // blue. 
+ } + } + cur_diff = + PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]); + if (tx.green_to_blue_ == prevX.green_to_blue_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if (tx.green_to_blue_ == prevY.green_to_blue_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if (tx.red_to_blue_ == prevX.red_to_blue_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if (tx.red_to_blue_ == prevY.red_to_blue_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if (tx.green_to_blue_ == 0) { + cur_diff -= 3; + } + if (tx.red_to_blue_ == 0) { + cur_diff -= 3; + } + if (cur_diff < best_diff) { + best_diff = cur_diff; + best_tx = tx; + } + } + } + return best_tx; +} + +static void CopyTileWithColorTransform(int xsize, int ysize, + int tile_x, int tile_y, int bits, + Multipliers color_transform, + uint32_t* const argb) { + int y; + int xscan = 1 << bits; + int yscan = 1 << bits; + tile_x <<= bits; + tile_y <<= bits; + if (xscan > xsize - tile_x) { + xscan = xsize - tile_x; + } + if (yscan > ysize - tile_y) { + yscan = ysize - tile_y; + } + yscan += tile_y; + for (y = tile_y; y < yscan; ++y) { + int ix = y * xsize + tile_x; + const int end_ix = ix + xscan; + for (; ix < end_ix; ++ix) { + argb[ix] = TransformColor(&color_transform, argb[ix], 0); + } + } +} + +void VP8LColorSpaceTransform(int width, int height, int bits, int step, + uint32_t* const argb, uint32_t* image) { + const int max_tile_size = 1 << bits; + int tile_xsize = VP8LSubSampleSize(width, bits); + int tile_ysize = VP8LSubSampleSize(height, bits); + int accumulated_red_histo[256] = { 0 }; + int accumulated_blue_histo[256] = { 0 }; + int tile_y; + int tile_x; + Multipliers prevX; + Multipliers prevY; + MultipliersClear(&prevY); + MultipliersClear(&prevX); + for (tile_y = 0; tile_y < tile_ysize; ++tile_y) { + for (tile_x = 0; tile_x < tile_xsize; ++tile_x) { + Multipliers color_transform; + int all_x_max; + int y; + const int tile_y_offset = tile_y * max_tile_size; + const int tile_x_offset = tile_x * max_tile_size; + if (tile_y != 0) { + ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX); + ColorCodeToMultipliers(image[(tile_y - 1) * tile_xsize + tile_x], + &prevY); + } else if (tile_x != 0) { + ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX); + } + color_transform = + GetBestColorTransformForTile(tile_x, tile_y, bits, + prevX, prevY, + step, width, height, + &accumulated_red_histo[0], + &accumulated_blue_histo[0], + argb); + image[tile_y * tile_xsize + tile_x] = + MultipliersToColorCode(&color_transform); + CopyTileWithColorTransform(width, height, tile_x, tile_y, bits, + color_transform, argb); + + // Gather accumulated histogram data. 
+ all_x_max = tile_x_offset + max_tile_size; + if (all_x_max > width) { + all_x_max = width; + } + for (y = 0; y < max_tile_size; ++y) { + int ix; + int all_x; + int all_y = tile_y_offset + y; + if (all_y >= height) { + break; + } + ix = all_y * width + tile_x_offset; + for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) { + if (ix >= 2 && + argb[ix] == argb[ix - 2] && + argb[ix] == argb[ix - 1]) { + continue; // repeated pixels are handled by backward references + } + if (ix >= width + 2 && + argb[ix - 2] == argb[ix - width - 2] && + argb[ix - 1] == argb[ix - width - 1] && + argb[ix] == argb[ix - width]) { + continue; // repeated pixels are handled by backward references + } + ++accumulated_red_histo[(argb[ix] >> 16) & 0xff]; + ++accumulated_blue_histo[argb[ix] & 0xff]; + } + } + } + } +} +#endif + // Color space inverse transform. static void ColorSpaceInverseTransform(const VP8LTransform* const transform, int y_start, int y_end, uint32_t* data) { @@ -277,7 +880,7 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform, for (x = 0; x < width; ++x) { if ((x & mask) == 0) ColorCodeToMultipliers(*pred++, &m); - TransformColor(&m, data + x); + data[x] = TransformColor(&m, data[x], 1); } data += width; ++y; diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h index 10fdb96c..60a6a7fe 100644 --- a/src/dsp/lossless.h +++ b/src/dsp/lossless.h @@ -21,7 +21,7 @@ extern "C" { #endif //------------------------------------------------------------------------------ -// Inverse image transforms. +// Image transforms. struct VP8LTransform; // Defined in dec/vp8li.h. @@ -33,13 +33,25 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform, int row_start, int row_end, uint32_t* const data_in, uint32_t* const data_out); +#ifdef USE_LOSSLESS_ENCODER +// Subtracts green from blue and red channels. +void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs); + +void VP8LResidualImage(int width, int height, int bits, + uint32_t* const argb, uint32_t* const argb_scratch, + uint32_t* const image); + +void VP8LColorSpaceTransform(int width, int height, int bits, int step, + uint32_t* const argb, uint32_t* image); +#endif + //------------------------------------------------------------------------------ // Color space conversion. // Converts from BGRA to other color spaces. void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels, - WEBP_CSP_MODE out_colorspace, - uint8_t* const rgba); + WEBP_CSP_MODE out_colorspace, + uint8_t* const rgba); //------------------------------------------------------------------------------ // Misc methods. @@ -50,6 +62,20 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size, return (size + (1 << sampling_bits) - 1) >> sampling_bits; } +#ifdef USE_LOSSLESS_ENCODER +// Faster logarithm for small integers, with the property of log(0) == 0. +float VP8LFastLog(int v); + +// In-place difference of each component with mod 256. 
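+// Two channels are subtracted per 32-bit operation: the 0xff00ff00u /
+// 0x00ff00ffu constants pre-load 0xff into the bytes left empty by the
+// masking, so an underflow borrow from one channel is absorbed before it
+// can reach the channel above it; each result byte is (a - b) mod 256.
+// E.g. VP8LSubPixels(0x10203040u, 0x01050008u) == 0x0f1b3038u.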
+static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) { + const uint32_t alpha_and_green = + 0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u); + const uint32_t red_and_blue = + 0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu); + return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu); +} +#endif + //------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) diff --git a/src/enc/backward_references.c b/src/enc/backward_references.c new file mode 100644 index 00000000..2d89b8e7 --- /dev/null +++ b/src/enc/backward_references.c @@ -0,0 +1,748 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Author: Jyrki Alakuijala (jyrki@google.com) +// + +#ifdef USE_LOSSLESS_ENCODER + +#include +#include +#include + +#include "./backward_references.h" +#include "./histogram.h" +#include "../utils/color_cache.h" + +#define VALUES_IN_BYTE 256 + +static const uint8_t plane_to_code_lut[128] = { + 96, 73, 55, 39, 23, 13, 5, 1, 255, 255, 255, 255, 255, 255, 255, 255, + 101, 78, 58, 42, 26, 16, 8, 2, 0, 3, 9, 17, 27, 43, 59, 79, + 102, 86, 62, 46, 32, 20, 10, 6, 4, 7, 11, 21, 33, 47, 63, 87, + 105, 90, 70, 52, 37, 28, 18, 14, 12, 15, 19, 29, 38, 53, 71, 91, + 110, 99, 82, 66, 48, 35, 30, 24, 22, 25, 31, 36, 49, 67, 83, 100, + 115, 108, 94, 76, 64, 50, 44, 40, 34, 41, 45, 51, 65, 77, 95, 109, + 118, 113, 103, 92, 80, 68, 60, 56, 54, 57, 61, 69, 81, 93, 104, 114, + 119, 116, 111, 106, 97, 88, 84, 74, 72, 75, 85, 89, 98, 107, 112, 117, +}; + +static const int kMinLength = 2; + +int VP8LDistanceToPlaneCode(int xsize, int dist) { + const int yoffset = dist / xsize; + const int xoffset = dist - yoffset * xsize; + if (xoffset <= 8 && yoffset < 8) { + return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1; + } else if (xoffset > xsize - 8 && yoffset < 7) { + return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1; + } + return dist + 120; +} + +static WEBP_INLINE int FindMatchLength(const uint32_t* const array1, + const uint32_t* const array2, + const int max_limit) { + int match_len = 0; + while (match_len < max_limit && array1[match_len] == array2[match_len]) { + ++match_len; + } + return match_len; +} + +#define HASH_BITS 18 +#define HASH_SIZE (1 << HASH_BITS) +static const uint64_t kHashMultiplier = 0xc6a4a7935bd1e995ULL; +static const int kWindowSize = (1 << 20) - 120; // A window with 1M pixels + // (4 megabytes) - 120 + // special codes for short + // distances. + +static WEBP_INLINE uint64_t GetHash64(uint64_t num) { + num *= kHashMultiplier; + num >>= 64 - HASH_BITS; + return num; +} + +static WEBP_INLINE uint64_t GetPixPair(const uint32_t* const argb) { + return ((uint64_t)(argb[1]) << 32) | argb[0]; +} + +typedef struct { + // Stores the most recently added position with the given hash value. + int32_t hash_to_first_index_[HASH_SIZE]; + // chain_[pos] stores the previous position with the same hash value + // for every pixel in the image. 
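+  // Both tables hold -1 where no position has been recorded yet, which is
+  // what terminates the walk in VP8LHashChainFindCopy.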
+ int32_t* chain_; +} VP8LHashChain; + +static int VP8LHashChainInit(VP8LHashChain* const p, int size) { + int i; + p->chain_ = (int*)malloc(size * sizeof(*p->chain_)); + if (p->chain_ == NULL) { + return 0; + } + for (i = 0; i < size; ++i) { + p->chain_[i] = -1; + } + for (i = 0; i < HASH_SIZE; ++i) { + p->hash_to_first_index_[i] = -1; + } + return 1; +} + +static void VP8LHashChainClear(VP8LHashChain* const p) { + if (p != NULL) { + free(p->chain_); + } +} + +static void VP8LHashChainInsert(VP8LHashChain* const p, + const uint32_t* const argb, int32_t pos) { + // Insertion of two pixels at a time. + const uint64_t key = GetPixPair(argb); + const uint64_t hash_code = GetHash64(key); + p->chain_[pos] = p->hash_to_first_index_[hash_code]; + p->hash_to_first_index_[hash_code] = pos; +} + +static int VP8LHashChainFindCopy( + const VP8LHashChain* const p, int quality, int index, int xsize, + const uint32_t* const argb, int maxlen, int* const distance_ptr, + int* const length_ptr) { + const uint64_t next_two_pixels = GetPixPair(&argb[index]); + const uint64_t hash_code = GetHash64(next_two_pixels); + int prev_length = 0; + int64_t best_val = 0; + int give_up = 10 + (quality >> 1); + const int min_pos = (index > kWindowSize) ? index - kWindowSize : 0; + int32_t pos; + int64_t val; + int best_length = 0; + int best_distance = 0; + for (pos = p->hash_to_first_index_[hash_code]; + pos >= min_pos; + pos = p->chain_[pos]) { + int curr_length; + if (give_up < 0) { + if (give_up < -quality * 2 || best_val >= 0xff0000) { + break; + } + } + --give_up; + if (best_length != 0 && + argb[pos + best_length - 1] != argb[index + best_length - 1]) { + continue; + } + curr_length = FindMatchLength(argb + pos, argb + index, maxlen); + if (curr_length < prev_length) { + continue; + } + val = 65536 * curr_length; + // Favoring 2d locality here gives savings for certain images. + if (index - pos < 9 * xsize) { + const int y = (index - pos) / xsize; + int x = (index - pos) % xsize; + if (x > xsize / 2) { + x = xsize - x; + } + if (x <= 7 && x >= -8) { + val -= y * y + x * x; + } else { + val -= 9 * 9 + 9 * 9; + } + } else { + val -= 9 * 9 + 9 * 9; + } + if (best_val < val) { + prev_length = curr_length; + best_val = val; + best_length = curr_length; + best_distance = index - pos; + if (curr_length >= kMaxLength) { + break; + } + if ((best_distance == 1 || best_distance == xsize) && + best_length >= 128) { + break; + } + } + } + *distance_ptr = best_distance; + *length_ptr = best_length; + return best_length >= kMinLength; +} + +static WEBP_INLINE void PushBackCopy(VP8LBackwardRefs* const refs, int length) { + while (length >= kMaxLength) { + refs->refs[refs->size++] = PixOrCopyCreateCopy(1, kMaxLength); + length -= kMaxLength; + } + if (length > 0) { + refs->refs[refs->size++] = PixOrCopyCreateCopy(1, length); + } +} + +void VP8LBackwardReferencesRle( + int xsize, int ysize, const uint32_t* const argb, + VP8LBackwardRefs* const refs) { + const int pix_count = xsize * ysize; + int match_len = 0; + int i; + refs->size = 0; + for (i = 0; i < pix_count; ++i) { + if (i >= 1 && argb[i] == argb[i - 1]) { + ++match_len; + } else { + PushBackCopy(refs, match_len); + match_len = 0; + refs->refs[refs->size++] = PixOrCopyCreateLiteral(argb[i]); + } + } + PushBackCopy(refs, match_len); +} + +// Returns 1 when successful. 
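+// Greedy LZ77 pass with a one-pixel lookahead: the longest match found at
+// 'i' is compared against the match starting at 'i + 1'; when the latter is
+// more than one pixel longer, pixel 'i' is emitted as a literal (or a color
+// cache index) and the copy is taken from 'i + 1' instead.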
+int VP8LBackwardReferencesHashChain( + int xsize, int ysize, int use_color_cache, const uint32_t* const argb, + int cache_bits, int quality, VP8LBackwardRefs* const refs) { + int i; + int ok = 0; + const int pix_count = xsize * ysize; + VP8LHashChain* hash_chain = (VP8LHashChain*)malloc(sizeof(*hash_chain)); + VP8LColorCache hashers; + if (hash_chain == NULL || + !VP8LColorCacheInit(&hashers, cache_bits) || + !VP8LHashChainInit(hash_chain, pix_count)) { + goto Error; + } + refs->size = 0; + for (i = 0; i < pix_count; ) { + // Alternative#1: Code the pixels starting at 'i' using backward reference. + int offset = 0; + int len = 0; + if (i < pix_count - 1) { // FindCopy(i,..) reads pixels at [i] and [i + 1]. + int maxlen = pix_count - i; + if (maxlen > kMaxLength) { + maxlen = kMaxLength; + } + VP8LHashChainFindCopy(hash_chain, quality, i, xsize, argb, maxlen, + &offset, &len); + } + if (len >= kMinLength) { + // Alternative#2: Insert the pixel at 'i' as literal, and code the + // pixels starting at 'i + 1' using backward reference. + int offset2 = 0; + int len2 = 0; + int k; + VP8LHashChainInsert(hash_chain, &argb[i], i); + if (i < pix_count - 2) { // FindCopy(i+1,..) reads [i + 1] and [i + 2]. + int maxlen = pix_count - (i + 1); + if (maxlen > kMaxLength) { + maxlen = kMaxLength; + } + VP8LHashChainFindCopy(hash_chain, quality, + i + 1, xsize, argb, maxlen, &offset2, &len2); + if (len2 > len + 1) { + // Alternative#2 is a better match. So push pixel at 'i' as literal. + if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) { + const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]); + refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix); + } else { + refs->refs[refs->size] = PixOrCopyCreateLiteral(argb[i]); + } + ++refs->size; + VP8LColorCacheInsert(&hashers, argb[i]); + i++; // Backward reference to be done for next pixel. + len = len2; + offset = offset2; + } + } + if (len >= kMaxLength) { + len = kMaxLength - 1; + } + refs->refs[refs->size++] = PixOrCopyCreateCopy(offset, len); + for (k = 0; k < len; ++k) { + VP8LColorCacheInsert(&hashers, argb[i + k]); + if (k != 0 && i + k + 1 < pix_count) { + // Add to the hash_chain (but cannot add the last pixel). 
+ VP8LHashChainInsert(hash_chain, &argb[i + k], i + k); + } + } + i += len; + } else { + if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) { + // push pixel as a PixOrCopyCreateCacheIdx pixel + int ix = VP8LColorCacheGetIndex(&hashers, argb[i]); + refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix); + } else { + refs->refs[refs->size] = PixOrCopyCreateLiteral(argb[i]); + } + ++refs->size; + VP8LColorCacheInsert(&hashers, argb[i]); + if (i + 1 < pix_count) { + VP8LHashChainInsert(hash_chain, &argb[i], i); + } + ++i; + } + } + ok = 1; +Error: + VP8LHashChainClear(hash_chain); + free(hash_chain); + VP8LColorCacheClear(&hashers); + return ok; +} + +// ----------------------------------------------------------------------------- + +typedef struct { + double alpha_[VALUES_IN_BYTE]; + double red_[VALUES_IN_BYTE]; + double literal_[PIX_OR_COPY_CODES_MAX]; + double blue_[VALUES_IN_BYTE]; + double distance_[DISTANCE_CODES_MAX]; + int cache_bits_; +} CostModel; + +static int CostModelBuild(CostModel* const p, int xsize, int ysize, + int recursion_level, int use_color_cache, + const uint32_t* const argb, int cache_bits) { + int ok = 0; + VP8LHistogram histo; + VP8LBackwardRefs refs; + + if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize)) goto Error; + + p->cache_bits_ = cache_bits; + if (recursion_level > 0) { + if (!VP8LBackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1, + use_color_cache, argb, cache_bits, + &refs)) { + goto Error; + } + } else { + const int quality = 100; + if (!VP8LBackwardReferencesHashChain(xsize, ysize, use_color_cache, argb, + cache_bits, quality, &refs)) { + goto Error; + } + } + VP8LHistogramCreate(&histo, &refs, cache_bits); + VP8LConvertPopulationCountTableToBitEstimates( + VP8LHistogramNumCodes(&histo), + &histo.literal_[0], &p->literal_[0]); + VP8LConvertPopulationCountTableToBitEstimates( + VALUES_IN_BYTE, &histo.red_[0], &p->red_[0]); + VP8LConvertPopulationCountTableToBitEstimates( + VALUES_IN_BYTE, &histo.blue_[0], &p->blue_[0]); + VP8LConvertPopulationCountTableToBitEstimates( + VALUES_IN_BYTE, &histo.alpha_[0], &p->alpha_[0]); + VP8LConvertPopulationCountTableToBitEstimates( + DISTANCE_CODES_MAX, &histo.distance_[0], &p->distance_[0]); + ok = 1; + + Error: + VP8LClearBackwardRefs(&refs); + return ok; +} + +static WEBP_INLINE double GetLiteralCost(const CostModel* const p, uint32_t v) { + return p->alpha_[v >> 24] + + p->red_[(v >> 16) & 0xff] + + p->literal_[(v >> 8) & 0xff] + + p->blue_[v & 0xff]; +} + +static WEBP_INLINE double GetCacheCost(const CostModel* const p, uint32_t idx) { + const int literal_idx = VALUES_IN_BYTE + kLengthCodes + idx; + return p->literal_[literal_idx]; +} + +static WEBP_INLINE double GetLengthCost(const CostModel* const p, + uint32_t length) { + int code, extra_bits_count, extra_bits_value; + PrefixEncode(length, &code, &extra_bits_count, &extra_bits_value); + return p->literal_[VALUES_IN_BYTE + code] + extra_bits_count; +} + +static WEBP_INLINE double GetDistanceCost(const CostModel* const p, + uint32_t distance) { + int code, extra_bits_count, extra_bits_value; + PrefixEncode(distance, &code, &extra_bits_count, &extra_bits_value); + return p->distance_[code] + extra_bits_count; +} + +static int BackwardReferencesHashChainDistanceOnly( + int xsize, int ysize, int recursive_cost_model, int use_color_cache, + const uint32_t* const argb, int cache_bits, uint32_t* const dist_array) { + const int quality = 100; + const int pix_count = xsize * ysize; + double* cost = (double*)malloc(pix_count * sizeof(*cost)); + 
int i; + CostModel* cost_model = (CostModel*)malloc(sizeof(*cost_model)); + + VP8LColorCache hashers; + VP8LHashChain* hash_chain = (VP8LHashChain*)malloc(sizeof(*hash_chain)); + int ok = 0; + if (cost == NULL || + cost_model == NULL || + hash_chain == NULL || + !VP8LColorCacheInit(&hashers, cache_bits)) { + goto Error; + } + VP8LHashChainInit(hash_chain, pix_count); + CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, + use_color_cache, argb, cache_bits); + for (i = 0; i < pix_count; ++i) { + cost[i] = 1e100; + } + // We loop one pixel at a time, but store all currently best points to + // non-processed locations from this point. + dist_array[0] = 0; + for (i = 0; i < pix_count; ++i) { + double prev_cost = 0.0; + int shortmax; + if (i > 0) { + prev_cost = cost[i - 1]; + } + for (shortmax = 0; shortmax < 2; ++shortmax) { + int offset = 0; + int len = 0; + if (i < pix_count - 1) { // FindCopy reads pixels at [i] and [i + 1]. + int maxlen = shortmax ? 2 : kMaxLength; + if (maxlen > pix_count - i) { + maxlen = pix_count - i; + } + VP8LHashChainFindCopy(hash_chain, quality, i, xsize, argb, maxlen, + &offset, &len); + } + if (len >= kMinLength) { + const int code = VP8LDistanceToPlaneCode(xsize, offset); + const double distance_cost = + prev_cost + GetDistanceCost(cost_model, code); + int k; + for (k = 1; k < len; ++k) { + const double cost_val = + distance_cost + GetLengthCost(cost_model, k); + if (cost[i + k] > cost_val) { + cost[i + k] = cost_val; + dist_array[i + k] = k + 1; + } + } + // This if is for speedup only. It roughly doubles the speed, and + // makes compression worse by .1 %. + if (len >= 128 && code < 2) { + // Long copy for short distances, let's skip the middle + // lookups for better copies. + // 1) insert the hashes. + for (k = 0; k < len; ++k) { + VP8LColorCacheInsert(&hashers, argb[i + k]); + if (i + k + 1 < pix_count) { + // Add to the hash_chain (but cannot add the last pixel). + VP8LHashChainInsert(hash_chain, &argb[i + k], i + k); + } + } + // 2) jump. + i += len - 1; // for loop does ++i, thus -1 here. + goto next_symbol; + } + } + } + if (i < pix_count - 1) { + VP8LHashChainInsert(hash_chain, &argb[i], i); + } + { + // inserting a literal pixel + double cost_val = prev_cost; + double mul0 = 1.0; + double mul1 = 1.0; + if (recursive_cost_model == 0) { + mul0 = 0.68; + mul1 = 0.82; + } + if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) { + int ix = VP8LColorCacheGetIndex(&hashers, argb[i]); + cost_val += GetCacheCost(cost_model, ix) * mul0; + } else { + cost_val += GetLiteralCost(cost_model, argb[i]) * mul1; + } + if (cost[i] > cost_val) { + cost[i] = cost_val; + dist_array[i] = 1; // only one is inserted. + } + VP8LColorCacheInsert(&hashers, argb[i]); + } + next_symbol: ; + } + // Last pixel still to do, it can only be a single step if not reached + // through cheaper means already. + ok = 1; +Error: + if (hash_chain) VP8LHashChainClear(hash_chain); + free(hash_chain); + free(cost_model); + free(cost); + VP8LColorCacheClear(&hashers); + return ok; +} + +static void TraceBackwards( + const uint32_t* const dist_array, int dist_array_size, + uint32_t** const chosen_path, int* const chosen_path_size) { + int i; + // Count how many. + int count = 0; + for (i = dist_array_size - 1; i >= 0; ) { + int k = dist_array[i]; + assert(k >= 1); + ++count; + i -= k; + } + // Allocate. + *chosen_path_size = count; + *chosen_path = (uint32_t*)malloc(count * sizeof(*chosen_path)); + // Write in reverse order. 
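+  // dist_array[i] is the length, in pixels, of the best symbol ending at
+  // pixel 'i', so stepping i -= k from the last pixel visits the chosen
+  // symbols back to front; chosen_path is filled from its end to put them
+  // back in front-to-back order.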
+ for (i = dist_array_size - 1; i >= 0; ) { + int k = dist_array[i]; + assert(k >= 1); + (*chosen_path)[--count] = k; + i -= k; + } +} + +static int BackwardReferencesHashChainFollowChosenPath( + int xsize, int ysize, int use_color_cache, const uint32_t* const argb, + int cache_bits, const uint32_t* const chosen_path, int chosen_path_size, + VP8LBackwardRefs* const refs) { + const int quality = 100; + const int pix_count = xsize * ysize; + int size = 0; + int i = 0; + int k; + int ix; + int ok = 0; + VP8LColorCache hashers; + VP8LHashChain* hash_chain = (VP8LHashChain*)malloc(sizeof(*hash_chain)); + VP8LHashChainInit(hash_chain, pix_count); + if (hash_chain == NULL || + !VP8LColorCacheInit(&hashers, cache_bits)) { + goto Error; + } + refs->size = 0; + for (ix = 0; ix < chosen_path_size; ++ix, ++size) { + int offset = 0; + int len = 0; + int maxlen = chosen_path[ix]; + if (maxlen != 1) { + VP8LHashChainFindCopy(hash_chain, quality, + i, xsize, argb, maxlen, &offset, &len); + assert(len == maxlen); + refs->refs[size] = PixOrCopyCreateCopy(offset, len); + for (k = 0; k < len; ++k) { + VP8LColorCacheInsert(&hashers, argb[i + k]); + if (i + k + 1 < pix_count) { + // Add to the hash_chain (but cannot add the last pixel). + VP8LHashChainInsert(hash_chain, &argb[i + k], i + k); + } + } + i += len; + } else { + if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) { + // push pixel as a color cache index + int ix = VP8LColorCacheGetIndex(&hashers, argb[i]); + refs->refs[size] = PixOrCopyCreateCacheIdx(ix); + } else { + refs->refs[size] = PixOrCopyCreateLiteral(argb[i]); + } + VP8LColorCacheInsert(&hashers, argb[i]); + if (i + 1 < pix_count) { + VP8LHashChainInsert(hash_chain, &argb[i], i); + } + ++i; + } + } + assert(size < refs->max_size); + refs->size = size; + ok = 1; +Error: + VP8LHashChainClear(hash_chain); + if (hash_chain) { + free(hash_chain); + } + VP8LColorCacheClear(&hashers); + return ok; +} + +// Returns 1 on success. 
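+// Two-pass reference selection: BackwardReferencesHashChainDistanceOnly
+// first computes, per pixel, the cheapest symbol ending there under the cost
+// model; TraceBackwards extracts the chosen symbol lengths; and
+// BackwardReferencesHashChainFollowChosenPath re-runs the hash chain along
+// that path to recover the actual copy offsets.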
+int VP8LBackwardReferencesTraceBackwards( + int xsize, int ysize, int recursive_cost_model, int use_color_cache, + const uint32_t* const argb, int cache_bits, VP8LBackwardRefs* const refs) { + int ok = 0; + const int dist_array_size = xsize * ysize; + uint32_t* chosen_path = NULL; + int chosen_path_size = 0; + uint32_t* const dist_array = + (uint32_t*)malloc(dist_array_size * sizeof(*dist_array)); + if (dist_array == NULL) { + goto Error; + } + if (!BackwardReferencesHashChainDistanceOnly( + xsize, ysize, recursive_cost_model, use_color_cache, argb, cache_bits, + dist_array)) { + free(dist_array); + goto Error; + } + TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size); + free(dist_array); + if (!BackwardReferencesHashChainFollowChosenPath( + xsize, ysize, use_color_cache, argb, cache_bits, + chosen_path, chosen_path_size, refs)) { + goto Error; + } + ok = 1; + Error: + free(chosen_path); + return ok; +} + +void VP8LBackwardReferences2DLocality(int xsize, VP8LBackwardRefs* const refs) { + int i; + for (i = 0; i < refs->size; ++i) { + if (PixOrCopyIsCopy(&refs->refs[i])) { + const int dist = refs->refs[i].argb_or_distance; + const int transformed_dist = VP8LDistanceToPlaneCode(xsize, dist); + refs->refs[i].argb_or_distance = transformed_dist; + } + } +} + +int VP8LVerifyBackwardReferences( + const uint32_t* const argb, int xsize, int ysize, int cache_bits, + const VP8LBackwardRefs* const refs) { + int num_pixels = 0; + int i; + VP8LColorCache hashers; + VP8LColorCacheInit(&hashers, cache_bits); + for (i = 0; i < refs->size; ++i) { + const PixOrCopy token = refs->refs[i]; + if (PixOrCopyIsLiteral(&token)) { + if (argb[num_pixels] != PixOrCopyArgb(&token)) { + VP8LColorCacheClear(&hashers); + return 0; + } + VP8LColorCacheInsert(&hashers, argb[num_pixels]); + ++num_pixels; + } else if (PixOrCopyIsCacheIdx(&token)) { + const uint32_t cache_entry = + VP8LColorCacheLookup(&hashers, PixOrCopyCacheIdx(&token)); + if (argb[num_pixels] != cache_entry) { + VP8LColorCacheClear(&hashers); + return 0; + } + VP8LColorCacheInsert(&hashers, argb[num_pixels]); + ++num_pixels; + } else if (PixOrCopyIsCopy(&token)) { + int k; + if (PixOrCopyDistance(&token) == 0) { + VP8LColorCacheClear(&hashers); + return 0; + } + for (k = 0; k < token.len; ++k) { + if (argb[num_pixels] != argb[num_pixels - PixOrCopyDistance(&token)]) { + VP8LColorCacheClear(&hashers); + return 0; + } + VP8LColorCacheInsert(&hashers, argb[num_pixels]); + ++num_pixels; + } + } + } + { + const int pix_count = xsize * ysize; + if (num_pixels != pix_count) { + VP8LColorCacheClear(&hashers); + return 0; + } + } + VP8LColorCacheClear(&hashers); + return 1; +} + +// Returns 1 on success. 
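// In outline: ComputeCacheHistogram() below replays an existing set of
// backward references against a color cache of the requested size, so that
// VP8LCalculateEstimateForCacheSize() can compare the estimated entropy of
// the token stream for each candidate cache_bits value (plus the small
// kSmallPenaltyForLargeCache charge per cache bit) without recomputing the
// references themselves.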
+static int ComputeCacheHistogram( + const uint32_t* const argb, int xsize, int ysize, + const VP8LBackwardRefs* const refs, int cache_bits, + VP8LHistogram* const histo) { + int pixel_index = 0; + int i; + uint32_t k; + VP8LColorCache hashers; + if (!VP8LColorCacheInit(&hashers, cache_bits)) { + return 0; + } + for (i = 0; i < refs->size; ++i) { + const PixOrCopy* const v = &refs->refs[i]; + if (PixOrCopyIsLiteral(v)) { + if (cache_bits != 0 && + VP8LColorCacheContains(&hashers, argb[pixel_index])) { + // push pixel as a cache index + const int ix = VP8LColorCacheGetIndex(&hashers, argb[pixel_index]); + const PixOrCopy token = PixOrCopyCreateCacheIdx(ix); + VP8LHistogramAddSinglePixOrCopy(histo, &token); + } else { + VP8LHistogramAddSinglePixOrCopy(histo, v); + } + } else { + VP8LHistogramAddSinglePixOrCopy(histo, v); + } + for (k = 0; k < PixOrCopyLength(v); ++k) { + VP8LColorCacheInsert(&hashers, argb[pixel_index]); + ++pixel_index; + } + } + assert(pixel_index == xsize * ysize); + (void)xsize; // xsize is not used in non-debug compilations otherwise. + (void)ysize; // ysize is not used in non-debug compilations otherwise. + VP8LColorCacheClear(&hashers); + return 1; +} + +// Returns how many bits are to be used for a color cache. +int VP8LCalculateEstimateForCacheSize( + const uint32_t* const argb, int xsize, int ysize, + int* const best_cache_bits) { + int ok = 0; + int cache_bits; + double lowest_entropy = 1e99; + VP8LBackwardRefs refs; + static const double kSmallPenaltyForLargeCache = 4.0; + static const int quality = 30; + if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize) || + !VP8LBackwardReferencesHashChain(xsize, ysize, 0, argb, 0, quality, + &refs)) { + goto Error; + } + for (cache_bits = 0; cache_bits <= kColorCacheBitsMax; ++cache_bits) { + double cur_entropy; + VP8LHistogram histo; + VP8LHistogramInit(&histo, cache_bits); + ComputeCacheHistogram(argb, xsize, ysize, &refs, cache_bits, &histo); + cur_entropy = VP8LHistogramEstimateBits(&histo) + + kSmallPenaltyForLargeCache * cache_bits; + if (cache_bits == 0 || cur_entropy < lowest_entropy) { + *best_cache_bits = cache_bits; + lowest_entropy = cur_entropy; + } + } + ok = 1; + Error: + VP8LClearBackwardRefs(&refs); + return ok; +} + +#endif diff --git a/src/enc/backward_references.h b/src/enc/backward_references.h new file mode 100644 index 00000000..8d647010 --- /dev/null +++ b/src/enc/backward_references.h @@ -0,0 +1,253 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Author: Jyrki Alakuijala (jyrki@google.com) +// + +#ifndef WEBP_ENC_BACKWARD_REFERENCES_H_ +#define WEBP_ENC_BACKWARD_REFERENCES_H_ + +#ifdef USE_LOSSLESS_ENCODER + +#include +#include +#include +#include "../webp/types.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +// Backward reference distance prefix codes +#define DISTANCE_CODES_MAX 40 + +// Compression constants +#define CODE_LENGTH_CODES 19 +static const int kLengthCodes = 24; +// The spec allows 11, we use 9 bits to reduce memory consumption in encoding. +// Having 9 instead of 11 removes about 0.25 % of compression density. 
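// Back-of-the-envelope sketch (illustrative only, not part of the patch) of
// the memory trade-off mentioned in the comment above: the green/literal
// alphabet is 256 literals + 24 length codes + (1 << cache_bits) cache codes,
// so every extra cache bit doubles the cache portion of each histogram's
// literal_[] array (see PIX_OR_COPY_CODES_MAX below).
#include <stdio.h>

int main(void) {
  int bits;
  for (bits = 9; bits <= 11; ++bits) {
    const int literal_symbols = 256 + 24 + (1 << bits);
    printf("cache_bits=%d -> %d green/literal symbols per histogram\n",
           bits, literal_symbols);
  }
  return 0;
}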
+static const int kColorCacheBitsMax = 9; +#define PIX_OR_COPY_CODES_MAX (256 + 24 + (1 << 9)) +static const int kMaxLength = 4096; + +// use GNU builtins where available. +#if defined(__GNUC__) && \ + ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static WEBP_INLINE int BitsLog2Floor(uint32_t n) { + return n == 0 ? -1 : 31 ^ __builtin_clz(n); +} +#else +static WEBP_INLINE int BitsLog2Floor(uint32_t n) { + int log; + uint32_t value; + int i; + if (n == 0) + return -1; + log = 0; + value = n; + for (i = 4; i >= 0; --i) { + int shift = (1 << i); + uint32_t x = value >> shift; + if (x != 0) { + value = x; + log += shift; + } + } + return log; +} +#endif + +static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) { + int floor = BitsLog2Floor(n); + if (n == (n & ~(n - 1))) // zero or a power of two. + return floor; + else + return floor + 1; +} + +// Splitting of distance and length codes into prefixes and +// extra bits. The prefixes are encoded with an entropy code +// while the extra bits are stored just as normal bits. +static WEBP_INLINE void PrefixEncode( + int distance, + int *code, + int *extra_bits_count, + int *extra_bits_value) { + // Collect the two most significant bits where the highest bit is 1. + const int highest_bit = BitsLog2Floor(--distance); + // & 0x3f is to make behavior well defined when highest_bit + // does not exist or is the least significant bit. + const int second_highest_bit = + (distance >> ((highest_bit - 1) & 0x3f)) & 1; + *extra_bits_count = (highest_bit > 0) ? highest_bit - 1 : 0; + *extra_bits_value = distance & ((1 << *extra_bits_count) - 1); + *code = (highest_bit > 0) ? 2 * highest_bit + second_highest_bit : + (highest_bit == 0) ? 1 : 0; +} + +enum Mode { + kLiteral, + kCacheIdx, + kCopy, + kNone, +}; + +typedef struct { + // mode as uint8_t to make the memory layout to be exactly 8 bytes. 
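  // (With natural alignment the three fields below sit at offsets 0, 2 and 4,
  // which is what gives the 8-byte layout mentioned above.)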
+ uint8_t mode; + uint16_t len; + uint32_t argb_or_distance; +} PixOrCopy; + + +static WEBP_INLINE PixOrCopy PixOrCopyCreateCopy(uint32_t distance, + uint16_t len) { + PixOrCopy retval; + retval.mode = kCopy; + retval.argb_or_distance = distance; + retval.len = len; + return retval; +} + +static WEBP_INLINE PixOrCopy PixOrCopyCreateCacheIdx(int idx) { + PixOrCopy retval; + assert(idx >= 0); + assert(idx < (1 << kColorCacheBitsMax)); + retval.mode = kCacheIdx; + retval.argb_or_distance = idx; + retval.len = 1; + return retval; +} + +static WEBP_INLINE PixOrCopy PixOrCopyCreateLiteral(uint32_t argb) { + PixOrCopy retval; + retval.mode = kLiteral; + retval.argb_or_distance = argb; + retval.len = 1; + return retval; +} + +static WEBP_INLINE int PixOrCopyIsLiteral(const PixOrCopy* const p) { + return (p->mode == kLiteral); +} + +static WEBP_INLINE int PixOrCopyIsCacheIdx(const PixOrCopy* const p) { + return (p->mode == kCacheIdx); +} + +static WEBP_INLINE int PixOrCopyIsCopy(const PixOrCopy* const p) { + return (p->mode == kCopy); +} + +static WEBP_INLINE uint32_t PixOrCopyLiteral(const PixOrCopy* const p, + int component) { + assert(p->mode == kLiteral); + return (p->argb_or_distance >> (component * 8)) & 0xff; +} + +static WEBP_INLINE uint32_t PixOrCopyLength(const PixOrCopy* const p) { + return p->len; +} + +static WEBP_INLINE uint32_t PixOrCopyArgb(const PixOrCopy* const p) { + assert(p->mode == kLiteral); + return p->argb_or_distance; +} + +static WEBP_INLINE uint32_t PixOrCopyCacheIdx(const PixOrCopy* const p) { + assert(p->mode == kCacheIdx); + assert(p->argb_or_distance < (1 << kColorCacheBitsMax)); + return p->argb_or_distance; +} + +static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) { + assert(p->mode == kCopy); + return p->argb_or_distance; +} + +// ----------------------------------------------------------------------------- +// VP8LBackwardRefs + +typedef struct { + PixOrCopy* refs; + int size; // currently used + int max_size; // maximum capacity +} VP8LBackwardRefs; + + +static WEBP_INLINE void VP8LInitBackwardRefs(VP8LBackwardRefs* const refs) { + if (refs != NULL) { + refs->refs = NULL; + refs->size = 0; + refs->max_size = 0; + } +} + +static WEBP_INLINE void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) { + if (refs != NULL) { + free(refs->refs); + VP8LInitBackwardRefs(refs); + } +} + +// Allocate 'max_size' references. Returns false in case of memory error. +static WEBP_INLINE int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, + int max_size) { + assert(refs != NULL); + refs->size = 0; + refs->max_size = 0; + refs->refs = (PixOrCopy*)malloc(max_size * sizeof(*refs->refs)); + if (refs->refs == NULL) return 0; + refs->max_size = max_size; + return 1; +} + +// Ridiculously simple backward references for images where it is unlikely +// that there are large backward references (photos). +void VP8LBackwardReferencesRle( + int xsize, int ysize, const uint32_t* const argb, + VP8LBackwardRefs* const refs); + +// This is a simple fast function for obtaining backward references +// based on simple heuristics. Returns 1 on success. +int VP8LBackwardReferencesHashChain( + int xsize, int ysize, int use_color_cache, const uint32_t* const argb, + int cache_bits, int quality, VP8LBackwardRefs* const refs); + +// This method looks for a shortest path through the backward reference +// network based on a cost model generated by a first round of compression. +// Returns 1 on success. 
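// Illustrative usage sketch, not part of the patch (ShowCopyToken is a
// hypothetical helper; it assumes this header is included with
// USE_LOSSLESS_ENCODER defined): a copy token stores its distance verbatim,
// and PrefixEncode() above then splits that distance into an entropy-coded
// prefix plus raw extra bits, e.g. distance 5 becomes prefix code 4 with one
// extra bit of value 0.
static void ShowCopyToken(void) {
  const PixOrCopy t = PixOrCopyCreateCopy(5 /* distance */, 12 /* length */);
  int code, extra_bits_count, extra_bits_value;
  PrefixEncode((int)PixOrCopyDistance(&t),
               &code, &extra_bits_count, &extra_bits_value);
  // For distance 5: code == 4, extra_bits_count == 1, extra_bits_value == 0.
  (void)code; (void)extra_bits_count; (void)extra_bits_value;
  (void)PixOrCopyLength(&t);  // == 12
}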
+int VP8LBackwardReferencesTraceBackwards( + int xsize, int ysize, int recursive_cost_model, int use_color_cache, + const uint32_t* const argb, int cache_bits, VP8LBackwardRefs* const refs); + +// Convert backward references that are of linear distance along +// the image scan lines to have a 2d locality indexing where +// smaller values are used for backward references that are close by. +void VP8LBackwardReferences2DLocality(int xsize, VP8LBackwardRefs* const refs); + +// Internals of locality transform exposed for testing use. +int VP8LDistanceToPlaneCode(int xsize, int distance); + +// Returns true if the given backward references actually produce +// the image given in tuple (argb, xsize, ysize). +int VP8LVerifyBackwardReferences( + const uint32_t* const argb, int xsize, int ysize, int cache_bits, + const VP8LBackwardRefs* const refs); + +// Produce an estimate for a good color cache size for the image. +int VP8LCalculateEstimateForCacheSize( + const uint32_t* const argb, int xsize, int ysize, + int* const best_cache_bits); + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif + +#endif // WEBP_ENC_BACKWARD_REFERENCES_H_ diff --git a/src/enc/config.c b/src/enc/config.c index d51d9995..178d4689 100644 --- a/src/enc/config.c +++ b/src/enc/config.c @@ -44,6 +44,7 @@ int WebPConfigInitInternal(WebPConfig* const config, config->alpha_compression = 1; config->alpha_filtering = 1; config->alpha_quality = 100; + config->lossless = 0; // TODO(skal): tune. switch (preset) { @@ -116,6 +117,13 @@ int WebPValidateConfig(const WebPConfig* const config) { return 0; if (config->alpha_quality < 0 || config->alpha_quality > 100) return 0; +#ifdef USE_LOSSLESS_ENCODER + if (config->lossless < 0 || config->lossless > 1) + return 0; +#else + if (config->lossless != 0) + return 0; +#endif return 1; } diff --git a/src/enc/histogram.c b/src/enc/histogram.c new file mode 100644 index 00000000..0958b6b8 --- /dev/null +++ b/src/enc/histogram.c @@ -0,0 +1,420 @@ +// Copyright 2012 Google Inc. All Rights Reserved. 
+// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Author: Jyrki Alakuijala (jyrki@google.com) +// + +#ifdef USE_LOSSLESS_ENCODER + +#include +#include + +#include "./backward_references.h" +#include "./histogram.h" +#include "../dsp/lossless.h" + +static void HistogramClear(VP8LHistogram* const p) { + memset(p->literal_, 0, sizeof(p->literal_)); + memset(p->red_, 0, sizeof(p->red_)); + memset(p->blue_, 0, sizeof(p->blue_)); + memset(p->alpha_, 0, sizeof(p->alpha_)); + memset(p->distance_, 0, sizeof(p->distance_)); + p->bit_cost_ = 0; +} + +void VP8LHistogramCreate(VP8LHistogram* const p, + const VP8LBackwardRefs* const refs, + int palette_code_bits) { + int i; + if (palette_code_bits >= 0) { + p->palette_code_bits_ = palette_code_bits; + } + HistogramClear(p); + for (i = 0; i < refs->size; ++i) { + VP8LHistogramAddSinglePixOrCopy(p, &refs->refs[i]); + } +} + +void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits) { + p->palette_code_bits_ = palette_code_bits; + HistogramClear(p); +} + +VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) { + int i; + VP8LHistogramSet* set; + VP8LHistogram* bulk; + const size_t total_size = sizeof(*set) + + size * sizeof(*set->histograms) + + size * sizeof(**set->histograms); + uint8_t* memory = (uint8_t*)malloc(total_size); + if (memory == NULL) return NULL; + + set = (VP8LHistogramSet*)memory; + memory += sizeof(*set); + set->histograms = (VP8LHistogram**)memory; + memory += size * sizeof(*set->histograms); + bulk = (VP8LHistogram*)memory; + set->max_size = size; + set->size = size; + for (i = 0; i < size; ++i) { + set->histograms[i] = bulk + i; + VP8LHistogramInit(set->histograms[i], cache_bits); + } + return set; +} + +// ----------------------------------------------------------------------------- + +void VP8LConvertPopulationCountTableToBitEstimates( + int num_symbols, const int* const population_counts, + double* const output) { + int sum = 0; + int nonzeros = 0; + int i; + for (i = 0; i < num_symbols; ++i) { + sum += population_counts[i]; + if (population_counts[i] > 0) { + ++nonzeros; + } + } + if (nonzeros <= 1) { + memset(output, 0, num_symbols * sizeof(*output)); + return; + } + { + const double log2sum = log2(sum); + for (i = 0; i < num_symbols; ++i) { + if (population_counts[i] == 0) { + output[i] = log2sum; + } else { + output[i] = log2sum - log2(population_counts[i]); + } + } + } +} + +void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const p, + const PixOrCopy* const v) { + if (PixOrCopyIsLiteral(v)) { + ++p->alpha_[PixOrCopyLiteral(v, 3)]; + ++p->red_[PixOrCopyLiteral(v, 2)]; + ++p->literal_[PixOrCopyLiteral(v, 1)]; + ++p->blue_[PixOrCopyLiteral(v, 0)]; + } else if (PixOrCopyIsCacheIdx(v)) { + int literal_ix = 256 + kLengthCodes + PixOrCopyCacheIdx(v); + ++p->literal_[literal_ix]; + } else { + int code, extra_bits_count, extra_bits_value; + PrefixEncode(PixOrCopyLength(v), + &code, &extra_bits_count, &extra_bits_value); + ++p->literal_[256 + code]; + PrefixEncode(PixOrCopyDistance(v), + &code, &extra_bits_count, &extra_bits_value); + ++p->distance_[code]; + } +} + + + +static double BitsEntropy(const int* const array, int n) { + double retval = 0; + int sum = 0; + int nonzeros = 0; + int max_val = 0; + int i; + double mix; + for (i = 0; i < n; ++i) { + if 
(array[i] != 0) { + sum += array[i]; + ++nonzeros; + retval += array[i] * VP8LFastLog(array[i]); + if (max_val < array[i]) { + max_val = array[i]; + } + } + } + retval -= sum * VP8LFastLog(sum); + retval *= -1.4426950408889634; // 1.0 / -Log(2); + mix = 0.627; + if (nonzeros < 5) { + if (nonzeros <= 1) { + return 0; + } + // Two symbols, they will be 0 and 1 in a Huffman code. + // Let's mix in a bit of entropy to favor good clustering when + // distributions of these are combined. + if (nonzeros == 2) { + return 0.99 * sum + 0.01 * retval; + } + // No matter what the entropy says, we cannot be better than min_limit + // with Huffman coding. I am mixing a bit of entropy into the + // min_limit since it produces much better (~0.5 %) compression results + // perhaps because of better entropy clustering. + if (nonzeros == 3) { + mix = 0.95; + } else { + mix = 0.7; // nonzeros == 4. + } + } + { + double min_limit = 2 * sum - max_val; + min_limit = mix * min_limit + (1.0 - mix) * retval; + if (retval < min_limit) { + return min_limit; + } + } + return retval; +} + +double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) { + double retval = BitsEntropy(&p->literal_[0], VP8LHistogramNumCodes(p)) + + BitsEntropy(&p->red_[0], 256) + + BitsEntropy(&p->blue_[0], 256) + + BitsEntropy(&p->alpha_[0], 256) + + BitsEntropy(&p->distance_[0], DISTANCE_CODES_MAX); + // Compute the extra bits cost. + int i; + for (i = 2; i < kLengthCodes - 2; ++i) { + retval += + (i >> 1) * p->literal_[256 + i + 2]; + } + for (i = 2; i < DISTANCE_CODES_MAX - 2; ++i) { + retval += (i >> 1) * p->distance_[i + 2]; + } + return retval; +} + +double VP8LHistogramEstimateBits(const VP8LHistogram* const p) { + return VP8LHistogramEstimateBitsHeader(p) + VP8LHistogramEstimateBitsBulk(p); +} + +// Returns the cost encode the rle-encoded entropy code. +// The constants in this function are experimental. +static double HuffmanCost(const int* const population, int length) { + // Small bias because Huffman code length is typically not stored in + // full length. + static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3; + static const double kSmallBias = 9.1; + double retval = kHuffmanCodeOfHuffmanCodeSize - kSmallBias; + int streak = 0; + int i = 0; + for (; i < length - 1; ++i) { + ++streak; + if (population[i] == population[i + 1]) { + continue; + } + last_streak_hack: + // population[i] points now to the symbol in the streak of same values. + if (streak > 3) { + if (population[i] == 0) { + retval += 1.5625 + 0.234375 * streak; + } else { + retval += 2.578125 + 0.703125 * streak; + } + } else { + if (population[i] == 0) { + retval += 1.796875 * streak; + } else { + retval += 3.28125 * streak; + } + } + streak = 0; + } + if (i == length - 1) { + ++streak; + goto last_streak_hack; + } + return retval; +} + +double VP8LHistogramEstimateBitsHeader(const VP8LHistogram* const p) { + return HuffmanCost(&p->alpha_[0], 256) + + HuffmanCost(&p->red_[0], 256) + + HuffmanCost(&p->literal_[0], VP8LHistogramNumCodes(p)) + + HuffmanCost(&p->blue_[0], 256) + + HuffmanCost(&p->distance_[0], DISTANCE_CODES_MAX); +} + +static void HistogramBuildImage(int xsize, int histo_bits, + const VP8LBackwardRefs* const backward_refs, + VP8LHistogramSet* const image) { + int i; + int x = 0, y = 0; + const int histo_xsize = + (histo_bits > 0) ? VP8LSubSampleSize(xsize, histo_bits) : 1; + for (i = 0; i < backward_refs->size; ++i) { + const PixOrCopy* const v = &backward_refs->refs[i]; + const int ix = + (histo_bits > 0) ? 
(y >> histo_bits) * histo_xsize + (x >> histo_bits) + : 0; + VP8LHistogramAddSinglePixOrCopy(image->histograms[ix], v); + x += PixOrCopyLength(v); + while (x >= xsize) { + x -= xsize; + ++y; + } + } +} + +static int HistogramCombine(const VP8LHistogramSet* const in, + VP8LHistogramSet* const out, int num_pairs) { + int ok = 0; + int i, iter; + unsigned int seed = 0; + int tries_with_no_success = 0; + const int min_cluster_size = 2; + int out_size = in->size; + const int outer_iters = in->size * 3; + VP8LHistogram* const histos = (VP8LHistogram*)malloc(2 * sizeof(*histos)); + VP8LHistogram* cur_combo = histos + 0; // trial merged histogram + VP8LHistogram* best_combo = histos + 1; // best merged histogram so far + if (histos == NULL) goto End; + + // Copy histograms from in[] to out[]. + assert(in->size <= out->size); + for (i = 0; i < in->size; ++i) { + in->histograms[i]->bit_cost_ = VP8LHistogramEstimateBits(in->histograms[i]); + *out->histograms[i] = *in->histograms[i]; + } + + // Collapse similar histograms in 'out'. + for (iter = 0; iter < outer_iters && out_size >= min_cluster_size; ++iter) { + // We pick the best pair to be combined out of 'inner_iters' pairs. + double best_cost_diff = 0.; + int best_idx1 = 0, best_idx2 = 1; + int j; + for (j = 0; j < num_pairs; ++j) { + double curr_cost_diff; + // Choose two histograms at random and try to combine them. + const int idx1 = rand_r(&seed) % out_size; + const int tmp = ((j & 7) + 1) % (out_size - 1); + const int diff = (tmp < 3) ? tmp : rand_r(&seed) % (out_size - 1); + const int idx2 = (idx1 + diff + 1) % out_size; + if (idx1 == idx2) { + continue; + } + *cur_combo = *out->histograms[idx1]; + VP8LHistogramAdd(cur_combo, out->histograms[idx2]); + cur_combo->bit_cost_ = VP8LHistogramEstimateBits(cur_combo); + // Calculate cost reduction on combining. + curr_cost_diff = cur_combo->bit_cost_ + - out->histograms[idx1]->bit_cost_ + - out->histograms[idx2]->bit_cost_; + if (best_cost_diff > curr_cost_diff) { // found a better pair? + { // swap cur/best combo histograms + VP8LHistogram* const tmp = cur_combo; + cur_combo = best_combo; + best_combo = tmp; + } + best_cost_diff = curr_cost_diff; + best_idx1 = idx1; + best_idx2 = idx2; + } + } + + if (best_cost_diff < 0.0) { + *out->histograms[best_idx1] = *best_combo; + // swap best_idx2 slot with last one (which is now unused) + --out_size; + if (best_idx2 != out_size) { + out->histograms[best_idx2] = out->histograms[out_size]; + out->histograms[out_size] = NULL; // just for sanity check. + } + tries_with_no_success = 0; + } + if (++tries_with_no_success >= 50) { + break; + } + } + out->size = out_size; + ok = 1; + + End: + free(histos); + return ok; +} + +// ----------------------------------------------------------------------------- +// Histogram refinement + +// What is the bit cost of moving square_histogram from +// cur_symbol to candidate_symbol. +// TODO(skal): we don't really need to copy the histogram and Add(). Instead +// we just need VP8LDualHistogramEstimateBits(A, B) estimation function. +static double HistogramDistance(const VP8LHistogram* const square_histogram, + const VP8LHistogram* const candidate) { + const double previous_bit_cost = candidate->bit_cost_; + double new_bit_cost; + VP8LHistogram modified_histo; + modified_histo = *candidate; + VP8LHistogramAdd(&modified_histo, square_histogram); + new_bit_cost = VP8LHistogramEstimateBits(&modified_histo); + + return new_bit_cost - previous_bit_cost; +} + +// Find the best 'out' histogram for each of the 'in' histograms. 
+// Note: we assume that out[]->bit_cost_ is already up-to-date. +static void HistogramRemap(const VP8LHistogramSet* const in, + const VP8LHistogramSet* const out, + uint16_t* const symbols) { + int i; + for (i = 0; i < in->size; ++i) { + int best_out = 0; + double best_bits = HistogramDistance(in->histograms[i], out->histograms[0]); + int k; + for (k = 1; k < out->size; ++k) { + const double cur_bits = + HistogramDistance(in->histograms[i], out->histograms[k]); + if (cur_bits < best_bits) { + best_bits = cur_bits; + best_out = k; + } + } + symbols[i] = best_out; + } + + // Recompute each out based on raw and symbols. + for (i = 0; i < out->size; ++i) { + HistogramClear(out->histograms[i]); + } + for (i = 0; i < in->size; ++i) { + VP8LHistogramAdd(out->histograms[symbols[i]], in->histograms[i]); + } +} + +int VP8LGetHistoImageSymbols(int xsize, int ysize, + const VP8LBackwardRefs* const refs, + int quality, int histo_bits, int cache_bits, + VP8LHistogramSet* const image_in, + uint16_t* const histogram_symbols) { + int ok = 0; + const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1; + const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1; + const int num_histo_pairs = 10 + quality / 2; // For HistogramCombine(). + const int histo_image_raw_size = histo_xsize * histo_ysize; + VP8LHistogramSet* const image_out = + VP8LAllocateHistogramSet(histo_image_raw_size, cache_bits); + if (image_out == NULL) return 0; + + // Build histogram image. + HistogramBuildImage(xsize, histo_bits, refs, image_out); + // Collapse similar histograms. + if (!HistogramCombine(image_out, image_in, num_histo_pairs)) { + goto Error; + } + // Find the optimal map from original histograms to the final ones. + HistogramRemap(image_out, image_in, histogram_symbols); + ok = 1; + +Error: + free(image_out); + return ok; +} + +#endif diff --git a/src/enc/histogram.h b/src/enc/histogram.h new file mode 100644 index 00000000..99b17659 --- /dev/null +++ b/src/enc/histogram.h @@ -0,0 +1,140 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Author: Jyrki Alakuijala (jyrki@google.com) +// +// Models the histograms of literal and distance codes. + +#ifndef WEBP_ENC_HISTOGRAM_H_ +#define WEBP_ENC_HISTOGRAM_H_ + +#ifdef USE_LOSSLESS_ENCODER + +#include +#include +#include +#include +#include + +#include "./backward_references.h" +#include "../webp/types.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +// A simple container for histograms of data. +typedef struct { + // literal_ contains green literal, palette-code and + // copy-length-prefix histogram + int literal_[PIX_OR_COPY_CODES_MAX]; + int red_[256]; + int blue_[256]; + int alpha_[256]; + // Backward reference prefix-code histogram. + int distance_[DISTANCE_CODES_MAX]; + int palette_code_bits_; + double bit_cost_; // cached value of VP8LHistogramEstimateBits(this) +} VP8LHistogram; + +// Collection of histograms with fixed capacity, allocated as one +// big memory chunk. Can be destroyed by simply calling 'free()'. +typedef struct { + int size; // number of slots currently in use + int max_size; // maximum capacity + VP8LHistogram** histograms; +} VP8LHistogramSet; + +// Create the histogram. 
+// +// The input data is the PixOrCopy data, which models the literals, stop +// codes and backward references (both distances and lengths). Also: if +// palette_code_bits is >= 0, initialize the histogram with this value. +void VP8LHistogramCreate(VP8LHistogram* const p, + const VP8LBackwardRefs* const refs, + int palette_code_bits); + +// Set the palette_code_bits and reset the stats. +void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits); + +// Allocate an array of pointer to histograms, allocated and initialized +// using 'cache_bits'. Return NULL in case of memory error. +VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits); + +void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const p, + const PixOrCopy* const v); + +// Estimate how many bits the combined entropy of literals and distance +// approximately maps to. +double VP8LHistogramEstimateBits(const VP8LHistogram* const p); + +// This function estimates the Huffman dictionary + other block overhead +// size for creating a new deflate block. +double VP8LHistogramEstimateBitsHeader(const VP8LHistogram* const p); + +// This function estimates the cost in bits excluding the bits needed to +// represent the entropy code itself. +double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p); + +static WEBP_INLINE void VP8LHistogramAdd(VP8LHistogram* const p, + const VP8LHistogram* const a) { + int i; + for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) { + p->literal_[i] += a->literal_[i]; + } + for (i = 0; i < DISTANCE_CODES_MAX; ++i) { + p->distance_[i] += a->distance_[i]; + } + for (i = 0; i < 256; ++i) { + p->red_[i] += a->red_[i]; + p->blue_[i] += a->blue_[i]; + p->alpha_[i] += a->alpha_[i]; + } +} + +static WEBP_INLINE void VP8LHistogramRemove(VP8LHistogram* const p, + const VP8LHistogram* const a) { + int i; + for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) { + p->literal_[i] -= a->literal_[i]; + assert(p->literal_[i] >= 0); + } + for (i = 0; i < DISTANCE_CODES_MAX; ++i) { + p->distance_[i] -= a->distance_[i]; + assert(p->distance_[i] >= 0); + } + for (i = 0; i < 256; ++i) { + p->red_[i] -= a->red_[i]; + p->blue_[i] -= a->blue_[i]; + p->alpha_[i] -= a->alpha_[i]; + assert(p->red_[i] >= 0); + assert(p->blue_[i] >= 0); + assert(p->alpha_[i] >= 0); + } +} + +static WEBP_INLINE int VP8LHistogramNumCodes(const VP8LHistogram* const p) { + return 256 + kLengthCodes + (1 << p->palette_code_bits_); +} + +void VP8LConvertPopulationCountTableToBitEstimates( + int n, const int* const population_counts, double* const output); + +// Builds the histogram image. 
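// Minimal usage sketch, not part of the patch (EstimateLiteralBits is a
// hypothetical helper): estimating the cost of coding a run of pixels as
// plain literals with the API above, roughly what the encoder's analysis
// phase does when comparing predicted and non-predicted entropy.
static double EstimateLiteralBits(const uint32_t* const argb, int num_pixels) {
  int i;
  VP8LHistogram histo;
  VP8LHistogramInit(&histo, 0);  // no color cache
  for (i = 0; i < num_pixels; ++i) {
    const PixOrCopy token = PixOrCopyCreateLiteral(argb[i]);
    VP8LHistogramAddSinglePixOrCopy(&histo, &token);
  }
  return VP8LHistogramEstimateBitsBulk(&histo);
}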
+int VP8LGetHistoImageSymbols(int xsize, int ysize, + const VP8LBackwardRefs* const refs, + int quality, int histogram_bits, int cache_bits, + VP8LHistogramSet* const image_in, + uint16_t* const histogram_symbols); + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif + +#endif // WEBP_ENC_HISTOGRAM_H_ diff --git a/src/enc/picture.c b/src/enc/picture.c index ef20608d..68a3bd6e 100644 --- a/src/enc/picture.c +++ b/src/enc/picture.c @@ -32,75 +32,94 @@ int WebPPictureAlloc(WebPPicture* const picture) { const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT; const int width = picture->width; const int height = picture->height; - const int y_stride = width; - const int uv_width = HALVE(width); - const int uv_height = HALVE(height); - const int uv_stride = uv_width; - int uv0_stride = 0; - int a_width, a_stride; - uint64_t y_size, uv_size, uv0_size, a_size, total_size; - uint8_t* mem; - // U/V - switch (uv_csp) { - case WEBP_YUV420: - break; + if (!picture->use_argb_input) { + const int y_stride = width; + const int uv_width = HALVE(width); + const int uv_height = HALVE(height); + const int uv_stride = uv_width; + int uv0_stride = 0; + int a_width, a_stride; + uint64_t y_size, uv_size, uv0_size, a_size, total_size; + uint8_t* mem; + + // U/V + switch (uv_csp) { + case WEBP_YUV420: + break; #ifdef WEBP_EXPERIMENTAL_FEATURES - case WEBP_YUV400: // for now, we'll just reset the U/V samples - break; - case WEBP_YUV422: - uv0_stride = uv_width; - break; - case WEBP_YUV444: - uv0_stride = width; - break; + case WEBP_YUV400: // for now, we'll just reset the U/V samples + break; + case WEBP_YUV422: + uv0_stride = uv_width; + break; + case WEBP_YUV444: + uv0_stride = width; + break; #endif - default: + default: + return 0; + } + uv0_size = height * uv0_stride; + + // alpha + a_width = has_alpha ? width : 0; + a_stride = a_width; + y_size = (uint64_t)y_stride * height; + uv_size = (uint64_t)uv_stride * uv_height; + a_size = (uint64_t)a_stride * height; + + total_size = y_size + a_size + 2 * uv_size + 2 * uv0_size; + + // Security and validation checks + if (width <= 0 || height <= 0 || // check for luma/alpha param error + uv_width < 0 || uv_height < 0 || // check for u/v param error + y_size >= (1ULL << 40) || // check for reasonable global size + (size_t)total_size != total_size) { // check for overflow on 32bit return 0; - } - uv0_size = height * uv0_stride; + } + picture->y_stride = y_stride; + picture->uv_stride = uv_stride; + picture->a_stride = a_stride; + picture->uv0_stride = uv0_stride; + WebPPictureFree(picture); // erase previous buffer + mem = (uint8_t*)malloc((size_t)total_size); + if (mem == NULL) return 0; - // alpha - a_width = has_alpha ? 
width : 0; - a_stride = a_width; - y_size = (uint64_t)y_stride * height; - uv_size = (uint64_t)uv_stride * uv_height; - a_size = (uint64_t)a_stride * height; + picture->y = mem; + mem += y_size; - total_size = y_size + a_size + 2 * uv_size + 2 * uv0_size; + picture->u = mem; + mem += uv_size; + picture->v = mem; + mem += uv_size; - // Security and validation checks - if (width <= 0 || height <= 0 || // check for luma/alpha param error - uv_width < 0 || uv_height < 0 || // check for u/v param error - y_size >= (1ULL << 40) || // check for reasonable global size - (size_t)total_size != total_size) { // check for overflow on 32bit + if (a_size) { + picture->a = mem; + mem += a_size; + } + if (uv0_size) { + picture->u0 = mem; + mem += uv0_size; + picture->v0 = mem; + mem += uv0_size; + } + } else { +#ifdef USE_LOSSLESS_ENCODER + const uint64_t argb_size = (uint64_t)width * height; + const uint64_t total_size = argb_size * sizeof(*picture->argb); + if (width <= 0 || height <= 0 || + argb_size >= (1ULL << 40) || + (size_t)total_size != total_size) { + return 0; + } + WebPPictureFree(picture); // erase previous buffer + picture->argb = (uint32_t*)malloc(total_size); + if (picture->argb == NULL) return 0; + picture->argb_stride = width; +#else return 0; - } - picture->y_stride = y_stride; - picture->uv_stride = uv_stride; - picture->a_stride = a_stride; - picture->uv0_stride = uv0_stride; - WebPPictureFree(picture); // erase previous buffer - mem = (uint8_t*)malloc((size_t)total_size); - if (mem == NULL) return 0; - - picture->y = mem; - mem += y_size; - - picture->u = mem; - mem += uv_size; - picture->v = mem; - mem += uv_size; - - if (a_size) { - picture->a = mem; - mem += a_size; - } - if (uv0_size) { - picture->u0 = mem; - mem += uv0_size; - picture->v0 = mem; - mem += uv0_size; +#endif } } return 1; @@ -114,12 +133,18 @@ static void WebPPictureGrabSpecs(const WebPPicture* const src, dst->y = dst->u = dst->v = NULL; dst->u0 = dst->v0 = NULL; dst->a = NULL; +#ifdef USE_LOSSLESS_ENCODER + dst->argb = NULL; +#endif } // Release memory owned by 'picture'. 
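// Hypothetical caller-side sketch, not part of the patch (SetupLosslessPicture
// is an invented name; assumes the public webp headers are on the include
// path): in the lossless path the picture is prepared as a single ARGB buffer
// rather than Y/U/V planes by setting use_argb_input before WebPPictureAlloc().
#include "webp/encode.h"

static int SetupLosslessPicture(WebPPicture* const pic, int width, int height) {
  if (!WebPPictureInit(pic)) return 0;  // version check + zero-init
  pic->width = width;
  pic->height = height;
  pic->use_argb_input = 1;              // allocate pic->argb, stride = width
  return WebPPictureAlloc(pic);
}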
void WebPPictureFree(WebPPicture* const picture) { if (picture != NULL) { free(picture->y); +#ifdef USE_LOSSLESS_ENCODER + free(picture->argb); +#endif WebPPictureGrabSpecs(NULL, picture); } } @@ -144,28 +169,38 @@ int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst) { WebPPictureGrabSpecs(src, dst); if (!WebPPictureAlloc(dst)) return 0; - CopyPlane(src->y, src->y_stride, - dst->y, dst->y_stride, dst->width, dst->height); - CopyPlane(src->u, src->uv_stride, - dst->u, dst->uv_stride, HALVE(dst->width), HALVE(dst->height)); - CopyPlane(src->v, src->uv_stride, - dst->v, dst->uv_stride, HALVE(dst->width), HALVE(dst->height)); - if (dst->a != NULL) { - CopyPlane(src->a, src->a_stride, - dst->a, dst->a_stride, dst->width, dst->height); - } -#ifdef WEBP_EXPERIMENTAL_FEATURES - if (dst->u0 != NULL) { - int uv0_width = src->width; - if ((dst->colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) { - uv0_width = HALVE(uv0_width); + if (!src->use_argb_input) { + CopyPlane(src->y, src->y_stride, + dst->y, dst->y_stride, dst->width, dst->height); + CopyPlane(src->u, src->uv_stride, + dst->u, dst->uv_stride, HALVE(dst->width), HALVE(dst->height)); + CopyPlane(src->v, src->uv_stride, + dst->v, dst->uv_stride, HALVE(dst->width), HALVE(dst->height)); + if (dst->a != NULL) { + CopyPlane(src->a, src->a_stride, + dst->a, dst->a_stride, dst->width, dst->height); + } +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (dst->u0 != NULL) { + int uv0_width = src->width; + if ((dst->colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) { + uv0_width = HALVE(uv0_width); + } + CopyPlane(src->u0, src->uv0_stride, + dst->u0, dst->uv0_stride, uv0_width, dst->height); + CopyPlane(src->v0, src->uv0_stride, + dst->v0, dst->uv0_stride, uv0_width, dst->height); } - CopyPlane(src->u0, src->uv0_stride, - dst->u0, dst->uv0_stride, uv0_width, dst->height); - CopyPlane(src->v0, src->uv0_stride, - dst->v0, dst->uv0_stride, uv0_width, dst->height); - } #endif + } else { +#ifdef USE_LOSSLESS_ENCODER + CopyPlane((uint8_t*)src->argb, 4 * src->argb_stride, + (uint8_t*)dst->argb, 4 * dst->argb_stride, + 4 * dst->width, dst->height); +#else + return 0; +#endif + } return 1; } @@ -438,66 +473,100 @@ static int Import(WebPPicture* const picture, const int width = picture->width; const int height = picture->height; - // Import luma plane - for (y = 0; y < height; ++y) { - for (x = 0; x < width; ++x) { - const int offset = step * x + y * rgb_stride; - picture->y[x + y * picture->y_stride] = - rgb_to_y(r_ptr[offset], g_ptr[offset], b_ptr[offset]); - } - } - - // Downsample U/V plane - if (uv_csp != WEBP_YUV400) { - for (y = 0; y < (height >> 1); ++y) { - for (x = 0; x < (width >> 1); ++x) { - RGB_TO_UV(x, y, SUM4); - } - if (picture->width & 1) { - RGB_TO_UV(x, y, SUM2V); - } - } - if (height & 1) { - for (x = 0; x < (width >> 1); ++x) { - RGB_TO_UV(x, y, SUM2H); - } - if (width & 1) { - RGB_TO_UV(x, y, SUM1); - } - } - -#ifdef WEBP_EXPERIMENTAL_FEATURES - // Store original U/V samples too - if (uv_csp == WEBP_YUV422) { - for (y = 0; y < height; ++y) { - for (x = 0; x < (width >> 1); ++x) { - RGB_TO_UV0(2 * x, x, y, SUM2H); - } - if (width & 1) { - RGB_TO_UV0(2 * x, x, y, SUM1); - } - } - } else if (uv_csp == WEBP_YUV444) { - for (y = 0; y < height; ++y) { - for (x = 0; x < width; ++x) { - RGB_TO_UV0(x, x, y, SUM1); - } - } - } -#endif - } else { - MakeGray(picture); - } - - if (import_alpha) { - const uint8_t* const a_ptr = rgb + 3; - assert(step >= 4); + if (!picture->use_argb_input) { + // Import luma plane for (y = 0; y < height; ++y) { for (x 
= 0; x < width; ++x) { - picture->a[x + y * picture->a_stride] = - a_ptr[step * x + y * rgb_stride]; + const int offset = step * x + y * rgb_stride; + picture->y[x + y * picture->y_stride] = + rgb_to_y(r_ptr[offset], g_ptr[offset], b_ptr[offset]); } } + + // Downsample U/V plane + if (uv_csp != WEBP_YUV400) { + for (y = 0; y < (height >> 1); ++y) { + for (x = 0; x < (width >> 1); ++x) { + RGB_TO_UV(x, y, SUM4); + } + if (picture->width & 1) { + RGB_TO_UV(x, y, SUM2V); + } + } + if (height & 1) { + for (x = 0; x < (width >> 1); ++x) { + RGB_TO_UV(x, y, SUM2H); + } + if (width & 1) { + RGB_TO_UV(x, y, SUM1); + } + } + +#ifdef WEBP_EXPERIMENTAL_FEATURES + // Store original U/V samples too + if (uv_csp == WEBP_YUV422) { + for (y = 0; y < height; ++y) { + for (x = 0; x < (width >> 1); ++x) { + RGB_TO_UV0(2 * x, x, y, SUM2H); + } + if (width & 1) { + RGB_TO_UV0(2 * x, x, y, SUM1); + } + } + } else if (uv_csp == WEBP_YUV444) { + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + RGB_TO_UV0(x, x, y, SUM1); + } + } + } +#endif + } else { + MakeGray(picture); + } + + if (import_alpha) { + const uint8_t* const a_ptr = rgb + 3; + assert(step >= 4); + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + picture->a[x + y * picture->a_stride] = + a_ptr[step * x + y * rgb_stride]; + } + } + } + } else { +#ifdef USE_LOSSLESS_ENCODER + if (!import_alpha) { + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + const int offset = step * x + y * rgb_stride; + const uint32_t argb = + 0xff000000 | + (r_ptr[offset] << 16) | + (g_ptr[offset] << 8) | + (b_ptr[offset]); + picture->argb[x + y * picture->argb_stride] = argb; + } + } + } else { + const uint8_t* const a_ptr = rgb + 3; + assert(step >= 4); + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + const int offset = step * x + y * rgb_stride; + const uint32_t argb = + (a_ptr[offset] << 24) | + (r_ptr[offset] << 16) | + (g_ptr[offset] << 8) | + (b_ptr[offset]); + picture->argb[x + y * picture->argb_stride] = argb; + } + } + } +#else + return 0; +#endif } return 1; } diff --git a/src/enc/vp8l.c b/src/enc/vp8l.c new file mode 100644 index 00000000..7d5e70fa --- /dev/null +++ b/src/enc/vp8l.c @@ -0,0 +1,1239 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// main entry for the lossless encoder. 
+// +// Author: Vikas Arora (vikaas.arora@gmail.com) +// + +#ifdef USE_LOSSLESS_ENCODER + +#include +#include +#include + +#include "./backward_references.h" +#include "./vp8enci.h" +#include "./vp8li.h" +#include "../dsp/lossless.h" +#include "../utils/bit_writer.h" +#include "../utils/huffman_encode.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#define MAX_HUFF_IMAGE_SIZE (32 * 1024 * 1024) + +// TODO(vikas): find a common place between enc and dec for these: +#define PREDICTOR_TRANSFORM 0 +#define CROSS_COLOR_TRANSFORM 1 +#define SUBTRACT_GREEN 2 +#define COLOR_INDEXING_TRANSFORM 3 +#define TRANSFORM_PRESENT 1 + +#define IMAGE_SIZE_BITS 14 + +// ----------------------------------------------------------------------------- +// Palette + +static int CompareColors(const void* p1, const void* p2) { + const uint32_t a = *(const uint32_t*)p1; + const uint32_t b = *(const uint32_t*)p2; + return (a < b) ? -1 : (a > b) ? 1 : 0; +} + +// If number of colors in the image is less than or equal to MAX_PALETTE_SIZE, +// creates a palette and returns true, else returns false. +static int AnalyzeAndCreatePalette(const uint32_t* const argb, int num_pix, + uint32_t palette[MAX_PALETTE_SIZE], + int* const palette_size) { + int i, key; + int num_colors = 0; + uint8_t in_use[MAX_PALETTE_SIZE * 4] = { 0 }; + uint32_t colors[MAX_PALETTE_SIZE * 4]; + static const uint32_t kHashMul = 0x1e35a7bd; + + key = (kHashMul * argb[0]) >> PALETTE_KEY_RIGHT_SHIFT; + colors[key] = argb[0]; + in_use[key] = 1; + ++num_colors; + + for (i = 1; i < num_pix; ++i) { + if (argb[i] == argb[i - 1]) { + continue; + } + key = (kHashMul * argb[i]) >> PALETTE_KEY_RIGHT_SHIFT; + while (1) { + if (!in_use[key]) { + colors[key] = argb[i]; + in_use[key] = 1; + ++num_colors; + if (num_colors > MAX_PALETTE_SIZE) { + return 0; + } + break; + } else if (colors[key] == argb[i]) { + // The color is already there. + break; + } else { + // Some other color sits there. + // Do linear conflict resolution. + ++key; + key &= (MAX_PALETTE_SIZE * 4 - 1); // key mask for 1K buffer. 
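        // (This loop is open addressing with linear probing over a table of
        // 4 * MAX_PALETTE_SIZE slots: keep advancing, wrapping with the mask
        // above, until the color is found or an empty slot accepts it; more
        // than MAX_PALETTE_SIZE distinct colors aborts palette creation.)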
+ } + } + } + + num_colors = 0; + for (i = 0; i < (int)(sizeof(in_use) / sizeof(in_use[0])); ++i) { + if (in_use[i]) { + palette[num_colors] = colors[i]; + ++num_colors; + } + } + + qsort(palette, num_colors, sizeof(*palette), CompareColors); + *palette_size = num_colors; + return 1; +} + +static int AnalyzeEntropy(const uint32_t const *argb, int xsize, int ysize, + double* const nonpredicted_bits, + double* const predicted_bits) { + int i; + VP8LHistogram* nonpredicted = NULL; + VP8LHistogram* predicted = (VP8LHistogram*)malloc(2 * sizeof(*predicted)); + if (predicted == NULL) return 0; + nonpredicted = predicted + 1; + + VP8LHistogramInit(predicted, 0); + VP8LHistogramInit(nonpredicted, 0); + for (i = 1; i < xsize * ysize; ++i) { + const uint32_t pix = argb[i]; + const uint32_t pix_diff = VP8LSubPixels(pix, argb[i - 1]); + if (pix_diff == 0) continue; + if (i >= xsize && pix == argb[i - xsize]) { + continue; + } + { + const PixOrCopy pix_token = PixOrCopyCreateLiteral(pix); + const PixOrCopy pix_diff_token = PixOrCopyCreateLiteral(pix_diff); + VP8LHistogramAddSinglePixOrCopy(nonpredicted, &pix_token); + VP8LHistogramAddSinglePixOrCopy(predicted, &pix_diff_token); + } + } + *nonpredicted_bits = VP8LHistogramEstimateBitsBulk(nonpredicted); + *predicted_bits = VP8LHistogramEstimateBitsBulk(predicted); + free(predicted); + return 1; +} + +static int VP8LEncAnalyze(VP8LEncoder* const enc) { + const WebPPicture* const pic = enc->pic_; + assert(pic != NULL && pic->argb != NULL); + + enc->use_palette_ = + AnalyzeAndCreatePalette(pic->argb, pic->width * pic->height, + enc->palette_, &enc->palette_size_); + if (!enc->use_palette_) { + double non_pred_entropy, pred_entropy; + if (!AnalyzeEntropy(pic->argb, pic->width, pic->height, + &non_pred_entropy, &pred_entropy)) { + return 0; + } + + if (pred_entropy < 0.95 * non_pred_entropy) { + enc->use_predict_ = 1; + enc->use_cross_color_ = 1; + } + } + return 1; +} + +// ----------------------------------------------------------------------------- + +static int GetBackwardReferences(int width, int height, + const uint32_t* const argb, + int quality, int use_color_cache, + int cache_bits, int use_2d_locality, + VP8LBackwardRefs* const best) { + int ok = 0; + int lz77_is_useful; + VP8LBackwardRefs refs_rle, refs_lz77; + const int num_pix = width * height; + + VP8LBackwardRefsAlloc(&refs_rle, num_pix); + VP8LBackwardRefsAlloc(&refs_lz77, num_pix); + VP8LInitBackwardRefs(best); + if (refs_rle.refs == NULL || refs_lz77.refs == NULL) { + Error1: + VP8LClearBackwardRefs(&refs_rle); + VP8LClearBackwardRefs(&refs_lz77); + goto End; + } + + if (!VP8LBackwardReferencesHashChain(width, height, use_color_cache, + argb, cache_bits, quality, + &refs_lz77)) { + goto End; + } + // Backward Reference using RLE only. + VP8LBackwardReferencesRle(width, height, argb, &refs_rle); + + { + double bit_cost_lz77, bit_cost_rle; + VP8LHistogram* const histo = (VP8LHistogram*)malloc(sizeof(*histo)); + if (histo == NULL) goto Error1; + // Evaluate lz77 coding + VP8LHistogramCreate(histo, &refs_lz77, cache_bits); + bit_cost_lz77 = VP8LHistogramEstimateBits(histo); + // Evaluate RLE coding + VP8LHistogramCreate(histo, &refs_rle, cache_bits); + bit_cost_rle = VP8LHistogramEstimateBits(histo); + // Decide if LZ77 is useful. + lz77_is_useful = (bit_cost_lz77 < bit_cost_rle); + free(histo); + } + + // Choose appropriate backward reference. + if (lz77_is_useful) { + // TraceBackwards is costly. Run it for higher qualities. 
+ const int try_lz77_trace_backwards = (quality >= 75); + *best = refs_lz77; // default guess: lz77 is better + VP8LClearBackwardRefs(&refs_rle); + if (try_lz77_trace_backwards) { + const int recursion_level = (num_pix < 320 * 200) ? 1 : 0; + VP8LBackwardRefs refs_trace; + if (!VP8LBackwardRefsAlloc(&refs_trace, num_pix)) { + goto End; + } + if (VP8LBackwardReferencesTraceBackwards(width, height, recursion_level, + use_color_cache, + argb, cache_bits, + &refs_trace)) { + VP8LClearBackwardRefs(&refs_lz77); + *best = refs_trace; + } + } + } else { + VP8LClearBackwardRefs(&refs_lz77); + *best = refs_rle; + } + + if (use_2d_locality) { // Use backward reference with 2D locality. + VP8LBackwardReferences2DLocality(width, best); + } + ok = 1; + +End: + if (!ok) { + VP8LClearBackwardRefs(best); + } + return ok; +} + +// Heuristics for selecting the stride ranges to collapse. +static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) { + return abs(a - b) < 4; +} + +// Change the population counts in a way that the consequent +// Hufmann tree compression, especially its rle-part will be more +// likely to compress this data more efficiently. +// +// length contains the size of the histogram. +// data contains the population counts. +static int OptimizeHuffmanForRle(int length, int* counts) { + int stride; + int limit; + int sum; + uint8_t* good_for_rle; + // 1) Let's make the Huffman code more compatible with rle encoding. + int i; + for (; length >= 0; --length) { + if (length == 0) { + return 1; // All zeros. + } + if (counts[length - 1] != 0) { + // Now counts[0..length - 1] does not have trailing zeros. + break; + } + } + // 2) Let's mark all population counts that already can be encoded + // with an rle code. + good_for_rle = (uint8_t*)calloc(length, 1); + if (good_for_rle == NULL) { + return 0; + } + { + // Let's not spoil any of the existing good rle codes. + // Mark any seq of 0's that is longer as 5 as a good_for_rle. + // Mark any seq of non-0's that is longer as 7 as a good_for_rle. + int symbol = counts[0]; + int stride = 0; + for (i = 0; i < length + 1; ++i) { + if (i == length || counts[i] != symbol) { + if ((symbol == 0 && stride >= 5) || + (symbol != 0 && stride >= 7)) { + int k; + for (k = 0; k < stride; ++k) { + good_for_rle[i - k - 1] = 1; + } + } + stride = 1; + if (i != length) { + symbol = counts[i]; + } + } else { + ++stride; + } + } + } + // 3) Let's replace those population counts that lead to more rle codes. + stride = 0; + limit = counts[0]; + sum = 0; + for (i = 0; i < length + 1; ++i) { + if (i == length || good_for_rle[i] || + (i != 0 && good_for_rle[i - 1]) || + !ValuesShouldBeCollapsedToStrideAverage(counts[i], limit)) { + if (stride >= 4 || (stride >= 3 && sum == 0)) { + int k; + // The stride must end, collapse what we have, if we have enough (4). + int count = (sum + stride / 2) / stride; + if (count < 1) { + count = 1; + } + if (sum == 0) { + // Don't make an all zeros stride to be upgraded to ones. + count = 0; + } + for (k = 0; k < stride; ++k) { + // We don't want to change value at counts[i], + // that is already belonging to the next stride. Thus - 1. + counts[i - k - 1] = count; + } + } + stride = 0; + sum = 0; + if (i < length - 3) { + // All interesting strides have a count of at least 4, + // at least when non-zeros. 
+ limit = (counts[i] + counts[i + 1] + + counts[i + 2] + counts[i + 3] + 2) / 4; + } else if (i < length) { + limit = counts[i]; + } else { + limit = 0; + } + } + ++stride; + if (i != length) { + sum += counts[i]; + if (stride >= 4) { + limit = (sum + stride / 2) / stride; + } + } + } + free(good_for_rle); + return 1; +} + +// TODO(vikasa): Wrap bit_codes and bit_lengths in a Struct. +static int GetHuffBitLengthsAndCodes( + const VP8LHistogramSet* const histogram_image, + int use_color_cache, int** bit_length_sizes, + uint16_t*** bit_codes, uint8_t*** bit_lengths) { + int i, k; + int ok = 1; + const int histogram_image_size = histogram_image->size; + for (i = 0; i < histogram_image_size; ++i) { + VP8LHistogram* const histo = histogram_image->histograms[i]; + const int num_literals = VP8LHistogramNumCodes(histo); + k = 0; + // TODO(vikasa): Alloc one big buffer instead of allocating in the loop. + (*bit_length_sizes)[5 * i] = num_literals; + (*bit_lengths)[5 * i] = (uint8_t*)calloc(num_literals, 1); + (*bit_codes)[5 * i] = (uint16_t*) + malloc(num_literals * sizeof(*(*bit_codes)[5 * i])); + if ((*bit_lengths)[5 * i] == NULL || (*bit_codes)[5 * i] == NULL) { + ok = 0; + goto Error; + } + + // For each component, optimize histogram for Huffman with RLE compression. + ok = ok && OptimizeHuffmanForRle(num_literals, histo->literal_); + if (!use_color_cache) { + // Implies that palette_bits == 0, + // and so number of palette entries = (1 << 0) = 1. + // Optimization might have smeared population count in this single + // palette entry, so zero it out. + histo->literal_[256 + kLengthCodes] = 0; + } + ok = ok && OptimizeHuffmanForRle(256, histo->red_); + ok = ok && OptimizeHuffmanForRle(256, histo->blue_); + ok = ok && OptimizeHuffmanForRle(256, histo->alpha_); + ok = ok && OptimizeHuffmanForRle(DISTANCE_CODES_MAX, histo->distance_); + + // Create a Huffman tree (in the form of bit lengths) for each component. + ok = ok && VP8LCreateHuffmanTree(histo->literal_, num_literals, + 15, (*bit_lengths)[5 * i]); + for (k = 1; k < 5; ++k) { + int val = 256; + if (k == 4) { + val = DISTANCE_CODES_MAX; + } + (*bit_length_sizes)[5 * i + k] = val; + (*bit_lengths)[5 * i + k] = (uint8_t*)calloc(val, 1); + (*bit_codes)[5 * i + k] = (uint16_t*)calloc(val, sizeof(bit_codes[0])); + if ((*bit_lengths)[5 * i + k] == NULL || + (*bit_codes)[5 * i + k] == NULL) { + ok = 0; + goto Error; + } + } + ok = ok && + VP8LCreateHuffmanTree(histo->red_, 256, 15, + (*bit_lengths)[5 * i + 1]) && + VP8LCreateHuffmanTree(histo->blue_, 256, 15, + (*bit_lengths)[5 * i + 2]) && + VP8LCreateHuffmanTree(histo->alpha_, 256, 15, + (*bit_lengths)[5 * i + 3]) && + VP8LCreateHuffmanTree(histo->distance_, DISTANCE_CODES_MAX, 15, + (*bit_lengths)[5 * i + 4]); + // Create the actual bit codes for the bit lengths. 
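    // (VP8LConvertBitDepthsToSymbols derives canonical Huffman codes from the
    // code lengths alone, which is why only the lengths need to be stored in
    // the bitstream.)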
+ for (k = 0; k < 5; ++k) { + int ix = 5 * i + k; + VP8LConvertBitDepthsToSymbols((*bit_lengths)[ix], (*bit_length_sizes)[ix], + (*bit_codes)[ix]); + } + } + return ok; + + Error: + { + int idx; + for (idx = 0; idx <= 5 * i + k; ++idx) { + free((*bit_lengths)[idx]); + free((*bit_codes)[idx]); + } + } + return 0; +} + +static void ClearHuffmanTreeIfOnlyOneSymbol(const int num_symbols, + uint8_t* lengths, + uint16_t* symbols) { + int k; + int count = 0; + for (k = 0; k < num_symbols; ++k) { + if (lengths[k] != 0) { + ++count; + if (count > 1) return; + } + } + for (k = 0; k < num_symbols; ++k) { + lengths[k] = 0; + symbols[k] = 0; + } +} + +static void StoreHuffmanTreeOfHuffmanTreeToBitMask( + VP8LBitWriter* const bw, const uint8_t* code_length_bitdepth) { + // RFC 1951 will calm you down if you are worried about this funny sequence. + // This sequence is tuned from that, but more weighted for lower symbol count, + // and more spiking histograms. + int i; + static const uint8_t kStorageOrder[CODE_LENGTH_CODES] = { + 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + }; + // Throw away trailing zeros: + int codes_to_store = sizeof(kStorageOrder); + for (; codes_to_store > 4; --codes_to_store) { + if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) { + break; + } + } + // How many code length codes we write above the first four (see RFC 1951). + VP8LWriteBits(bw, 4, codes_to_store - 4); + for (i = 0; i < codes_to_store; ++i) { + VP8LWriteBits(bw, 3, code_length_bitdepth[kStorageOrder[i]]); + } +} + +static void StoreHuffmanTreeToBitMask( + VP8LBitWriter* const bw, + const uint8_t* huffman_tree, + const uint8_t* huffman_tree_extra_bits, + const int num_symbols, + const uint8_t* code_length_bitdepth, + const uint16_t* code_length_bitdepth_symbols) { + int i; + for (i = 0; i < num_symbols; ++i) { + const int ix = huffman_tree[i]; + VP8LWriteBits(bw, code_length_bitdepth[ix], + code_length_bitdepth_symbols[ix]); + switch (ix) { + case 16: + VP8LWriteBits(bw, 2, huffman_tree_extra_bits[i]); + break; + case 17: + VP8LWriteBits(bw, 3, huffman_tree_extra_bits[i]); + break; + case 18: + VP8LWriteBits(bw, 7, huffman_tree_extra_bits[i]); + break; + } + } +} + +static int StoreFullHuffmanCode(VP8LBitWriter* const bw, + const uint8_t* const bit_lengths, + int bit_lengths_size) { + int ok = 0; + int huffman_tree_size = 0; + uint8_t code_length_bitdepth[CODE_LENGTH_CODES] = { 0 }; + uint16_t code_length_bitdepth_symbols[CODE_LENGTH_CODES] = { 0 }; + uint8_t* huffman_tree_extra_bits; + uint8_t* const huffman_tree = + (uint8_t*)malloc(bit_lengths_size * sizeof(*huffman_tree) + + bit_lengths_size * sizeof(*huffman_tree_extra_bits)); + + if (huffman_tree == NULL) return 0; + huffman_tree_extra_bits = + huffman_tree + bit_lengths_size * sizeof(*huffman_tree); + + VP8LWriteBits(bw, 1, 0); + VP8LCreateCompressedHuffmanTree(bit_lengths, bit_lengths_size, + &huffman_tree_size, huffman_tree, + huffman_tree_extra_bits); + { + int histogram[CODE_LENGTH_CODES] = { 0 }; + int i; + for (i = 0; i < huffman_tree_size; ++i) { + ++histogram[huffman_tree[i]]; + } + + if (!VP8LCreateHuffmanTree(histogram, CODE_LENGTH_CODES, + 7, code_length_bitdepth)) { + goto End; + } + } + VP8LConvertBitDepthsToSymbols(code_length_bitdepth, CODE_LENGTH_CODES, + code_length_bitdepth_symbols); + StoreHuffmanTreeOfHuffmanTreeToBitMask(bw, code_length_bitdepth); + ClearHuffmanTreeIfOnlyOneSymbol(CODE_LENGTH_CODES, + code_length_bitdepth, + code_length_bitdepth_symbols); + { + int trailing_zero_bits = 0; + int 
trimmed_length = huffman_tree_size; + int write_trimmed_length; + int length; + int i = huffman_tree_size; + while (i-- > 0) { + const int ix = huffman_tree[i]; + if (ix == 0 || ix == 17 || ix == 18) { + --trimmed_length; // discount trailing zeros + trailing_zero_bits += code_length_bitdepth[ix]; + if (ix == 17) { + trailing_zero_bits += 3; + } else if (ix == 18) { + trailing_zero_bits += 7; + } + } else { + break; + } + } + write_trimmed_length = (trimmed_length > 1 && trailing_zero_bits > 12); + length = write_trimmed_length ? trimmed_length : huffman_tree_size; + VP8LWriteBits(bw, 1, write_trimmed_length); + if (write_trimmed_length) { + const int nbits = VP8LBitsLog2Ceiling(trimmed_length - 1); + const int nbitpairs = (nbits == 0) ? 1 : (nbits + 1) / 2; + VP8LWriteBits(bw, 3, nbitpairs - 1); + VP8LWriteBits(bw, nbitpairs * 2, trimmed_length - 2); + } + StoreHuffmanTreeToBitMask(bw, huffman_tree, huffman_tree_extra_bits, + length, code_length_bitdepth, + code_length_bitdepth_symbols); + } + ok = 1; + End: + free(huffman_tree); + return ok; +} + +static int StoreHuffmanCode(VP8LBitWriter* const bw, + const uint8_t* const bit_lengths, + int bit_lengths_size) { + int i; + int count = 0; + int symbols[2] = { 0, 0 }; + const int kMaxBits = 8; + const int kMaxSymbol = 1 << kMaxBits; + + // Check whether it's a small tree. + for (i = 0; i < bit_lengths_size && count < 3; ++i) { + if (bit_lengths[i] != 0) { + if (count < 2) symbols[count] = i; + ++count; + } + } + + if (count == 0) { // emit minimal tree for empty cases + // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0 + VP8LWriteBits(bw, 4, 0x01); + return 1; + } else if (count <= 2 && symbols[0] < kMaxSymbol && symbols[1] < kMaxSymbol) { + VP8LWriteBits(bw, 1, 1); // Small tree marker to encode 1 or 2 symbols. + VP8LWriteBits(bw, 1, count - 1); + if (symbols[0] <= 1) { + VP8LWriteBits(bw, 1, 0); // Code bit for small (1 bit) symbol value. + VP8LWriteBits(bw, 1, symbols[0]); + } else { + VP8LWriteBits(bw, 1, 1); + VP8LWriteBits(bw, 8, symbols[0]); + } + if (count == 2) { + VP8LWriteBits(bw, 8, symbols[1]); + } + return 1; + } else { + return StoreFullHuffmanCode(bw, bit_lengths, bit_lengths_size); + } +} + +static void StoreImageToBitMask( + VP8LBitWriter* const bw, int width, int histo_bits, + const VP8LBackwardRefs* const refs, + const uint16_t* histogram_symbols, + uint8_t** const bitdepths, uint16_t** const bit_symbols) { + // x and y trace the position in the image. + int x = 0; + int y = 0; + const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1; + int i; + for (i = 0; i < refs->size; ++i) { + const PixOrCopy* const v = &refs->refs[i]; + const int histogram_ix = histogram_symbols[histo_bits ? 
+ (y >> histo_bits) * histo_xsize + + (x >> histo_bits) : 0]; + if (PixOrCopyIsCacheIdx(v)) { + const int code = PixOrCopyCacheIdx(v); + const int literal_ix = 256 + kLengthCodes + code; + VP8LWriteBits(bw, bitdepths[5 * histogram_ix][literal_ix], + bit_symbols[5 * histogram_ix][literal_ix]); + } else if (PixOrCopyIsLiteral(v)) { + static const int order[] = {1, 2, 0, 3}; + int k; + for (k = 0; k < 4; ++k) { + const int code = PixOrCopyLiteral(v, order[k]); + VP8LWriteBits(bw, bitdepths[5 * histogram_ix + k][code], + bit_symbols[5 * histogram_ix + k][code]); + } + } else { + int bits, n_bits; + int code, distance; + int len_ix; + PrefixEncode(v->len, &code, &n_bits, &bits); + len_ix = 256 + code; + VP8LWriteBits(bw, bitdepths[5 * histogram_ix][len_ix], + bit_symbols[5 * histogram_ix][len_ix]); + VP8LWriteBits(bw, n_bits, bits); + + distance = PixOrCopyDistance(v); + PrefixEncode(distance, &code, &n_bits, &bits); + VP8LWriteBits(bw, bitdepths[5 * histogram_ix + 4][code], + bit_symbols[5 * histogram_ix + 4][code]); + VP8LWriteBits(bw, n_bits, bits); + } + x += PixOrCopyLength(v); + while (x >= width) { + x -= width; + ++y; + } + } +} + +static int EncodeImageInternal(VP8LBitWriter* const bw, + const uint32_t* const argb, + int width, int height, int quality, + int cache_bits, int histogram_bits) { + int i; + int ok = 0; + int write_histogram_image; + int* bit_lengths_sizes = NULL; + uint8_t** bit_lengths = NULL; + uint16_t** bit_codes = NULL; + const int use_2d_locality = 1; + const int use_color_cache = (cache_bits > 0); + const int color_cache_size = use_color_cache ? (1 << cache_bits) : 0; + const int histogram_image_xysize = + VP8LSubSampleSize(width, histogram_bits) * + VP8LSubSampleSize(height, histogram_bits); + VP8LHistogramSet* histogram_image = + VP8LAllocateHistogramSet(histogram_image_xysize, 0); + int histogram_image_size; + VP8LBackwardRefs refs; + uint16_t* const histogram_symbols = + (uint16_t*)malloc(histogram_image_xysize * sizeof(*histogram_symbols)); + + if (histogram_image == NULL || histogram_symbols == NULL) goto Error; + + // Calculate backward references from ARGB image. + if (!GetBackwardReferences(width, height, argb, quality, + use_color_cache, cache_bits, use_2d_locality, + &refs)) { + goto Error; + } + // Build histogram image & symbols from backward references. + if (!VP8LGetHistoImageSymbols(width, height, &refs, + quality, histogram_bits, cache_bits, + histogram_image, + histogram_symbols)) { + goto Error; + } + // Create Huffman bit lengths & codes for each histogram image. + histogram_image_size = histogram_image->size; + bit_lengths_sizes = (int*)calloc(5 * histogram_image_size, + sizeof(*bit_lengths_sizes)); + bit_lengths = (uint8_t**)calloc(5 * histogram_image_size, + sizeof(*bit_lengths)); + bit_codes = (uint16_t**)calloc(5 * histogram_image_size, + sizeof(*bit_codes)); + if (bit_lengths_sizes == NULL || bit_lengths == NULL || bit_codes == NULL || + !GetHuffBitLengthsAndCodes(histogram_image, use_color_cache, + &bit_lengths_sizes, + &bit_codes, &bit_lengths)) { + goto Error; + } + + // Color Cache parameters. + VP8LWriteBits(bw, 1, use_color_cache); + if (use_color_cache) { + VP8LWriteBits(bw, 4, cache_bits); + } + + // Huffman image + meta huffman. 
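For each histogram tile the encoder keeps five Huffman codes: index 5 * histogram_ix + 0 covers green values, length prefixes and color-cache indices, + 1..3 cover the red, blue and alpha literals (note the order[] table above), and + 4 covers distance prefixes. Below is a small sketch of how a pixel position is mapped to its group; the VP8LSubSampleSize() expansion is an assumption (rounding-up division).

// Sketch: pixel (x, y) -> meta-Huffman group index, assuming
// VP8LSubSampleSize(size, bits) == (size + (1 << bits) - 1) >> bits.
static int HistogramIndex(int x, int y, int width, int histo_bits) {
  const int histo_xsize =
      histo_bits ? (width + (1 << histo_bits) - 1) >> histo_bits : 1;
  return histo_bits ? (y >> histo_bits) * histo_xsize + (x >> histo_bits) : 0;
}
// The five codes of group ix then live at indices 5 * ix + k, with
// k == 0: green + length prefixes + cache indices, k == 1: red,
// k == 2: blue, k == 3: alpha, k == 4: distance prefixes.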
+ write_histogram_image = (histogram_image_size > 1); + VP8LWriteBits(bw, 1, write_histogram_image); + if (write_histogram_image) { + uint32_t* const histogram_argb = + (uint32_t*)malloc(histogram_image_xysize * sizeof(*histogram_argb)); + int max_index = 0; + if (histogram_argb == NULL) goto Error; + for (i = 0; i < histogram_image_xysize; ++i) { + const int index = histogram_symbols[i] & 0xffff; + histogram_argb[i] = 0xff000000 | (index << 8); + if (index >= max_index) { + max_index = index + 1; + } + } + histogram_image_size = max_index; + + VP8LWriteBits(bw, 4, histogram_bits); + ok = EncodeImageInternal(bw, histogram_argb, + VP8LSubSampleSize(width, histogram_bits), + VP8LSubSampleSize(height, histogram_bits), + quality, 0, 0); + free(histogram_argb); + if (!ok) goto Error; + } + + // Store Huffman codes. + for (i = 0; i < histogram_image_size; ++i) { + int k; + for (k = 0; k < 5; ++k) { + const uint8_t* const cur_bit_lengths = bit_lengths[5 * i + k]; + const int cur_bit_lengths_size = (k == 0) ? + 256 + kLengthCodes + color_cache_size : + bit_lengths_sizes[5 * i + k]; + if (!StoreHuffmanCode(bw, cur_bit_lengths, cur_bit_lengths_size)) { + goto Error; + } + } + } + + // Free combined histograms. + free(histogram_image); + histogram_image = NULL; + + // Emit no bits if there is only one symbol in the histogram. + // This gives better compression for some images. + for (i = 0; i < 5 * histogram_image_size; ++i) { + ClearHuffmanTreeIfOnlyOneSymbol(bit_lengths_sizes[i], bit_lengths[i], + bit_codes[i]); + } + // Store actual literals. + StoreImageToBitMask(bw, width, histogram_bits, &refs, + histogram_symbols, bit_lengths, bit_codes); + ok = 1; + + Error: + if (!ok) free(histogram_image); + + VP8LClearBackwardRefs(&refs); + for (i = 0; i < 5 * histogram_image_size; ++i) { + free(bit_lengths[i]); + free(bit_codes[i]); + } + free(bit_lengths_sizes); + free(bit_lengths); + free(bit_codes); + free(histogram_symbols); + return ok; +} + +// ----------------------------------------------------------------------------- +// Transforms + +// Check if it would be a good idea to subtract green from red and blue. We +// only impact entropy in red/blue components, don't bother to look at others. +static int EvalAndApplySubtractGreen(const VP8LEncoder* const enc, + int width, int height, + VP8LBitWriter* const bw) { + if (!enc->use_palette_) { + int i; + const uint32_t* const argb = enc->argb_; + double bit_cost_before, bit_cost_after; + VP8LHistogram* const histo = (VP8LHistogram*)malloc(sizeof(*histo)); + if (histo == NULL) return 0; + + VP8LHistogramInit(histo, 1); + for (i = 0; i < width * height; ++i) { + const uint32_t c = argb[i]; + ++histo->red_[(c >> 16) & 0xff]; + ++histo->blue_[(c >> 0) & 0xff]; + } + bit_cost_before = VP8LHistogramEstimateBits(histo); + + VP8LHistogramInit(histo, 1); + for (i = 0; i < width * height; ++i) { + const uint32_t c = argb[i]; + const int green = (c >> 8) & 0xff; + ++histo->red_[((c >> 16) - green) & 0xff]; + ++histo->blue_[((c >> 0) - green) & 0xff]; + } + bit_cost_after = VP8LHistogramEstimateBits(histo); + free(histo); + + // Check if subtracting green yields low entropy. 
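The histogram pass above models the subtract-green transform directly: red and blue are replaced by their difference with green, modulo 256. Here is a per-pixel sketch of the forward transform that VP8LSubtractGreenFromBlueAndRed is expected to perform, plus the assumed inverse used on the decoder side.

#include <stdint.h>

// Forward subtract-green on one 0xAARRGGBB pixel (mirrors the histogram
// code above); alpha and green are left untouched. Sketch only.
static uint32_t SubtractGreen(uint32_t argb) {
  const uint32_t green = (argb >> 8) & 0xff;
  const uint32_t red   = (((argb >> 16) & 0xff) - green) & 0xff;
  const uint32_t blue  = (((argb >>  0) & 0xff) - green) & 0xff;
  return (argb & 0xff00ff00u) | (red << 16) | blue;
}

// Assumed inverse (decoder side): add green back, modulo 256.
static uint32_t AddGreen(uint32_t argb) {
  const uint32_t green = (argb >> 8) & 0xff;
  const uint32_t red   = (((argb >> 16) & 0xff) + green) & 0xff;
  const uint32_t blue  = (((argb >>  0) & 0xff) + green) & 0xff;
  return (argb & 0xff00ff00u) | (red << 16) | blue;
}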
+ if (bit_cost_after < bit_cost_before) { + VP8LWriteBits(bw, 1, TRANSFORM_PRESENT); + VP8LWriteBits(bw, 2, SUBTRACT_GREEN); + VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height); + } + } + return 1; +} + +static int ApplyPredictFilter(const VP8LEncoder* const enc, + int width, int height, int quality, + VP8LBitWriter* const bw) { + const int pred_bits = enc->transform_bits_; + const int transform_width = VP8LSubSampleSize(width, pred_bits); + const int transform_height = VP8LSubSampleSize(height, pred_bits); + + VP8LResidualImage(width, height, pred_bits, enc->argb_, enc->argb_scratch_, + enc->transform_data_); + VP8LWriteBits(bw, 1, TRANSFORM_PRESENT); + VP8LWriteBits(bw, 2, PREDICTOR_TRANSFORM); + VP8LWriteBits(bw, 4, pred_bits); + if (!EncodeImageInternal(bw, enc->transform_data_, + transform_width, transform_height, quality, 0, 0)) { + return 0; + } + return 1; +} + +static int ApplyCrossColorFilter(const VP8LEncoder* const enc, + int width, int height, int quality, + VP8LBitWriter* const bw) { + const int ccolor_transform_bits = enc->transform_bits_; + const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits); + const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits); + const int step = (quality == 0) ? 32 : 8; + + VP8LColorSpaceTransform(width, height, ccolor_transform_bits, step, + enc->argb_, enc->transform_data_); + VP8LWriteBits(bw, 1, TRANSFORM_PRESENT); + VP8LWriteBits(bw, 2, CROSS_COLOR_TRANSFORM); + VP8LWriteBits(bw, 4, ccolor_transform_bits); + if (!EncodeImageInternal(bw, enc->transform_data_, + transform_width, transform_height, quality, 0, 0)) { + return 0; + } + return 1; +} + +// ----------------------------------------------------------------------------- + +static void PutLE32(uint8_t* const data, uint32_t val) { + data[0] = (val >> 0) & 0xff; + data[1] = (val >> 8) & 0xff; + data[2] = (val >> 16) & 0xff; + data[3] = (val >> 24) & 0xff; +} + +static WebPEncodingError WriteRiffHeader(const VP8LEncoder* const enc, + size_t riff_size, size_t vp8l_size) { + const WebPPicture* const pic = enc->pic_; + uint8_t riff[HEADER_SIZE + SIGNATURE_SIZE] = { + 'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P', + 'V', 'P', '8', 'L', 0, 0, 0, 0, LOSSLESS_MAGIC_BYTE, + }; + if (riff_size < (vp8l_size + TAG_SIZE + CHUNK_HEADER_SIZE)) { + return VP8_ENC_ERROR_INVALID_CONFIGURATION; + } + PutLE32(riff + TAG_SIZE, (uint32_t)riff_size); + PutLE32(riff + RIFF_HEADER_SIZE + TAG_SIZE, (uint32_t)vp8l_size); + if (!pic->writer(riff, sizeof(riff), pic)) { + return VP8_ENC_ERROR_BAD_WRITE; + } + return VP8_ENC_OK; +} + +static void WriteImageSize(VP8LEncoder* const enc, VP8LBitWriter* const bw) { + WebPPicture* const pic = enc->pic_; + const int width = pic->width - 1; + const int height = pic->height -1; + assert(width < WEBP_MAX_DIMENSION && height < WEBP_MAX_DIMENSION); + + VP8LWriteBits(bw, IMAGE_SIZE_BITS, width); + VP8LWriteBits(bw, IMAGE_SIZE_BITS, height); +} + +static WebPEncodingError WriteImage(const VP8LEncoder* const enc, + VP8LBitWriter* const bw) { + size_t riff_size, vp8l_size, webpll_size, pad; + const WebPPicture* const pic = enc->pic_; + WebPEncodingError err = VP8_ENC_OK; + const uint8_t* const webpll_data = VP8LBitWriterFinish(bw); + + webpll_size = VP8LBitWriterNumBytes(bw); + vp8l_size = SIGNATURE_SIZE + webpll_size; + pad = vp8l_size & 1; + vp8l_size += pad; + + riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8l_size; + err = WriteRiffHeader(enc, riff_size, vp8l_size); + if (err != VP8_ENC_OK) goto Error; + + if 
(!pic->writer(webpll_data, webpll_size, pic)) { + err = VP8_ENC_ERROR_BAD_WRITE; + goto Error; + } + + if (pad) { + const uint8_t pad_byte[1] = { 0 }; + if (!pic->writer(pad_byte, 1, pic)) { + err = VP8_ENC_ERROR_BAD_WRITE; + goto Error; + } + } + return VP8_ENC_OK; + + Error: + return err; +} + +// ----------------------------------------------------------------------------- + +// Allocates the memory for argb (W x H) buffer, 2 rows of context for +// prediction and transform data. +static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc, + int width, int height) { + WebPEncodingError err = VP8_ENC_OK; + const size_t tile_size = 1 << enc->transform_bits_; + const size_t image_size = height * width; + const size_t argb_scratch_size = (tile_size + 1) * width; + const size_t transform_data_size = + VP8LSubSampleSize(height, enc->transform_bits_) * + VP8LSubSampleSize(width, enc->transform_bits_); + const size_t total_size = + image_size + argb_scratch_size + transform_data_size; + uint32_t* mem = (uint32_t*)malloc(total_size * sizeof(*mem)); + if (mem == NULL) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + enc->argb_ = mem; + mem += image_size; + enc->argb_scratch_ = mem; + mem += argb_scratch_size; + enc->transform_data_ = mem; + enc->current_width_ = width; + + Error: + return err; +} + +// Bundles multiple (2, 4 or 8) pixels into a single pixel. +// Returns the new xsize. +static void BundleColorMap(const uint32_t* const argb, + int width, int height, int xbits, + uint32_t* bundled_argb, int xs) { + int x, y; + const int bit_depth = 1 << (3 - xbits); + uint32_t code = 0; + + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + const int mask = (1 << xbits) - 1; + const int xsub = x & mask; + if (xsub == 0) { + code = 0; + } + // TODO(vikasa): simplify the bundling logic. + code |= (argb[y * width + x] & 0xff00) << (bit_depth * xsub); + bundled_argb[y * xs + (x >> xbits)] = 0xff000000 | code; + } + } +} + +// Note: Expects "enc->palette_" to be set properly. +// Also, "enc->palette_" will be modified after this call and should not be used +// later. +static WebPEncodingError ApplyPalette(VP8LBitWriter* const bw, + VP8LEncoder* const enc, + int width, int height, int quality) { + WebPEncodingError err = VP8_ENC_OK; + int i; + uint32_t* const argb = enc->pic_->argb; + uint32_t* const palette = enc->palette_; + const int palette_size = enc->palette_size_; + + // Replace each input pixel by corresponding palette index. + for (i = 0; i < width * height; ++i) { + int k; + for (k = 0; k < palette_size; ++k) { + const uint32_t pix = argb[i]; + if (pix == palette[k]) { + argb[i] = 0xff000000u | (k << 8); + break; + } + } + } + + // Save palette to bitstream. + VP8LWriteBits(bw, 1, TRANSFORM_PRESENT); + VP8LWriteBits(bw, 2, COLOR_INDEXING_TRANSFORM); + VP8LWriteBits(bw, 8, palette_size - 1); + for (i = palette_size - 1; i >= 1; --i) { + palette[i] = VP8LSubPixels(palette[i], palette[i - 1]); + } + if (!EncodeImageInternal(bw, palette, palette_size, 1, quality, 0, 0)) { + err = VP8_ENC_ERROR_INVALID_CONFIGURATION; + goto Error; + } + + if (palette_size <= 16) { + // Image can be packed (multiple pixels per uint32_t). 
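Packing works by keeping the palette index in the green byte and shifting successive indices into higher bit positions of that byte, which is what BundleColorMap's code |= (argb & 0xff00) << (bit_depth * xsub) achieves. A worked sketch for xbits == 2 (palette of at most four colors, so four 2-bit indices per packed pixel):

#include <stdint.h>

// Sketch: pack four 2-bit palette indices i0..i3 (values 0..3) into the
// green byte of one ARGB pixel, matching BundleColorMap for xbits == 2
// (bit_depth == 1 << (3 - xbits) == 2).
static uint32_t Pack4Indices(int i0, int i1, int i2, int i3) {
  const uint32_t green = (uint32_t)(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6));
  return 0xff000000u | (green << 8);
}
// Example: indices {1, 3, 0, 2} give green == 0x8d, i.e. pixel 0xff008d00.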
+ int xbits = 1; + if (palette_size <= 2) { + xbits = 3; + } else if (palette_size <= 4) { + xbits = 2; + } + err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height); + if (err != VP8_ENC_OK) goto Error; + BundleColorMap(argb, width, height, xbits, enc->argb_, enc->current_width_); + } + + Error: + return err; +} + +// ----------------------------------------------------------------------------- + +static int GetHistoBits(const WebPConfig* const config, + const WebPPicture* const pic) { + const int width = pic->width; + const int height = pic->height; + const size_t hist_size = sizeof(VP8LHistogram); + int histo_bits = 9 - (int)(config->quality / 16.f + .5f); + while (1) { + const size_t huff_image_size = VP8LSubSampleSize(width, histo_bits) * + VP8LSubSampleSize(height, histo_bits) * + hist_size; + if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break; + ++histo_bits; + } + return (histo_bits < 3) ? 3 : (histo_bits > 10) ? 10 : histo_bits; +} + +static void InitEncParams(VP8LEncoder* const enc) { + const WebPConfig* const config = enc->config_; + const WebPPicture* const picture = enc->pic_; + const int method = config->method; + const float quality = config->quality; + enc->transform_bits_ = (method < 4) ? 5 : (method > 4) ? 3 : 4; + enc->histo_bits_ = GetHistoBits(config, picture); + enc->cache_bits_ = (quality <= 25.f) ? 0 : 7; +} + +// ----------------------------------------------------------------------------- +// VP8LEncoder + +static VP8LEncoder* NewVP8LEncoder(const WebPConfig* const config, + WebPPicture* const picture) { + VP8LEncoder* const enc = (VP8LEncoder*)calloc(1, sizeof(*enc)); + if (enc == NULL) { + WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY); + return NULL; + } + enc->config_ = config; + enc->pic_ = picture; + return enc; +} + +static void DeleteVP8LEncoder(VP8LEncoder* enc) { + free(enc->argb_); + free(enc); +} + +// ----------------------------------------------------------------------------- +// Main call + +int VP8LEncodeImage(const WebPConfig* const config, + WebPPicture* const picture) { + int ok = 0; + int width, height, quality; + VP8LEncoder* enc = NULL; + WebPEncodingError err = VP8_ENC_OK; + VP8LBitWriter bw; + + if (config == NULL || picture == NULL) return 0; + + if (picture->argb == NULL) { + err = VP8_ENC_ERROR_NULL_PARAMETER; + goto Error; + } + + enc = NewVP8LEncoder(config, picture); + if (enc == NULL) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + width = picture->width; + height = picture->height; + quality = config->quality; + + InitEncParams(enc); + + // --------------------------------------------------------------------------- + // Analyze image (entropy, num_palettes etc) + + if (!VP8LEncAnalyze(enc)) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + + // Write image size. + VP8LBitWriterInit(&bw, (width * height) >> 1); + WriteImageSize(enc, &bw); + + if (enc->use_palette_) { + err = ApplyPalette(&bw, enc, width, height, quality); + if (err != VP8_ENC_OK) goto Error; + enc->cache_bits_ = 0; + } + + // In case image is not packed. + if (enc->argb_ == NULL) { + const size_t image_size = height * width; + err = AllocateTransformBuffer(enc, width, height); + if (err != VP8_ENC_OK) goto Error; + memcpy(enc->argb_, picture->argb, image_size * sizeof(*enc->argb_)); + enc->current_width_ = width; + } + + // --------------------------------------------------------------------------- + // Apply transforms and write transform data. 
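GetHistoBits starts from 9 - quality / 16 (rounded) and only coarsens the tiling further when the histogram image would exceed MAX_HUFF_IMAGE_SIZE, clamping the result to [3, 10]. A sketch of the initial mapping, without the size-driven growth loop:

// Sketch of GetHistoBits' starting point: quality -> histogram tile bits,
// with the [3, 10] clamp but without the MAX_HUFF_IMAGE_SIZE loop.
static int InitialHistoBits(float quality) {
  const int histo_bits = 9 - (int)(quality / 16.f + .5f);
  return (histo_bits < 3) ? 3 : (histo_bits > 10) ? 10 : histo_bits;
}
// quality 0 -> 9 (fine tiles), 50 -> 6, 75 -> 4, 100 -> 3 (coarse tiles).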
+ + if (!EvalAndApplySubtractGreen(enc, enc->current_width_, height, &bw)) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + + if (enc->use_predict_) { + if (!ApplyPredictFilter(enc, enc->current_width_, height, quality, &bw)) { + err = VP8_ENC_ERROR_INVALID_CONFIGURATION; + goto Error; + } + } + + if (enc->use_cross_color_) { + if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality, + &bw)) { + err = VP8_ENC_ERROR_INVALID_CONFIGURATION; + goto Error; + } + } + + VP8LWriteBits(&bw, 1, !TRANSFORM_PRESENT); // No more transforms. + + // --------------------------------------------------------------------------- + // Estimate the color cache size. + + if (enc->cache_bits_ > 0) { + if (!VP8LCalculateEstimateForCacheSize(enc->argb_, enc->current_width_, + height, &enc->cache_bits_)) { + err = VP8_ENC_ERROR_INVALID_CONFIGURATION; + goto Error; + } + } + + // --------------------------------------------------------------------------- + // Encode and write the transformed image. + + ok = EncodeImageInternal(&bw, enc->argb_, enc->current_width_, height, + quality, enc->cache_bits_, enc->histo_bits_); + if (!ok) goto Error; + + err = WriteImage(enc, &bw); + if (err != VP8_ENC_OK) { + ok = 0; + goto Error; + } + + if (picture->stats != NULL) { + WebPAuxStats* const stats = picture->stats; + memset(stats, 0, sizeof(*stats)); + stats->PSNR[0] = 99.; + stats->PSNR[1] = 99.; + stats->PSNR[2] = 99.; + stats->PSNR[3] = 99.; + // note: padding byte may be missing. Not a big deal. + stats->coded_size = VP8LBitWriterNumBytes(&bw) + HEADER_SIZE; + } + + if (picture->extra_info != NULL) { + const int mb_w = (width + 15) >> 4; + const int mb_h = (height + 15) >> 4; + memset(picture->extra_info, 0, mb_w * mb_h * sizeof(*picture->extra_info)); + } + + Error: + VP8LBitWriterDestroy(&bw); + DeleteVP8LEncoder(enc); + if (!ok) { + assert(err != VP8_ENC_OK); + WebPEncodingSetError(picture, err); + } + return ok; +} + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif diff --git a/src/enc/vp8li.h b/src/enc/vp8li.h new file mode 100644 index 00000000..6ddf6978 --- /dev/null +++ b/src/enc/vp8li.h @@ -0,0 +1,78 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Lossless encoder: internal header. +// +// Author: Vikas Arora (vikaas.arora@gmail.com) + +#ifndef WEBP_ENC_VP8LI_H_ +#define WEBP_ENC_VP8LI_H_ + +#ifdef USE_LOSSLESS_ENCODER + +#include "./histogram.h" +#include "../webp/encode.h" +#include "../utils/bit_writer.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +// TODO(vikasa): factorize these with ones used in lossless decoder. +#define TAG_SIZE 4 +#define CHUNK_HEADER_SIZE 8 +#define RIFF_HEADER_SIZE 12 +#define HEADER_SIZE (RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE) +#define SIGNATURE_SIZE 1 +#define LOSSLESS_MAGIC_BYTE 0x64 + +#define MAX_PALETTE_SIZE 256 +#define PALETTE_KEY_RIGHT_SHIFT 22 // Key for 1K buffer. + +typedef struct { + const WebPConfig* config_; // user configuration and parameters + WebPPicture* pic_; // input picture. + + uint32_t* argb_; // Transformed argb image data. 
+ uint32_t* argb_scratch_; // Scratch memory for argb rows + // (used for prediction). + uint32_t* transform_data_; // Scratch memory for transform data. + int current_width_; // Corresponds to packed image width. + + // Encoding parameters derived from quality parameter. + int histo_bits_; + int transform_bits_; + int cache_bits_; // If equal to 0, don't use color cache. + + // Encoding parameters derived from image characteristics. + int use_cross_color_; + int use_predict_; + int use_palette_; + int palette_size_; + uint32_t palette_[MAX_PALETTE_SIZE]; +} VP8LEncoder; + +//------------------------------------------------------------------------------ +// internal functions. Not public. + +// in vp8l.c + +// Encodes the picture. +// Returns 0 if config or picture is NULL or picture doesn't have valid argb +// input. +int VP8LEncodeImage(const WebPConfig* const config, + WebPPicture* const picture); + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif + +#endif /* WEBP_ENC_VP8LI_H_ */ diff --git a/src/enc/webpenc.c b/src/enc/webpenc.c index ee68d741..92c83c08 100644 --- a/src/enc/webpenc.c +++ b/src/enc/webpenc.c @@ -15,6 +15,7 @@ #include #include "./vp8enci.h" +#include "./vp8li.h" // #define PRINT_MEMORY_INFO @@ -142,8 +143,8 @@ static void MapConfigToTools(VP8Encoder* const enc) { // LFStats: 2048 // Picture size (yuv): 589824 -static VP8Encoder* InitEncoder(const WebPConfig* const config, - WebPPicture* const picture) { +static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, + WebPPicture* const picture) { const int use_filter = (config->filter_strength > 0) || (config->autofilter > 0); const int mb_w = (picture->width + 15) >> 4; @@ -259,7 +260,7 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config, return enc; } -static void DeleteEncoder(VP8Encoder* enc) { +static void DeleteVP8Encoder(VP8Encoder* enc) { if (enc != NULL) { VP8EncDeleteAlpha(enc); #ifdef WEBP_EXPERIMENTAL_FEATURES @@ -327,7 +328,6 @@ int WebPReportProgress(VP8Encoder* const enc, int percent) { //------------------------------------------------------------------------------ int WebPEncode(const WebPConfig* const config, WebPPicture* const pic) { - VP8Encoder* enc; int ok; if (pic == NULL) @@ -339,27 +339,40 @@ int WebPEncode(const WebPConfig* const config, WebPPicture* const pic) { return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION); if (pic->width <= 0 || pic->height <= 0) return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION); - if (pic->y == NULL || pic->u == NULL || pic->v == NULL) - return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER); if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION); - enc = InitEncoder(config, pic); - if (enc == NULL) return 0; // pic->error is already set. - // Note: each of the tasks below account for 20% in the progress report. - ok = VP8EncAnalyze(enc) - && VP8StatLoop(enc) - && VP8EncLoop(enc) - && VP8EncFinishAlpha(enc) + if (!config->lossless) { + VP8Encoder* enc = NULL; + if (pic->y == NULL || pic->u == NULL || pic->v == NULL) + return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER); + + enc = InitVP8Encoder(config, pic); + if (enc == NULL) return 0; // pic->error is already set. + // Note: each of the tasks below account for 20% in the progress report. 
+ ok = VP8EncAnalyze(enc) + && VP8StatLoop(enc) + && VP8EncLoop(enc) + && VP8EncFinishAlpha(enc) #ifdef WEBP_EXPERIMENTAL_FEATURES - && VP8EncFinishLayer(enc) + && VP8EncFinishLayer(enc) +#endif + && VP8EncWrite(enc); + StoreStats(enc); + if (!ok) { + VP8EncFreeBitWriters(enc); + } + DeleteVP8Encoder(enc); + } else { +#ifdef USE_LOSSLESS_ENCODER + if (pic->argb == NULL) + return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER); + + ok = VP8LEncodeImage(config, pic); // Sets pic->error in case of problem. +#else + return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION); #endif - && VP8EncWrite(enc); - StoreStats(enc); - if (!ok) { - VP8EncFreeBitWriters(enc); } - DeleteEncoder(enc); return ok; } diff --git a/src/utils/bit_writer.c b/src/utils/bit_writer.c index 6ab44833..bc6e4097 100644 --- a/src/utils/bit_writer.c +++ b/src/utils/bit_writer.c @@ -8,6 +8,7 @@ // Bit writing and boolean coder // // Author: Skal (pascal.massimino@gmail.com) +// Vikas Arora (vikaas.arora@gmail.com) #include #include // for memcpy() @@ -186,6 +187,86 @@ void VP8BitWriterWipeOut(VP8BitWriter* const bw) { } } +#ifdef USE_LOSSLESS_ENCODER +//------------------------------------------------------------------------------ +// VP8LBitWriter + +// Returns 1 on success. +static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) { + uint8_t* allocated_buf; + size_t allocated_size; + const size_t size_required = VP8LBitWriterNumBytes(bw) + extra_size; + if ((bw->max_bytes_ > 0) && (size_required <= bw->max_bytes_)) return 1; + allocated_size = (3 * bw->max_bytes_) >> 1; + if (allocated_size < size_required) { + allocated_size = size_required; + } + // Make Allocated size multiple of KBs + allocated_size = (((allocated_size >> 10) + 1) << 10); + allocated_buf = (uint8_t*)malloc(allocated_size); + if (allocated_buf == NULL) return 0; + memset(allocated_buf, 0, allocated_size); + if (bw->bit_pos_ > 0) { + memcpy(allocated_buf, bw->buf_, VP8LBitWriterNumBytes(bw)); + } + free(bw->buf_); + bw->buf_ = allocated_buf; + bw->max_bytes_ = allocated_size; + return 1; +} + +int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) { + memset(bw, 0, sizeof(*bw)); + return VP8LBitWriterResize(bw, expected_size); +} + +void VP8LBitWriterDestroy(VP8LBitWriter* const bw) { + if (bw != NULL) { + free(bw->buf_); + memset(bw, 0, sizeof(*bw)); + } +} + +void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits) { + if (n_bits < 1) return; +#if !defined(__BIG_ENDIAN__) + // Technically, this branch of the code can write up to 25 bits at a time, + // but in deflate, the maximum number of bits written is 16 at a time. 
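The 25-bit limit mentioned above comes from the unaligned 32-bit read-modify-write used just below on little-endian targets: up to 7 bits of the first byte may already be occupied, leaving 32 - 7 = 25 usable positions. An endian-neutral, byte-at-a-time sketch of the same LSB-first packing:

#include <stddef.h>
#include <stdint.h>

// Portable sketch of LSB-first bit packing (what VP8LWriteBits achieves
// with a single unaligned 32-bit OR on little-endian CPUs). The buffer is
// assumed to be zero-initialized, as VP8LBitWriterResize guarantees.
static void PutBitsLSBFirst(uint8_t* buf, size_t* bit_pos,
                            int n_bits, uint32_t bits) {
  int i;
  for (i = 0; i < n_bits; ++i) {
    const size_t p = *bit_pos + i;
    buf[p >> 3] |= (uint8_t)(((bits >> i) & 1) << (p & 7));
  }
  *bit_pos += n_bits;
}
// E.g. writing 3 bits 0b101 and then 5 bits 0b11010 leaves byte 0 equal to
// 0b11010101 (0xd5): the first value in bits 0..2, the second in bits 3..7.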
+ { + uint8_t* p = &bw->buf_[bw->bit_pos_ >> 3]; + uint32_t v = *(const uint32_t*)(p); + v |= bits << (bw->bit_pos_ & 7); + *(uint32_t*)(p) = v; + bw->bit_pos_ += n_bits; + } +#else // LITTLE_ENDIAN + // implicit & 0xff is assumed for uint8_t arithmetics + { + uint8_t* p = &bw->buf_[bw->bit_pos_ >> 3]; + const int bits_reserved_in_first_byte = (bw->bit_pos_ & 7); + *p++ |= (bits << bits_reserved_in_first_byte); + const int bits_left_to_write = n_bits - 8 + bits_reserved_in_first_byte; + if (bits_left_to_write >= 1) { + *p++ = bits >> (8 - bits_reserved_in_first_byte); + if (bits_left_to_write >= 9) { + *p++ = bits >> (16 - bits_reserved_in_first_byte); + } + } + *p = 0; + bw->bit_pos_ += n_bits; + } +#endif // BIG_ENDIAN + if ((bw->bit_pos_ >> 3) > (bw->max_bytes_ - 8)) { + const size_t kAdditionalBuffer = 32768 + bw->max_bytes_; + if (!VP8LBitWriterResize(bw, kAdditionalBuffer)) { + bw->bit_pos_ = 0; + bw->error_ = 1; + } + } +} + +#endif + //------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) diff --git a/src/utils/bit_writer.h b/src/utils/bit_writer.h index c85356fe..b9cf0b92 100644 --- a/src/utils/bit_writer.h +++ b/src/utils/bit_writer.h @@ -64,7 +64,59 @@ static WEBP_INLINE size_t VP8BitWriterSize(const VP8BitWriter* const bw) { return bw->pos_; } +#ifdef USE_LOSSLESS_ENCODER //------------------------------------------------------------------------------ +// VP8LBitWriter +// TODO(vikasa): VP8LBitWriter is copied as-is from lossless code. There's scope +// of re-using VP8BitWriter. Will evaluate once basic lossless encoder is +// implemented. + +typedef struct { + uint8_t* buf_; + size_t bit_pos_; + size_t max_bytes_; + + // After all bits are written, the caller must observe the state of + // error_. A value of 1 indicates that a memory allocation failure + // has happened during bit writing. A value of 0 indicates successful + // writing of bits. + int error_; +} VP8LBitWriter; + +static WEBP_INLINE size_t VP8LBitWriterNumBytes(VP8LBitWriter* const bw) { + return (bw->bit_pos_ + 7) >> 3; +} + +static WEBP_INLINE uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) { + return bw->buf_; +} + +// Returns 0 in case of memory allocation error. +int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size); + +void VP8LBitWriterDestroy(VP8LBitWriter* const bw); + +// This function writes bits into bytes in increasing addresses, and within +// a byte least-significant-bit first. +// +// The function can write up to 16 bits in one go with WriteBits +// Example: let's assume that 3 bits (Rs below) have been written already: +// +// BYTE-0 BYTE+1 BYTE+2 +// +// 0000 0RRR 0000 0000 0000 0000 +// +// Now, we could write 5 or less bits in MSB by just sifting by 3 +// and OR'ing to BYTE-0. +// +// For n bits, we take the last 5 bytes, OR that with high bits in BYTE-0, +// and locate the rest in BYTE+1 and BYTE+2. +// +// VP8LBitWriter's error_ flag is set in case of memory allocation error. 
+void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits); + +//------------------------------------------------------------------------------ +#endif #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/src/utils/color_cache.c b/src/utils/color_cache.c index 06e8a718..9b1be7a5 100644 --- a/src/utils/color_cache.c +++ b/src/utils/color_cache.c @@ -32,14 +32,17 @@ int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) { return 1; } -void VP8LColorCacheDelete(VP8LColorCache* const cc) { +void VP8LColorCacheClear(VP8LColorCache* const cc) { if (cc != NULL) { free(cc->colors_); - free(cc); } } +void VP8LColorCacheDelete(VP8LColorCache* const cc) { + VP8LColorCacheClear(cc); + free(cc); +} + #if defined(__cplusplus) || defined(c_plusplus) } #endif - diff --git a/src/utils/color_cache.h b/src/utils/color_cache.h index aece1453..64435b5a 100644 --- a/src/utils/color_cache.h +++ b/src/utils/color_cache.h @@ -39,6 +39,7 @@ static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc, cc->colors_[key] = argb; } +#ifdef USE_LOSSLESS_ENCODER static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc, uint32_t argb) { return (kHashMul * argb) >> cc->hash_shift_; @@ -49,6 +50,7 @@ static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc, const uint32_t key = (kHashMul * argb) >> cc->hash_shift_; return cc->colors_[key] == argb; } +#endif //------------------------------------------------------------------------------ @@ -57,6 +59,9 @@ static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc, int VP8LColorCacheInit(VP8LColorCache* const color_cache, int hash_bits); // Delete the color cache. +void VP8LColorCacheClear(VP8LColorCache* const color_cache); + +// Delete the color_cache object. void VP8LColorCacheDelete(VP8LColorCache* const color_cache); //------------------------------------------------------------------------------ diff --git a/src/utils/huffman_encode.c b/src/utils/huffman_encode.c new file mode 100644 index 00000000..7d88187f --- /dev/null +++ b/src/utils/huffman_encode.c @@ -0,0 +1,318 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Author: jyrki@google.com (Jyrki Alakuijala) +// +// Flate like entropy encoding (Huffman) for webp lossless. + +#ifdef USE_LOSSLESS_ENCODER + +#include "./huffman_encode.h" + +#include +#include +#include + +typedef struct { + int total_count_; + int value_; + int pool_index_left_; + int pool_index_right_; +} HuffmanTree; + +// Sort the root nodes, most popular first. 
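The exported cache accessors are meant to be used from the encoder's backward-reference search: a pixel whose exact ARGB value already sits at its hash slot is coded as a short cache index (symbol 256 + kLengthCodes + idx in the green code, see StoreImageToBitMask), otherwise it is inserted and coded as a plain literal. A simplified sketch of that lookup; the include path and the exact call site are assumptions, not the actual GetBackwardReferences logic.

#include "../utils/color_cache.h"   // path as seen from src/enc/ (assumption)

// Simplified sketch: decide whether a literal pixel can be coded as a
// color-cache index. Requires USE_LOSSLESS_ENCODER for the accessors.
static int CodeLiteralOrCacheIdx(VP8LColorCache* const cc, uint32_t argb,
                                 int* const cache_idx) {
  if (VP8LColorCacheContains(cc, argb)) {
    *cache_idx = VP8LColorCacheGetIndex(cc, argb);  // coded at 256 + kLengthCodes + idx
    return 1;                                       // cache hit
  }
  VP8LColorCacheInsert(cc, argb);                   // miss: code as literal, remember it
  return 0;
}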
+static int CompHuffmanTree(const void* vp0, const void* vp1) { + const HuffmanTree* v0 = (const HuffmanTree*)vp0; + const HuffmanTree* v1 = (const HuffmanTree*)vp1; + if (v0->total_count_ > v1->total_count_) { + return -1; + } else if (v0->total_count_ < v1->total_count_) { + return 1; + } else { + if (v0->value_ < v1->value_) { + return -1; + } + if (v0->value_ > v1->value_) { + return 1; + } + return 0; + } +} + +static void SetDepth(const HuffmanTree* p, + HuffmanTree* pool, + uint8_t* depth, + const int level) { + if (p->pool_index_left_ >= 0) { + SetDepth(&pool[p->pool_index_left_], pool, depth, level + 1); + SetDepth(&pool[p->pool_index_right_], pool, depth, level + 1); + } else { + depth[p->value_] = level; + } +} + +// This function will create a Huffman tree. +// +// The catch here is that the tree cannot be arbitrarily deep. +// Deflate specifies a maximum depth of 15 bits for "code trees" +// and 7 bits for "code length code trees." +// +// count_limit is the value that is to be faked as the minimum value +// and this minimum value is raised until the tree matches the +// maximum length requirement. +// +// This algorithm is not of excellent performance for very long data blocks, +// especially when population counts are longer than 2**tree_limit, but +// we are not planning to use this with extremely long blocks. +// +// See http://en.wikipedia.org/wiki/Huffman_coding +int VP8LCreateHuffmanTree(const int* const histogram, int histogram_size, + int tree_depth_limit, + uint8_t* const bit_depths) { + HuffmanTree* tree; + HuffmanTree* tree_pool; + int tree_pool_size; + // For block sizes with less than 64k symbols we never need to do a + // second iteration of this loop. + // If we actually start running inside this loop a lot, we would perhaps + // be better off with the Katajainen algorithm. + int count_limit; + for (count_limit = 1; ; count_limit *= 2) { + int tree_size = 0; + int i; + for (i = 0; i < histogram_size; ++i) { + if (histogram[i]) { + ++tree_size; + } + } + // 3 * tree_size is enough to cover all the nodes representing a + // population and all the inserted nodes combining two existing nodes. + // The tree pool needs 2 * (tree_size - 1) entities, and the + // tree needs exactly tree_size entities. + tree = (HuffmanTree*)malloc(3 * tree_size * sizeof(*tree)); + if (tree == NULL) { + return 0; + } + { + int j = 0; + int i; + for (i = 0; i < histogram_size; ++i) { + if (histogram[i]) { + const int count = + (histogram[i] < count_limit) ? count_limit : histogram[i]; + tree[j].total_count_ = count; + tree[j].value_ = i; + tree[j].pool_index_left_ = -1; + tree[j].pool_index_right_ = -1; + ++j; + } + } + } + qsort((void*)tree, tree_size, sizeof(*tree), CompHuffmanTree); + tree_pool = tree + tree_size; + tree_pool_size = 0; + if (tree_size >= 2) { + while (tree_size >= 2) { // Finish when we have only one root. + int count; + tree_pool[tree_pool_size] = tree[tree_size - 1]; + ++tree_pool_size; + tree_pool[tree_pool_size] = tree[tree_size - 2]; + ++tree_pool_size; + count = + tree_pool[tree_pool_size - 1].total_count_ + + tree_pool[tree_pool_size - 2].total_count_; + tree_size -= 2; + { + int k = 0; + // Search for the insertion point. 
+ for (k = 0; k < tree_size; ++k) { + if (tree[k].total_count_ <= count) { + break; + } + } + memmove(tree + (k + 1), tree + k, (tree_size - k) * sizeof(*tree)); + tree[k].total_count_ = count; + tree[k].value_ = -1; + + tree[k].pool_index_left_ = tree_pool_size - 1; + tree[k].pool_index_right_ = tree_pool_size - 2; + tree_size = tree_size + 1; + } + } + SetDepth(&tree[0], tree_pool, bit_depths, 0); + } else { + if (tree_size == 1) { + // Only one element. + bit_depths[tree[0].value_] = 1; + } + } + free(tree); + // We need to pack the Huffman tree in tree_depth_limit bits. + // If this was not successful, add fake entities to the lowest values + // and retry. + { + int max_depth = bit_depths[0]; + int j; + for (j = 1; j < histogram_size; ++j) { + if (max_depth < bit_depths[j]) { + max_depth = bit_depths[j]; + } + } + if (max_depth <= tree_depth_limit) { + break; + } + } + } + return 1; +} + +static void WriteHuffmanTreeRepetitions( + const int value, + const int prev_value, + int repetitions, + int* num_symbols, + uint8_t* tree, + uint8_t* extra_bits_data) { + if (value != prev_value) { + tree[*num_symbols] = value; + extra_bits_data[*num_symbols] = 0; + ++(*num_symbols); + --repetitions; + } + while (repetitions >= 1) { + if (repetitions < 3) { + int i; + for (i = 0; i < repetitions; ++i) { + tree[*num_symbols] = value; + extra_bits_data[*num_symbols] = 0; + ++(*num_symbols); + } + return; + } else if (repetitions < 7) { + // 3 to 6 left + tree[*num_symbols] = 16; + extra_bits_data[*num_symbols] = repetitions - 3; + ++(*num_symbols); + return; + } else { + tree[*num_symbols] = 16; + extra_bits_data[*num_symbols] = 3; + ++(*num_symbols); + repetitions -= 6; + } + } +} + +static void WriteHuffmanTreeRepetitionsZeros( + const int value, + int repetitions, + int* num_symbols, + uint8_t* tree, + uint8_t* extra_bits_data) { + while (repetitions >= 1) { + if (repetitions < 3) { + int i; + for (i = 0; i < repetitions; ++i) { + tree[*num_symbols] = value; + extra_bits_data[*num_symbols] = 0; + ++(*num_symbols); + } + return; + } else if (repetitions < 11) { + tree[*num_symbols] = 17; + extra_bits_data[*num_symbols] = repetitions - 3; + ++(*num_symbols); + return; + } else if (repetitions < 139) { + tree[*num_symbols] = 18; + extra_bits_data[*num_symbols] = repetitions - 11; + ++(*num_symbols); + return; + } else { + tree[*num_symbols] = 18; + extra_bits_data[*num_symbols] = 0x7f; // 138 repeated 0s + ++(*num_symbols); + repetitions -= 138; + } + } +} + +void VP8LCreateCompressedHuffmanTree(const uint8_t* const depth, + int depth_size, + int* num_symbols, + uint8_t* tree, + uint8_t* extra_bits_data) { + int prev_value = 8; // 8 is the initial value for rle. + int i; + for (i = 0; i < depth_size;) { + const int value = depth[i]; + int reps = 1; + int k; + for (k = i + 1; k < depth_size && depth[k] == value; ++k) { + ++reps; + } + if (value == 0) { + WriteHuffmanTreeRepetitionsZeros(value, reps, + num_symbols, + tree, extra_bits_data); + } else { + WriteHuffmanTreeRepetitions(value, prev_value, reps, + num_symbols, + tree, extra_bits_data); + prev_value = value; + } + i += reps; + } +} + +static uint32_t ReverseBits(int num_bits, uint32_t bits) { + uint32_t retval = 0; + int i; + for (i = 0; i < num_bits; ++i) { + retval <<= 1; + retval |= bits & 1; + bits >>= 1; + } + return retval; +} + +void VP8LConvertBitDepthsToSymbols(const uint8_t* depth, int len, + uint16_t* bits) { + // This function is based on RFC 1951. 
+ // + // In deflate, all bit depths are [1..15] + // 0 bit depth means that the symbol does not exist. + + // 0..15 are values for bits +#define MAX_BITS 16 + uint32_t next_code[MAX_BITS]; + uint32_t bl_count[MAX_BITS] = { 0 }; + int i; + { + for (i = 0; i < len; ++i) { + ++bl_count[depth[i]]; + } + bl_count[0] = 0; + } + next_code[0] = 0; + { + int code = 0; + int bits; + for (bits = 1; bits < MAX_BITS; ++bits) { + code = (code + bl_count[bits - 1]) << 1; + next_code[bits] = code; + } + } + for (i = 0; i < len; ++i) { + if (depth[i]) { + bits[i] = ReverseBits(depth[i], next_code[depth[i]]++); + } + } +} +#undef MAX_BITS + +#endif diff --git a/src/utils/huffman_encode.h b/src/utils/huffman_encode.h new file mode 100644 index 00000000..c931142d --- /dev/null +++ b/src/utils/huffman_encode.h @@ -0,0 +1,54 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Author: jyrki@google.com (Jyrki Alakuijala) +// +// Flate like entropy encoding (Huffman) for webp lossless + +#ifndef WEBP_UTILS_HUFFMAN_ENCODE_H_ +#define WEBP_UTILS_HUFFMAN_ENCODE_H_ + +#ifdef USE_LOSSLESS_ENCODER + +#include + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +// This function will create a Huffman tree. +// +// The (data,length) contains the population counts. +// The tree_limit is the maximum bit depth of the Huffman codes. +// +// The depth contains the tree, i.e., how many bits are used for +// the symbol. +// +// See http://en.wikipedia.org/wiki/Huffman_coding +// +// Returns 0 when an error has occured. +int VP8LCreateHuffmanTree(const int* data, const int length, + const int tree_limit, uint8_t* depth); + +// Write a huffman tree from bit depths. The generated Huffman tree is +// compressed once more using a Huffman tree. +void VP8LCreateCompressedHuffmanTree(const uint8_t* const depth, int len, + int* num_symbols, + uint8_t* tree, + uint8_t* extra_bits_data); + +// Get the actual bit values for a tree of bit depths. +void VP8LConvertBitDepthsToSymbols(const uint8_t* depth, int len, + uint16_t* bits); + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif + +#endif // WEBP_UTILS_HUFFMAN_ENCODE_H_ diff --git a/src/webp/encode.h b/src/webp/encode.h index cabc9823..28a1d6bf 100644 --- a/src/webp/encode.h +++ b/src/webp/encode.h @@ -76,6 +76,7 @@ typedef struct { // 0: none, 1: fast, 2: best. Default if 1. int alpha_quality; // Between 0 (smallest size) and 100 (lossless). // Default is 100. + int lossless; // Lossless encoding (0=lossy(default), 1=lossless). } WebPConfig; // Enumerate some predefined settings for WebPConfig, depending on the type @@ -189,7 +190,7 @@ struct WebPPicture { int width, height; // dimensions (less or equal to WEBP_MAX_DIMENSION) uint8_t *y, *u, *v; // pointers to luma/chroma planes. int y_stride, uv_stride; // luma/chroma strides. - uint8_t *a; // pointer to the alpha plane + uint8_t* a; // pointer to the alpha plane int a_stride; // stride of the alpha plane // output @@ -216,6 +217,10 @@ struct WebPPicture { WebPEncodingError error_code; // error code in case of problem. WebPProgressHook progress_hook; // if not NULL, called while encoding. + + int use_argb_input; // Flag for encoder to use argb pixels as input. 
+ uint32_t* argb; // Pointer to argb (32-bit) plane. + int argb_stride; // This is stride in pixel units, not bytes. +}; // Internal, version-checked, entry point
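Taken together with the new WebPConfig.lossless flag, these picture fields let a caller drive the lossless path through the regular WebPEncode() entry point. A minimal sketch, assuming the usual WebPConfigInit()/WebPPictureInit() helpers, the custom_ptr writer convention, and a build with USE_LOSSLESS_ENCODER defined; note that the encoder may rewrite the ARGB buffer in place (e.g. during palette indexing):

#include <stdio.h>
#include <stdint.h>
#include "webp/encode.h"

// Hypothetical writer: appends encoded bytes to the FILE* in custom_ptr.
static int FileWriter(const uint8_t* data, size_t data_size,
                      const WebPPicture* picture) {
  return fwrite(data, 1, data_size, (FILE*)picture->custom_ptr) == data_size;
}

// Encodes a caller-owned 0xAARRGGBB buffer losslessly into 'out'.
int EncodeLossless(uint32_t* argb, int width, int height, FILE* out) {
  WebPConfig config;
  WebPPicture picture;
  if (!WebPConfigInit(&config) || !WebPPictureInit(&picture)) return 0;
  config.lossless = 1;           // route WebPEncode() to VP8LEncodeImage()
  picture.use_argb_input = 1;    // feed ARGB instead of YUV planes
  picture.width = width;
  picture.height = height;
  picture.argb = argb;           // may be modified by the encoder
  picture.argb_stride = width;   // stride in pixels, not bytes
  picture.writer = FileWriter;
  picture.custom_ptr = out;
  return WebPEncode(&config, &picture);
}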