move the decoder sourcetree to a sub-location src/dec

to make room for future libs sources also extract the types declaration into its own types.h file Change-Id: I8bae8a323a479a29375cf509792228ae6af51c7a
2025-07-14 21:09:55 +02:00 · 2011-01-06 07:25:43 -08:00
parent a9b3eab6fd
commit 6421a7a4fb
22 changed files with 17747 additions and 23160 deletions
--- a/src/dec/bits.c
+++ b/src/dec/bits.c
@ -0,0 +1,78 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Boolean decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "bits.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//-----------------------------------------------------------------------------
+// VP8BitReader
+
+void VP8Init(VP8BitReader* const br, const uint8_t* buf, uint32_t size) {
+  assert(br);
+  assert(buf);
+  br->range_ = 255 - 1;
+  br->eof_ = 0;
+  br->buf_ = buf;
+  br->buf_end_ = buf + size;
+  // Need two initial bytes.
+  br->value_ = (VP8GetByte(br) << 8) | VP8GetByte(br);
+  br->left_ = -8;
+}
+
+const uint8_t kVP8Log2Range[128] = {
+     7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0
+};
+
+// range = ((range + 1) << kVP8Log2Range[range]) - 1
+const uint8_t kVP8NewRange[128] = {
+  127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
+  127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
+  247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
+  183, 187, 191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239,
+  243, 247, 251, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149,
+  151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179,
+  181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209,
+  211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239,
+  241, 243, 245, 247, 249, 251, 253, 127
+};
+
+//-----------------------------------------------------------------------------
+// Higher-level calls
+
+uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
+  uint32_t v = 0;
+  while (bits-- > 0) {
+    v |= VP8GetBit(br, 0x80) << bits;
+  }
+  return v;
+}
+
+int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
+  const int value = VP8GetValue(br, bits);
+  return VP8Get(br) ? -value : value;
+}
+
+//-----------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/bits.h
+++ b/src/dec/bits.h
@ -0,0 +1,106 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Boolean decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DECODE_BITS_H_
+#define WEBP_DECODE_BITS_H_
+
+#include <assert.h>
+#include "webp/decode_vp8.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//-----------------------------------------------------------------------------
+// Bitreader and code-tree reader
+
+typedef struct {
+  const uint8_t* buf_;        // next byte to be read
+  const uint8_t* buf_end_;    // end of read buffer
+  int eof_;                   // true if input is exhausted
+
+  // boolean decoder
+  uint32_t range_;            // current range minus 1. In [127, 254] interval.
+  uint32_t value_;            // current value
+  int left_;                  // how many unused bits (negated)
+} VP8BitReader;
+
+// Initialize the bit reader and the boolean decoder.
+void VP8Init(VP8BitReader* const br, const uint8_t* buf, uint32_t size);
+
+// return the next value made of 'num_bits' bits
+uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
+static inline uint32_t VP8Get(VP8BitReader* const br) {
+  return VP8GetValue(br, 1);
+}
+
+// return the next value with sign-extension.
+int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
+
+// Read a bit with proba 'prob'. Speed-critical function!
+extern const uint8_t kVP8Log2Range[128];
+extern const uint8_t kVP8NewRange[128];
+static inline uint32_t VP8GetByte(VP8BitReader* const br) {
+  assert(br);
+  if (br->buf_ < br->buf_end_) {
+    assert(br->buf_);
+    return *br->buf_++;
+  }
+  br->eof_ = 1;
+  return 0x80;
+}
+
+static inline void VP8Shift(VP8BitReader* const br) {
+  // range_ is in [0..127] interval here.
+  const int shift = kVP8Log2Range[br->range_];
+  br->range_ = kVP8NewRange[br->range_];
+  br->value_ <<= shift;
+  br->left_ += shift;
+  if (br->left_ > 0) {
+    br->value_ |= VP8GetByte(br) << br->left_;
+    br->left_ -= 8;
+  }
+}
+
+static inline uint32_t VP8GetBit(VP8BitReader* const br, int prob) {
+  const uint32_t split = (br->range_ * prob) >> 8;
+  const uint32_t bit = ((br->value_ >> 8) > split);
+  if (bit) {
+    br->range_ -= split + 1;
+    br->value_ -= (split + 1) << 8;
+  } else {
+    br->range_ = split;
+  }
+  if (br->range_ < 0x7f) {
+    VP8Shift(br);
+  }
+  return bit;
+}
+
+static inline int VP8GetSigned(VP8BitReader* const br, int v) {
+  const uint32_t split = br->range_ >> 1;
+  const uint32_t bit = ((br->value_ >> 8) > split);
+  if (bit) {
+    br->range_ -= split + 1;
+    br->value_ -= (split + 1) << 8;
+    v = -v;
+  } else {
+    br->range_ = split;
+  }
+  VP8Shift(br);
+  return v;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DECODE_BITS_H_
--- a/src/dec/dsp.c
+++ b/src/dec/dsp.c
@ -0,0 +1,694 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// speed-critical functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "vp8i.h"
+
+#if defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//-----------------------------------------------------------------------------
+// run-time tables (~4k)
+
+static uint8_t abs0[255 + 255 + 1];     // abs(i)
+static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
+static int8_t sclip1[1020 + 1020 + 1];  // clips [-1020, 1020] to [-128, 127]
+static int8_t sclip2[112 + 112 + 1];    // clips [-112, 112] to [-16, 15]
+static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
+
+static int tables_ok = 0;
+
+void VP8DspInitTables() {
+  if (!tables_ok) {
+    int i;
+    for (i = -255; i <= 255; ++i) {
+      abs0[255 + i] = (i < 0) ? -i : i;
+      abs1[255 + i] = abs0[255 + i] >> 1;
+    }
+    for (i = -1020; i <= 1020; ++i) {
+      sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
+    }
+    for (i = -112; i <= 112; ++i) {
+      sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
+    }
+    for (i = -255; i <= 255 + 255; ++i) {
+      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
+    }
+    tables_ok = 1;
+  }
+}
+
+static inline uint8_t clip_8b(int v) {
+  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+}
+
+//-----------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+#define STORE(x, y, v) \
+  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+static void Transform(const int16_t* in, uint8_t* dst) {
+  int C[4 * 4], *tmp;
+  int i;
+  tmp = C;
+  for (i = 0; i < 4; ++i) {    // vertical pass
+    const int a = in[0] + in[8];    // [-4096, 4094]
+    const int b = in[0] - in[8];    // [-4095, 4095]
+    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);   // [-3783, 3783]
+    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);   // [-3785, 3781]
+    tmp[0] = a + d;   // [-7881, 7875]
+    tmp[1] = b + c;   // [-7878, 7878]
+    tmp[2] = b - c;   // [-7878, 7878]
+    tmp[3] = a - d;   // [-7877, 7879]
+    tmp += 4;
+    in++;
+  }
+  // Each pass is expanding the dynamic range by ~3.85 (upper bound).
+  // The exact value is (2. + (kC1 + kC2) / 65536).
+  // After the second pass, maximum interval is [-3794, 3794], assuming
+  // an input in [-2048, 2047] interval. We then need to add a dst value
+  // in the [0, 255] range.
+  // In the worst case scenario, the input to clip_8b() can be as large as
+  // [-60713, 60968].
+  tmp = C;
+  for (i = 0; i < 4; ++i) {    // horizontal pass
+    const int dc = tmp[0] + 4;
+    const int a =  dc +  tmp[8];
+    const int b =  dc -  tmp[8];
+    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
+    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+    STORE(0, 0, a + d);
+    STORE(1, 0, b + c);
+    STORE(2, 0, b - c);
+    STORE(3, 0, a - d);
+    tmp++;
+    dst += BPS;
+  }
+}
+#undef MUL
+
+static void TransformUV(const int16_t* in, uint8_t* dst) {
+  Transform(in + 0 * 16, dst);
+  Transform(in + 1 * 16, dst + 4);
+  Transform(in + 2 * 16, dst + 4 * BPS);
+  Transform(in + 3 * 16, dst + 4 * BPS + 4);
+}
+
+static void TransformDC(const int16_t *in, uint8_t* dst) {
+  const int DC = in[0] + 4;
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    for (i = 0; i < 4; ++i) {
+      STORE(i, j, DC);
+    }
+  }
+}
+
+static void TransformDCUV(const int16_t* in, uint8_t* dst) {
+  if (in[0 * 16]) TransformDC(in + 0 * 16, dst);
+  if (in[1 * 16]) TransformDC(in + 1 * 16, dst + 4);
+  if (in[2 * 16]) TransformDC(in + 2 * 16, dst + 4 * BPS);
+  if (in[3 * 16]) TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
+}
+
+#undef STORE
+
+// default C implementations:
+VP8Idct VP8Transform = Transform;
+VP8Idct VP8TransformUV = TransformUV;
+VP8Idct VP8TransformDC = TransformDC;
+VP8Idct VP8TransformDCUV = TransformDCUV;
+
+//-----------------------------------------------------------------------------
+// Paragraph 14.3
+
+static void TransformWHT(const int16_t* in, int16_t* out) {
+  int tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i) {
+    const int a0 = in[0 + i] + in[12 + i];
+    const int a1 = in[4 + i] + in[ 8 + i];
+    const int a2 = in[4 + i] - in[ 8 + i];
+    const int a3 = in[0 + i] - in[12 + i];
+    tmp[0  + i] = a0 + a1;
+    tmp[8  + i] = a0 - a1;
+    tmp[4  + i] = a3 + a2;
+    tmp[12 + i] = a3 - a2;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
+    const int a0 = dc             + tmp[3 + i * 4];
+    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
+    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
+    const int a3 = dc             - tmp[3 + i * 4];
+    out[ 0] = (a0 + a1) >> 3;
+    out[16] = (a3 + a2) >> 3;
+    out[32] = (a0 - a1) >> 3;
+    out[48] = (a3 - a2) >> 3;
+    out += 64;
+  }
+}
+
+void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
+
+//-----------------------------------------------------------------------------
+// Intra predictions
+
+#define OUT(x, y) dst[(x) + (y) * BPS]
+
+static inline void TrueMotion(uint8_t *dst, int size) {
+  const uint8_t* top = dst - BPS;
+  const uint8_t* const clip0 = clip1 + 255 - top[-1];
+  int y;
+  for (y = 0; y < size; ++y) {
+    const uint8_t* const clip = clip0 + dst[-1];
+    int x;
+    for (x = 0; x < size; ++x) {
+      dst[x] = clip[top[x]];
+    }
+    dst += BPS;
+  }
+}
+static void TM4(uint8_t *dst)   { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t *dst)  { TrueMotion(dst, 16); }
+
+//-----------------------------------------------------------------------------
+// 16x16
+
+static void VE16(uint8_t *dst) {     // vertical
+  int j;
+  for (j = 0; j < 16; ++j) {
+    memcpy(dst + j * BPS, dst - BPS, 16);
+  }
+}
+
+static void HE16(uint8_t *dst) {     // horizontal
+  int j;
+  for (j = 16; j > 0; --j) {
+    memset(dst, dst[-1], 16);
+    dst += BPS;
+  }
+}
+
+static inline void Put16(int v, uint8_t* dst) {
+  int j;
+  for (j = 0; j < 16; ++j) {
+    memset(dst + j * BPS, v, 16);
+  }
+}
+
+static void DC16(uint8_t *dst) {    // DC
+  int DC = 16;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    DC += dst[-1 + j * BPS] + dst[j - BPS];
+  }
+  Put16(DC >> 5, dst);
+}
+
+static void DC16NoTop(uint8_t *dst) {   // DC with top samples not available
+  int DC = 8;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    DC += dst[-1 + j * BPS];
+  }
+  Put16(DC >> 4, dst);
+}
+
+static void DC16NoLeft(uint8_t *dst) {  // DC with left samples not available
+  int DC = 8;
+  int i;
+  for (i = 0; i < 16; ++i) {
+    DC += dst[i - BPS];
+  }
+  Put16(DC >> 4, dst);
+}
+
+static void DC16NoTopLeft(uint8_t *dst) {  // DC with no top and left samples
+  Put16(0x80, dst);
+}
+
+//-----------------------------------------------------------------------------
+// 4x4
+
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static void VE4(uint8_t *dst) {    // vertical
+  const uint8_t* top = dst - BPS;
+  const uint8_t vals[4] = {
+    AVG3(top[-1], top[0], top[1]),
+    AVG3(top[ 0], top[1], top[2]),
+    AVG3(top[ 1], top[2], top[3]),
+    AVG3(top[ 2], top[3], top[4])
+  };
+  int i;
+  for (i = 0; i < 4; ++i) {
+    memcpy(dst + i * BPS, vals, sizeof(vals));
+  }
+}
+
+static void HE4(uint8_t *dst) {    // horizontal
+  const int A = dst[-1 - BPS];
+  const int B = dst[-1];
+  const int C = dst[-1 + BPS];
+  const int D = dst[-1 + 2 * BPS];
+  const int E = dst[-1 + 3 * BPS];
+  *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(A, B, C);
+  *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(B, C, D);
+  *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(C, D, E);
+  *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(D, E, E);
+}
+
+static void DC4(uint8_t *dst) {   // DC
+  uint32_t dc = 4;
+  int i;
+  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+  dc >>= 3;
+  for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
+}
+
+static void RD4(uint8_t *dst) {   // Down-right
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int L = dst[-1 + 3 * BPS];
+  const int X = dst[-1 - BPS];
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  OUT(0, 3)                                     = AVG3(J, K, L);
+  OUT(0, 2) = OUT(1, 3)                         = AVG3(I, J, K);
+  OUT(0, 1) = OUT(1, 2) = OUT(2, 3)             = AVG3(X, I, J);
+  OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I);
+  OUT(1, 0) = OUT(2, 1) = OUT(3, 2)             = AVG3(B, A, X);
+  OUT(2, 0) = OUT(3, 1)                         = AVG3(C, B, A);
+  OUT(3, 0)                                     = AVG3(D, C, B);
+}
+
+static void LD4(uint8_t *dst) {   // Down-Left
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  const int E = dst[4 - BPS];
+  const int F = dst[5 - BPS];
+  const int G = dst[6 - BPS];
+  const int H = dst[7 - BPS];
+  OUT(0, 0)                                     = AVG3(A, B, C);
+  OUT(1, 0) = OUT(0, 1)                         = AVG3(B, C, D);
+  OUT(2, 0) = OUT(1, 1) = OUT(0, 2)             = AVG3(C, D, E);
+  OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F);
+  OUT(3, 1) = OUT(2, 2) = OUT(1, 3)             = AVG3(E, F, G);
+  OUT(3, 2) = OUT(2, 3)                         = AVG3(F, G, H);
+  OUT(3, 3)                                     = AVG3(G, H, H);
+}
+
+static void VR4(uint8_t *dst) {   // Vertical-Right
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int X = dst[-1 - BPS];
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  OUT(0, 0) = OUT(1, 2) = AVG2(X, A);
+  OUT(1, 0) = OUT(2, 2) = AVG2(A, B);
+  OUT(2, 0) = OUT(3, 2) = AVG2(B, C);
+  OUT(3, 0)             = AVG2(C, D);
+
+  OUT(0, 3) =             AVG3(K, J, I);
+  OUT(0, 2) =             AVG3(J, I, X);
+  OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A);
+  OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B);
+  OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C);
+  OUT(3, 1) =             AVG3(B, C, D);
+}
+
+static void VL4(uint8_t *dst) {   // Vertical-Left
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  const int E = dst[4 - BPS];
+  const int F = dst[5 - BPS];
+  const int G = dst[6 - BPS];
+  const int H = dst[7 - BPS];
+  OUT(0, 0) =             AVG2(A, B);
+  OUT(1, 0) = OUT(0, 2) = AVG2(B, C);
+  OUT(2, 0) = OUT(1, 2) = AVG2(C, D);
+  OUT(3, 0) = OUT(2, 2) = AVG2(D, E);
+
+  OUT(0, 1) =             AVG3(A, B, C);
+  OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D);
+  OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E);
+  OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F);
+              OUT(3, 2) = AVG3(E, F, G);
+              OUT(3, 3) = AVG3(F, G, H);
+}
+
+static void HU4(uint8_t *dst) {   // Horizontal-Up
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int L = dst[-1 + 3 * BPS];
+  OUT(0, 0) =             AVG2(I, J);
+  OUT(2, 0) = OUT(0, 1) = AVG2(J, K);
+  OUT(2, 1) = OUT(0, 2) = AVG2(K, L);
+  OUT(1, 0) =             AVG3(I, J, K);
+  OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L);
+  OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L);
+  OUT(3, 2) = OUT(2, 2) =
+    OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L;
+}
+
+static void HD4(uint8_t *dst) {  // Horizontal-Down
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int L = dst[-1 + 3 * BPS];
+  const int X = dst[-1 - BPS];
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+
+  OUT(0, 0) = OUT(2, 1) = AVG2(I, X);
+  OUT(0, 1) = OUT(2, 2) = AVG2(J, I);
+  OUT(0, 2) = OUT(2, 3) = AVG2(K, J);
+  OUT(0, 3)             = AVG2(L, K);
+
+  OUT(3, 0)             = AVG3(A, B, C);
+  OUT(2, 0)             = AVG3(X, A, B);
+  OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A);
+  OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X);
+  OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I);
+  OUT(1, 3)             = AVG3(L, K, J);
+}
+
+#undef AVG3
+#undef AVG2
+
+//-----------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t *dst) {    // vertical
+  int j;
+  for (j = 0; j < 8; ++j) {
+    memcpy(dst + j * BPS, dst - BPS, 8);
+  }
+}
+
+static void HE8uv(uint8_t *dst) {    // horizontal
+  int j;
+  for (j = 0; j < 8; ++j) {
+    memset(dst, dst[-1], 8);
+    dst += BPS;
+  }
+}
+
+// helper for chroma-DC predictions
+static inline void Put8x8uv(uint64_t v, uint8_t* dst) {
+  int j;
+  for (j = 0; j < 8; ++j) {
+    *(uint64_t*)(dst + j * BPS) = v;
+  }
+}
+
+static void DC8uv(uint8_t *dst) {     // DC
+  int dc0 = 8;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[i - BPS] + dst[-1 + i * BPS];
+  }
+  Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
+}
+
+static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
+  int dc0 = 4;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[i - BPS];
+  }
+  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+}
+
+static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
+  int dc0 = 4;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[-1 + i * BPS];
+  }
+  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
+  Put8x8uv(0x8080808080808080ULL, dst);
+}
+
+//-----------------------------------------------------------------------------
+// default C implementations
+
+VP8PredFunc VP8PredLuma4[11] = {
+  DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
+};
+
+VP8PredFunc VP8PredLuma16[7] = {
+  DC16, TM16, VE16, HE16,
+  DC16NoTop, DC16NoLeft, DC16NoTopLeft
+};
+
+VP8PredFunc VP8PredChroma8[7] = {
+  DC8uv, TM8uv, VE8uv, HE8uv,
+  DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
+};
+
+//-----------------------------------------------------------------------------
+// Edge filtering functions
+
+// 4 pixels in, 2 pixels out
+static inline void do_filter2(uint8_t* p, int step) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
+  const int a1 = sclip2[112 + ((a + 4) >> 3)];
+  const int a2 = sclip2[112 + ((a + 3) >> 3)];
+  p[-step] = clip1[255 + p0 + a2];
+  p[    0] = clip1[255 + q0 - a1];
+}
+
+// 4 pixels in, 4 pixels out
+static inline void do_filter4(uint8_t* p, int step) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0);
+  const int a1 = sclip2[112 + ((a + 4) >> 3)];
+  const int a2 = sclip2[112 + ((a + 3) >> 3)];
+  const int a3 = (a1 + 1) >> 1;
+  p[-2*step] = clip1[255 + p1 + a3];
+  p[-  step] = clip1[255 + p0 + a2];
+  p[      0] = clip1[255 + q0 - a1];
+  p[   step] = clip1[255 + q1 - a3];
+}
+
+// 6 pixels in, 6 pixels out
+static inline void do_filter6(uint8_t* p, int step) {
+  const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2*step];
+  const int a = sclip1[1020 + 3 * (q0 - p0) + sclip1[1020 + p1 - q1]];
+  const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
+  const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
+  const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
+  p[-3*step] = clip1[255 + p2 + a3];
+  p[-2*step] = clip1[255 + p1 + a2];
+  p[-  step] = clip1[255 + p0 + a1];
+  p[      0] = clip1[255 + q0 - a1];
+  p[   step] = clip1[255 + q1 - a2];
+  p[ 2*step] = clip1[255 + q2 - a3];
+}
+
+static inline int hev(const uint8_t* p, int step, int thresh) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
+}
+
+static inline int needs_filter(const uint8_t* p, int step, int thresh) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
+}
+
+static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
+  const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
+  if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
+    return 0;
+  return abs0[255 + p3 - p2] <= it && abs0[255 + p2 - p1] <= it &&
+         abs0[255 + p1 - p0] <= it && abs0[255 + q3 - q2] <= it &&
+         abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
+}
+
+//-----------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    if (needs_filter(p + i, stride, thresh)) {
+      do_filter2(p + i, stride);
+    }
+  }
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    if (needs_filter(p + i * stride, 1, thresh)) {
+      do_filter2(p + i * stride, 1);
+    }
+  }
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16(p, stride, thresh);
+  }
+}
+
+//-----------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+static inline void FilterLoop26(uint8_t* p, int hstride, int vstride, int size,
+                                int thresh, int ithresh, int hev_thresh) {
+  while (size-- > 0) {
+    if (needs_filter2(p, hstride, thresh, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
+      } else {
+        do_filter6(p, hstride);
+      }
+    }
+    p += vstride;
+  }
+}
+
+static inline void FilterLoop24(uint8_t* p, int hstride, int vstride, int size,
+                                int thresh, int ithresh, int hev_thresh) {
+  while (size-- > 0) {
+    if (needs_filter2(p, hstride, thresh, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
+      } else {
+        do_filter4(p, hstride);
+      }
+    }
+    p += vstride;
+  }
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+//-----------------------------------------------------------------------------
+
+void (*VP8VFilter16)(uint8_t*, int, int, int, int) = VFilter16;
+void (*VP8HFilter16)(uint8_t*, int, int, int, int) = HFilter16;
+void (*VP8VFilter8)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8;
+void (*VP8HFilter8)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8;
+void (*VP8VFilter16i)(uint8_t*, int, int, int, int) = VFilter16i;
+void (*VP8HFilter16i)(uint8_t*, int, int, int, int) = HFilter16i;
+void (*VP8VFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8i;
+void (*VP8HFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i;
+
+void (*VP8SimpleVFilter16)(uint8_t*, int, int) = SimpleVFilter16;
+void (*VP8SimpleHFilter16)(uint8_t*, int, int) = SimpleHFilter16;
+void (*VP8SimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i;
+void (*VP8SimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i;
+
+//-----------------------------------------------------------------------------
+
+void VP8DspInit() {
+  // later we'll plug some SSE2 variant here
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@ -0,0 +1,410 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Frame-reconstruction function. Memory allocation.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "vp8i.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define ALIGN_MASK (32 - 1)
+
+//-----------------------------------------------------------------------------
+// Memory setup
+
+// how many extra luma lines are needed for caching, given a filtering level
+static const uint8_t kFilterExtraRows[3] = { 0, 4, 8 };
+
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
+  const int mb_w = dec->mb_w_;
+  const int intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
+  const int top_size = (16 + 8 + 8) * mb_w;
+  const int info_size = (mb_w + 1) * sizeof(VP8MB);
+  const int yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
+  const int coeffs_size = 384 * sizeof(*dec->coeffs_);
+  const int cache_height = (16 + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
+  const int cache_size = top_size * cache_height;
+  const int needed = intra_pred_mode_size
+                   + top_size + info_size
+                   + yuv_size + coeffs_size
+                   + cache_size + ALIGN_MASK;
+  uint8_t* mem;
+
+  if (needed > dec->mem_size_) {
+    free(dec->mem_);
+    dec->mem_size_ = 0;
+    dec->mem_ = (uint8_t*)malloc(needed);
+    if (dec->mem_ == NULL) {
+      return VP8SetError(dec, 1, "no memory during frame initialization.");
+    }
+    dec->mem_size_ = needed;
+  }
+
+  mem = (uint8_t*)dec->mem_;
+  dec->intra_t_ = (uint8_t*)mem;
+  mem += intra_pred_mode_size;
+
+  dec->y_t_ = (uint8_t*)mem;
+  mem += 16 * mb_w;
+  dec->u_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
+  dec->v_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
+
+  dec->mb_info_ = ((VP8MB*)mem) + 1;
+  mem += info_size;
+
+  mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
+  assert((yuv_size & ALIGN_MASK) == 0);
+  dec->yuv_b_ = (uint8_t*)mem;
+  mem += yuv_size;
+
+  dec->coeffs_ = (int16_t*)mem;
+  mem += coeffs_size;
+
+  dec->cache_y_stride_ = 16 * mb_w;
+  dec->cache_uv_stride_ = 8 * mb_w;
+  {
+    const int extra_rows = kFilterExtraRows[dec->filter_type_];
+    const int extra_y = extra_rows * dec->cache_y_stride_;
+    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
+    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
+    dec->cache_u_ = dec->cache_y_ + 16 * dec->cache_y_stride_ + extra_uv;
+    dec->cache_v_ = dec->cache_u_ + 8 * dec->cache_uv_stride_ + extra_uv;
+  }
+  mem += cache_size;
+
+  // note: left-info is initialized once for all.
+  memset(dec->mb_info_ - 1, 0, (mb_w + 1) * sizeof(*dec->mb_info_));
+
+  // initialize top
+  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
+
+  // prepare 'io'
+  io->width = dec->pic_hdr_.width_;
+  io->height = dec->pic_hdr_.height_;
+  io->mb_y = 0;
+  io->y = dec->cache_y_;
+  io->u = dec->cache_u_;
+  io->v = dec->cache_v_;
+  io->y_stride = dec->cache_y_stride_;
+  io->uv_stride = dec->cache_uv_stride_;
+  io->fancy_upscaling = 0;    // default
+
+  // Init critical function pointers and look-up tables.
+  VP8DspInitTables();
+  VP8DspInit();
+
+  return 1;
+}
+
+//-----------------------------------------------------------------------------
+// Filtering
+
+static inline int hev_thresh_from_level(int level, int keyframe) {
+  if (keyframe) {
+    return (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
+  } else {
+    return (level >= 40) ? 3 : (level >= 20) ? 2 : (level >= 15) ? 1 : 0;
+  }
+}
+
+static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
+  VP8MB* const mb = dec->mb_info_ + mb_x;
+  uint8_t* const y_dst = dec->cache_y_ + mb_x * 16;
+  const int y_bps = dec->cache_y_stride_;
+  const int level = mb->f_level_;
+  const int ilevel = mb->f_ilevel_;
+  const int limit = 2 * level + ilevel;
+  if (dec->filter_type_ == 1) {   // simple
+    if (mb_x > 0) {
+      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
+    }
+    if (mb->f_inner_) {
+      VP8SimpleHFilter16i(y_dst, y_bps, limit);
+    }
+    if (mb_y > 0) {
+      VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
+    }
+    if (mb->f_inner_) {
+      VP8SimpleVFilter16i(y_dst, y_bps, limit);
+    }
+  } else {    // complex
+    uint8_t* const u_dst = dec->cache_u_ + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + mb_x * 8;
+    const int uv_bps = dec->cache_uv_stride_;
+    const int hev_thresh =
+        hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
+    if (mb_x > 0) {
+      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
+      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
+    }
+    if (mb->f_inner_) {
+      VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
+      VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
+    }
+    if (mb_y > 0) {
+      VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
+      VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
+    }
+    if (mb->f_inner_) {
+      VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
+      VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
+    }
+  }
+}
+
+void VP8StoreBlock(VP8Decoder* const dec) {
+  if (dec->filter_type_ > 0) {
+    VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+    int level = dec->filter_levels_[dec->segment_];
+    if (dec->filter_hdr_.use_lf_delta_) {
+      // TODO(skal): only CURRENT is handled for now.
+      level += dec->filter_hdr_.ref_lf_delta_[0];
+      if (dec->is_i4x4_) {
+        level += dec->filter_hdr_.mode_lf_delta_[0];
+      }
+    }
+    level = (level < 0) ? 0 : (level > 63) ? 63 : level;
+    info->f_level_ = level;
+
+    if (dec->filter_hdr_.sharpness_ > 0) {
+      if (dec->filter_hdr_.sharpness_ > 4) {
+        level >>= 2;
+      } else {
+        level >>= 1;
+      }
+      if (level > 9 - dec->filter_hdr_.sharpness_) {
+        level = 9 - dec->filter_hdr_.sharpness_;
+      }
+    }
+
+    info->f_ilevel_ = (level < 1) ? 1 : level;
+    info->f_inner_ = (!info->skip_ || dec->is_i4x4_);
+  }
+  {
+    // Transfer samples to row cache
+    int y;
+    uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16;
+    uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8;
+    uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8;
+    for (y = 0; y < 16; ++y) {
+      memcpy(ydst + y * dec->cache_y_stride_,
+             dec->yuv_b_ + Y_OFF + y * BPS, 16);
+    }
+    for (y = 0; y < 8; ++y) {
+      memcpy(udst + y * dec->cache_uv_stride_,
+           dec->yuv_b_ + U_OFF + y * BPS, 8);
+      memcpy(vdst + y * dec->cache_uv_stride_,
+           dec->yuv_b_ + V_OFF + y * BPS, 8);
+    }
+  }
+}
+
+void VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
+  const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
+  const int ysize = extra_y_rows * dec->cache_y_stride_;
+  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
+  const int first_row = (dec->mb_y_ == 0);
+  const int last_row = (dec->mb_y_ >= dec->mb_h_ - 1);
+  uint8_t* const ydst = dec->cache_y_ - ysize;
+  uint8_t* const udst = dec->cache_u_ - uvsize;
+  uint8_t* const vdst = dec->cache_v_ - uvsize;
+  if (dec->filter_type_ > 0) {
+    int mb_x;
+    for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+      DoFilter(dec, mb_x, dec->mb_y_);
+    }
+  }
+  if (io->put) {
+    int y_start = dec->mb_y_ * 16;
+    int y_end = y_start + 16;
+    if (!first_row) {
+      y_start -= extra_y_rows;
+      io->y = ydst;
+      io->u = udst;
+      io->v = vdst;
+    } else {
+      io->y = dec->cache_y_;
+      io->u = dec->cache_u_;
+      io->v = dec->cache_v_;
+    }
+    if (!last_row) {
+      y_end -= extra_y_rows;
+    }
+    if (y_end > io->height) {
+      y_end = io->height;
+    }
+    io->mb_y = y_start;
+    io->mb_h = y_end - y_start;
+    io->put(io);
+  }
+    // rotate top samples
+  if (!last_row) {
+    memcpy(ydst, ydst + 16 * dec->cache_y_stride_, ysize);
+    memcpy(udst, udst + 8 * dec->cache_uv_stride_, uvsize);
+    memcpy(vdst, vdst + 8 * dec->cache_uv_stride_, uvsize);
+  }
+}
+
+//-----------------------------------------------------------------------------
+// Main reconstruction function.
+
+static const int kScan[16] = {
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
+};
+
+static inline int CheckMode(VP8Decoder* const dec, int mode) {
+  if (mode == B_DC_PRED) {
+    if (dec->mb_x_ == 0) {
+      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
+    } else {
+      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
+    }
+  }
+  return mode;
+}
+
+static inline void Copy32b(uint8_t* dst, uint8_t* src) {
+  *(uint32_t*)dst = *(uint32_t*)src;
+}
+
+void VP8ReconstructBlock(VP8Decoder* const dec) {
+  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
+  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
+  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
+
+  // Rotate in the left samples from previously decoded block. We move four
+  // pixels at a time for alignment reason, and because of in-loop filter.
+  if (dec->mb_x_ > 0) {
+    int j;
+    for (j = -1; j < 16; ++j) {
+      Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
+    }
+    for (j = -1; j < 8; ++j) {
+      Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
+      Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
+    }
+  } else {
+    int j;
+    for (j = 0; j < 16; ++j) {
+      y_dst[j * BPS - 1] = 129;
+    }
+    for (j = 0; j < 8; ++j) {
+      u_dst[j * BPS - 1] = 129;
+      v_dst[j * BPS - 1] = 129;
+    }
+    // Init top-left sample on left column too
+    if (dec->mb_y_ > 0) {
+      y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
+    }
+  }
+  {
+    // bring top samples into the cache
+    uint8_t* const top_y = dec->y_t_ + dec->mb_x_ * 16;
+    uint8_t* const top_u = dec->u_t_ + dec->mb_x_ * 8;
+    uint8_t* const top_v = dec->v_t_ + dec->mb_x_ * 8;
+    const int16_t* coeffs = dec->coeffs_;
+    int n;
+
+    if (dec->mb_y_ > 0) {
+      memcpy(y_dst - BPS, top_y, 16);
+      memcpy(u_dst - BPS, top_u, 8);
+      memcpy(v_dst - BPS, top_v, 8);
+    } else if (dec->mb_x_ == 0) {
+      // we only need to do this init once at block (0,0).
+      // Afterward, it remains valid for the whole topmost row.
+      memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
+      memset(u_dst - BPS - 1, 127, 8 + 1);
+      memset(v_dst - BPS - 1, 127, 8 + 1);
+    }
+
+    // predict and add residuals
+
+    if (dec->is_i4x4_) {   // 4x4
+      uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
+
+      if (dec->mb_y_ > 0) {
+        if (dec->mb_x_ >= dec->mb_w_ - 1) {    // on rightmost border
+          top_right[0] = top_y[15] * 0x01010101u;
+        } else {
+          memcpy(top_right, top_y + 16, sizeof(*top_right));
+        }
+      }
+      // replicate the top-right pixels below
+      top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
+
+      // predict and add residues for all 4x4 blocks in turn.
+      for (n = 0; n < 16; n++) {
+        uint8_t* const dst = y_dst + kScan[n];
+        VP8PredLuma4[dec->imodes_[n]](dst);
+        if (dec->non_zero_ & (1 << n)) {
+          VP8Transform(coeffs + n * 16, dst);
+        } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
+          VP8TransformDC(coeffs + n * 16, dst);
+        }
+      }
+    } else {    // 16x16
+      const int pred_func = CheckMode(dec, dec->imodes_[0]);
+      VP8PredLuma16[pred_func](y_dst);
+      if (dec->non_zero_) {
+        for (n = 0; n < 16; n++) {
+          uint8_t* const dst = y_dst + kScan[n];
+          if (dec->non_zero_ac_ & (1 << n)) {
+            VP8Transform(coeffs + n * 16, dst);
+          } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
+            VP8TransformDC(coeffs + n * 16, dst);
+          }
+        }
+      }
+    }
+    {
+      // Chroma
+      const int pred_func = CheckMode(dec, dec->uvmode_);
+      VP8PredChroma8[pred_func](u_dst);
+      VP8PredChroma8[pred_func](v_dst);
+
+      if (dec->non_zero_ & 0x0f0000) {   // chroma-U
+        const int16_t* const u_coeffs = dec->coeffs_ + 16 * 16;
+        if (dec->non_zero_ac_ & 0x0f0000) {
+          VP8TransformUV(u_coeffs, u_dst);
+        } else {
+          VP8TransformDCUV(u_coeffs, u_dst);
+        }
+      }
+      if (dec->non_zero_ & 0xf00000) {   // chroma-V
+        const int16_t* const v_coeffs = dec->coeffs_ + 20 * 16;
+        if (dec->non_zero_ac_ & 0xf00000) {
+          VP8TransformUV(v_coeffs, v_dst);
+        } else {
+          VP8TransformDCUV(v_coeffs, v_dst);
+        }
+      }
+
+      // stash away top samples for next block
+      if (dec->mb_y_ < dec->mb_h_ - 1) {
+        memcpy(top_y, y_dst + 15 * BPS, 16);
+        memcpy(top_u, u_dst +  7 * BPS,  8);
+        memcpy(top_v, v_dst +  7 * BPS,  8);
+      }
+    }
+  }
+}
+
+//-----------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/quant.c
+++ b/src/dec/quant.c
@ -0,0 +1,111 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Quantizer initialization
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "vp8i.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+static inline int clip(int v, int M) {
+  return v < 0 ? 0 : v > M ? M : v;
+}
+
+// Paragraph 14.1
+static const uint8_t kDcTable[128] = {
+  4,     5,   6,   7,   8,   9,  10,  10,
+  11,   12,  13,  14,  15,  16,  17,  17,
+  18,   19,  20,  20,  21,  21,  22,  22,
+  23,   23,  24,  25,  25,  26,  27,  28,
+  29,   30,  31,  32,  33,  34,  35,  36,
+  37,   37,  38,  39,  40,  41,  42,  43,
+  44,   45,  46,  46,  47,  48,  49,  50,
+  51,   52,  53,  54,  55,  56,  57,  58,
+  59,   60,  61,  62,  63,  64,  65,  66,
+  67,   68,  69,  70,  71,  72,  73,  74,
+  75,   76,  76,  77,  78,  79,  80,  81,
+  82,   83,  84,  85,  86,  87,  88,  89,
+  91,   93,  95,  96,  98, 100, 101, 102,
+  104, 106, 108, 110, 112, 114, 116, 118,
+  122, 124, 126, 128, 130, 132, 134, 136,
+  138, 140, 143, 145, 148, 151, 154, 157
+};
+
+static const uint16_t kAcTable[128] = {
+  4,     5,   6,   7,   8,   9,  10,  11,
+  12,   13,  14,  15,  16,  17,  18,  19,
+  20,   21,  22,  23,  24,  25,  26,  27,
+  28,   29,  30,  31,  32,  33,  34,  35,
+  36,   37,  38,  39,  40,  41,  42,  43,
+  44,   45,  46,  47,  48,  49,  50,  51,
+  52,   53,  54,  55,  56,  57,  58,  60,
+  62,   64,  66,  68,  70,  72,  74,  76,
+  78,   80,  82,  84,  86,  88,  90,  92,
+  94,   96,  98, 100, 102, 104, 106, 108,
+  110, 112, 114, 116, 119, 122, 125, 128,
+  131, 134, 137, 140, 143, 146, 149, 152,
+  155, 158, 161, 164, 167, 170, 173, 177,
+  181, 185, 189, 193, 197, 201, 205, 209,
+  213, 217, 221, 225, 229, 234, 239, 245,
+  249, 254, 259, 264, 269, 274, 279, 284
+};
+
+//-----------------------------------------------------------------------------
+// Paragraph 9.6
+
+void VP8ParseQuant(VP8Decoder* const dec) {
+  VP8BitReader* const br = &dec->br_;
+  const int base_q0 = VP8GetValue(br, 7);
+  const int dqy1_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dqy2_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dqy2_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dquv_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dquv_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+
+  const VP8SegmentHeader* const hdr = &dec->segment_hdr_;
+  int i;
+
+  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
+    int q;
+    if (hdr->use_segment_) {
+      q = hdr->quantizer_[i];
+      if (!hdr->absolute_delta_) {
+        q += base_q0;
+      }
+    } else {
+      if (i > 0) {
+        dec->dqm_[i] = dec->dqm_[0];
+        continue;
+      } else {
+        q = base_q0;
+      }
+    }
+    {
+      VP8QuantMatrix* const m = &dec->dqm_[i];
+      m->y1_mat_[0] = kDcTable[clip(q + dqy1_dc, 127)];
+      m->y1_mat_[1] = kAcTable[clip(q + 0,       127)];
+
+      m->y2_mat_[0] = kDcTable[clip(q + dqy2_dc, 127)] * 2;
+      // TODO(skal): make it another table?
+      m->y2_mat_[1] = kAcTable[clip(q + dqy2_ac, 127)] * 155 / 100;
+      if (m->y2_mat_[1] < 8) m->y2_mat_[1] = 8;
+
+      m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
+      m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
+    }
+  }
+}
+
+//-----------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@ -0,0 +1,590 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Coding trees and probas
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdio.h>
+#include "vp8i.h"
+
+#define USE_GENERIC_TREE
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#ifdef USE_GENERIC_TREE
+static const int8_t kYModesIntra4[18] = {
+  -B_DC_PRED, 1,
+    -B_TM_PRED, 2,
+      -B_VE_PRED, 3,
+        4, 6,
+          -B_HE_PRED, 5,
+            -B_RD_PRED, -B_VR_PRED,
+        -B_LD_PRED, 7,
+          -B_VL_PRED, 8,
+            -B_HD_PRED, -B_HU_PRED
+};
+#endif
+
+#ifndef ONLY_KEYFRAME_CODE
+
+// inter prediction modes
+enum {
+  LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3,
+  NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV };
+
+static const int8_t kYModesInter[8] = {
+  -DC_PRED, 1,
+    2, 3,
+      -V_PRED, -H_PRED,
+      -TM_PRED, -B_PRED
+};
+
+static const int8_t kMBSplit[6] = {
+  -3, 1,
+    -2, 2,
+      -0, -1
+};
+
+static const int8_t kMVRef[8] = {
+  -ZEROMV, 1,
+    -NEARESTMV, 2,
+      -NEARMV, 3,
+        -NEWMV, -SPLITMV
+};
+
+static const int8_t kMVRef4[6] = {
+  -LEFT4, 1
+    -ABOVE4, 2
+      -ZERO4, -NEW4
+};
+#endif
+
+//-----------------------------------------------------------------------------
+// Default probabilities
+
+// Inter
+#ifndef ONLY_KEYFRAME_CODE
+static const uint8_t kYModeProbaInter0[4] = { 112, 86, 140, 37 };
+static const uint8_t kUVModeProbaInter0[3] = { 162, 101, 204 };
+static const uint8_t kMVProba0[2][NUM_MV_PROBAS] = {
+  { 162, 128, 225, 146, 172, 147, 214,  39,
+    156, 128, 129, 132,  75, 145, 178, 206,
+    239, 254, 254 },
+  { 164, 128, 204, 170, 119, 235, 140, 230,
+    228, 128, 130, 130,  74, 148, 180, 203,
+    236, 254, 254 }
+};
+#endif
+
+// Paragraph 13.5
+static const uint8_t
+  CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  // genereated using vp8_default_coef_probs() in entropy.c:129
+  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+      { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+    },
+    { { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+      { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+    },
+    { { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+      { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+      { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+      { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+      { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+    },
+    { { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+      { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+    },
+    { { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+      { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+      { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+    },
+    { { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+      { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+      { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+      { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+    },
+    { { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+      { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+    },
+    { { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+      { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+    },
+    { { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+      { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+      { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+    },
+    { { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+      { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+      { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+      { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+      { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+    },
+    { { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+      { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+      { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+      { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+    },
+    { { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+      { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+    },
+    { { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+      { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+      { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+    },
+    { { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+      { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+    },
+    { { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+      { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+    },
+    { { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+      { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+      { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+      { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+    },
+    { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  }
+};
+
+// Paragraph 11.5
+static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
+  { { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
+    { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
+    { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
+    { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
+    { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
+    { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+    { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
+    { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
+    { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
+    { 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
+  { { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
+    { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
+    { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
+    { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
+    { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
+    { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+    { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
+    { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
+    { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
+    { 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
+  { { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
+    { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
+    { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
+    { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
+    { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
+    { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+    { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
+    { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
+    { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
+    { 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
+  { { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
+    { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
+    { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
+    { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
+    { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
+    { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+    { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
+    { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
+    { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
+    { 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
+  { { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
+    { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
+    { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
+    { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
+    { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
+    { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+    { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
+    { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
+    { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
+    { 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
+  { { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
+    { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
+    { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
+    { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
+    { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
+    { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+    { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
+    { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
+    { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
+    { 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
+  { { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+    { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+    { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+    { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+    { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+    { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+    { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+    { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+    { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+    { 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
+  { { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
+    { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
+    { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
+    { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
+    { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
+    { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+    { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
+    { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
+    { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
+    { 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
+  { { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
+    { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
+    { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
+    { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
+    { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
+    { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+    { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
+    { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
+    { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
+    { 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
+  { { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
+    { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
+    { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
+    { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
+    { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
+    { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+    { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
+    { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
+    { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
+    { 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
+};
+
+void VP8ResetProba(VP8Proba* const proba) {
+  memset(proba->segments_, 255u, sizeof(proba->segments_));
+  memcpy(proba->coeffs_, CoeffsProba0, sizeof(CoeffsProba0));
+#ifndef ONLY_KEYFRAME_CODE
+  memcpy(proba->mv_, kMVProba0, sizeof(kMVProba0));
+  memcpy(proba->ymode_, kYModeProbaInter0, sizeof(kYModeProbaInter0));
+  memcpy(proba->uvmode_, kUVModeProbaInter0, sizeof(kYModeProbaInter0));
+#endif
+}
+
+void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
+  uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_;
+  uint8_t* const left = dec->intra_l_;
+  // Hardcoded 16x16 intra-mode decision tree.
+  dec->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
+  if (!dec->is_i4x4_) {
+    const int ymode =
+        VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
+                           : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
+    dec->imodes_[0] = ymode;
+    memset(top, ymode, 4 * sizeof(top[0]));
+    memset(left, ymode, 4 * sizeof(left[0]));
+  } else {
+    uint8_t* modes = dec->imodes_;
+    int y;
+    for (y = 0; y < 4; ++y) {
+      int ymode = left[y];
+      int x;
+      for (x = 0; x < 4; ++x) {
+        const uint8_t* const prob = kBModesProba[top[x]][ymode];
+#ifdef USE_GENERIC_TREE
+        // Generic tree-parsing
+        int i = 0;
+        do {
+          i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
+        } while (i > 0);
+        ymode = -i;
+#else
+        // Hardcoded tree parsing
+        ymode = !VP8GetBit(br, prob[0]) ? B_DC_PRED :
+                  !VP8GetBit(br, prob[1]) ? B_TM_PRED :
+                    !VP8GetBit(br, prob[2]) ? B_VE_PRED :
+                      !VP8GetBit(br, prob[3]) ?
+                        (!VP8GetBit(br, prob[4]) ? B_HE_PRED :
+                          (!VP8GetBit(br, prob[5]) ? B_RD_PRED : B_VR_PRED)) :
+                        (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
+                          (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
+                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
+#endif    // USE_GENERIC_TREE
+        top[x] = ymode;
+        *modes++ = ymode;
+      }
+      left[y] = ymode;
+    }
+  }
+  // Hardcoded UVMode decision tree
+  dec->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
+               : !VP8GetBit(br, 114) ? V_PRED
+               : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
+}
+
+//-----------------------------------------------------------------------------
+// Paragraph 13
+
+static const uint8_t
+    CoeffsUpdateProba[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  { { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 },
+      { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }
+    },
+    { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 },
+      { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  }
+};
+
+#ifndef ONLY_KEYFRAME_CODE
+static const uint8_t MVUpdateProba[2][NUM_MV_PROBAS] = {
+  { 237, 246, 253, 253, 254, 254, 254, 254,
+    254, 254, 254, 254, 254, 254, 250, 250,
+    252, 254, 254 },
+  { 231, 243, 245, 253, 254, 254, 254, 254,
+    254, 254, 254, 254, 254, 254, 251, 251,
+    254, 254, 254 }
+};
+#endif
+
+// Paragraph 9.9
+void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
+  VP8Proba* const proba = &dec->proba_;
+  int t, b, c, p;
+  for (t = 0; t < NUM_TYPES; ++t) {
+    for (b = 0; b < NUM_BANDS; ++b) {
+      for (c = 0; c < NUM_CTX; ++c) {
+        for (p = 0; p < NUM_PROBAS; ++p) {
+          if (VP8GetBit(br, CoeffsUpdateProba[t][b][c][p])) {
+            proba->coeffs_[t][b][c][p] = VP8GetValue(br, 8);
+          }
+        }
+      }
+    }
+  }
+  dec->use_skip_proba_ = VP8Get(br);
+  if (dec->use_skip_proba_) {
+    dec->skip_p_ = VP8GetValue(br, 8);
+  }
+#ifndef ONLY_KEYFRAME_CODE
+  if (!dec->frm_hdr_.key_frame_) {
+    int i;
+    dec->intra_p_ = VP8GetValue(br, 8);
+    dec->last_p_ = VP8GetValue(br, 8);
+    dec->golden_p_ = VP8GetValue(br, 8);
+    if (VP8Get(br)) {   // update y-mode
+      for (i = 0; i < 4; ++i) {
+        proba->ymode_[i] = VP8GetValue(br, 8);
+      }
+    }
+    if (VP8Get(br)) {   // update uv-mode
+      for (i = 0; i < 3; ++i) {
+        proba->uvmode_[i] = VP8GetValue(br, 8);
+      }
+    }
+    // update MV
+    for (i = 0; i < 2; ++i) {
+      int k;
+      for (k = 0; k < NUM_MV_PROBAS; ++k) {
+        if (VP8GetBit(br, MVUpdateProba[i][k])) {
+          const int v = VP8GetValue(br, 7);
+          proba->mv_[i][k] = v ? v << 1 : 1;
+        }
+      }
+    }
+  }
+#endif
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -0,0 +1,624 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// main entry for the decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "vp8i.h"
+
+//-----------------------------------------------------------------------------
+// VP8Decoder
+
+static void SetOk(VP8Decoder* const dec) {
+  dec->status_ = 0;
+  dec->error_msg_ = "OK";
+}
+
+void VP8InitIo(VP8Io* const io) {
+  if (io) {
+    memset(io, 0, sizeof(*io));
+  }
+}
+
+VP8Decoder* VP8New() {
+  VP8Decoder* dec = (VP8Decoder*)calloc(1, sizeof(VP8Decoder));
+  if (dec) {
+    SetOk(dec);
+    dec->ready_ = 0;
+  }
+  return dec;
+}
+
+int VP8Status(VP8Decoder* const dec) {
+  if (!dec) return 2;
+  return dec->status_;
+}
+
+const char* VP8StatusMessage(VP8Decoder* const dec) {
+  if (!dec) return "no object";
+  if (!dec->error_msg_) return "OK";
+  return dec->error_msg_;
+}
+
+void VP8Delete(VP8Decoder* const dec) {
+  if (dec) {
+    VP8Clear(dec);
+    free(dec);
+  }
+}
+
+int VP8SetError(VP8Decoder* const dec, int error, const char *msg) {
+  dec->status_ = error;
+  dec->error_msg_ = msg;
+  dec->ready_ = 0;
+  return 0;
+}
+
+//-----------------------------------------------------------------------------
+// Header parsing
+
+static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
+  assert(hdr);
+  hdr->use_segment_ = 0;
+  hdr->update_map_ = 0;
+  hdr->absolute_delta_ = 1;
+  memset(hdr->quantizer_, 0, sizeof(hdr->quantizer_));
+  memset(hdr->filter_strength_, 0, sizeof(hdr->filter_strength_));
+}
+
+// Paragraph 9.3
+static int ParseSegmentHeader(VP8BitReader* br,
+                              VP8SegmentHeader* hdr, VP8Proba* proba) {
+  assert(br);
+  assert(hdr);
+  hdr->use_segment_ = VP8Get(br);
+  if (hdr->use_segment_) {
+    hdr->update_map_ = VP8Get(br);
+    if (VP8Get(br)) {   // update data
+      int s;
+      hdr->absolute_delta_ = VP8Get(br);
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        hdr->quantizer_[s] = VP8Get(br) ? VP8GetSignedValue(br, 7) : 0;
+      }
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        hdr->filter_strength_[s] = VP8Get(br) ? VP8GetSignedValue(br, 6) : 0;
+      }
+    }
+    if (hdr->update_map_) {
+      int s;
+      for (s = 0; s < MB_FEATURE_TREE_PROBS; ++s) {
+        proba->segments_[s] = VP8Get(br) ? VP8GetValue(br, 8) : 255u;
+      }
+    }
+  } else {
+    hdr->update_map_ = 0;
+  }
+  return 1;
+}
+
+// Paragraph 9.5
+static int ParsePartitions(VP8Decoder* const dec,
+                           const uint8_t* buf, uint32_t size) {
+  VP8BitReader* const br = &dec->br_;
+  const uint8_t* sz = buf;
+  int last_part;
+  uint32_t offset;
+  int p;
+
+  dec->num_parts_ = 1 << VP8GetValue(br, 2);
+  last_part = dec->num_parts_ - 1;
+  offset = last_part * 3;
+
+  if (size <= offset) {
+    return 0;
+  }
+  for (p = 0; p < last_part; ++p) {
+    const uint32_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16);
+    if (offset + psize > size) {
+      return 0;
+    }
+    VP8Init(dec->parts_ + p, buf + offset, psize);
+    offset += psize;
+    sz += 3;
+  }
+  size -= offset;
+  VP8Init(dec->parts_ + last_part, buf + offset, size);
+  return 1;
+}
+
+// Paragraph 9.4
+static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
+  VP8FilterHeader* const hdr = &dec->filter_hdr_;
+  hdr->simple_    = VP8Get(br);
+  hdr->level_     = VP8GetValue(br, 6);
+  hdr->sharpness_ = VP8GetValue(br, 3);
+  hdr->use_lf_delta_ = VP8Get(br);
+  if (hdr->use_lf_delta_) {
+    if (VP8Get(br)) {   // update lf-delta?
+      int i;
+      for (i = 0; i < NUM_REF_LF_DELTAS; ++i) {
+        if (VP8Get(br)) {
+          hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6);
+        }
+      }
+      for (i = 0; i < NUM_MODE_LF_DELTAS; ++i) {
+        if (VP8Get(br)) {
+          hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6);
+        }
+      }
+    }
+  }
+  dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 1 : 2;
+  if (dec->filter_type_ > 0) {    // precompute filter levels per segment
+    if (dec->segment_hdr_.use_segment_) {
+      int s;
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        int strength = dec->segment_hdr_.filter_strength_[s];
+        if (!dec->segment_hdr_.absolute_delta_) {
+          strength += hdr->level_;
+        }
+        dec->filter_levels_[s] = strength;
+      }
+    } else {
+      dec->filter_levels_[0] = hdr->level_;
+    }
+  }
+  return 1;
+}
+
+static inline uint32_t get_le32(const uint8_t* const data) {
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+// Topmost call
+int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
+  uint8_t* buf;
+  uint32_t buf_size;
+  VP8FrameHeader* frm_hdr;
+  VP8PictureHeader* pic_hdr;
+  VP8BitReader* br;
+
+  if (dec == NULL) {
+    return 0;
+  }
+  SetOk(dec);
+  if (io == NULL) {
+    return VP8SetError(dec, 2, "null VP8Io passed to VP8GetHeaders()");
+  }
+
+  buf = (uint8_t *)io->data;
+  buf_size = io->data_size;
+  if (buf == NULL || buf_size <= 4) {
+    return VP8SetError(dec, 2, "Not enough data to parse frame header");
+  }
+
+  // Skip over valid RIFF headers
+  if (!memcmp(buf, "RIFF", 4)) {
+    uint32_t riff_size;
+    uint32_t chunk_size;
+    if (buf_size < 20 + 4) {
+      return VP8SetError(dec, 2, "RIFF: Truncated header.");
+    }
+    if (memcmp(buf + 8, "WEBP", 4)) {   // wrong image file signature
+      return VP8SetError(dec, 2, "RIFF: WEBP signature not found.");
+    }
+    riff_size = get_le32(buf + 4);
+    if (memcmp(buf + 12, "VP8 ", 4)) {
+      return VP8SetError(dec, 2, "RIFF: Invalid compression format.");
+    }
+    chunk_size = get_le32(buf + 16);
+    if ((chunk_size > riff_size + 8) || (chunk_size & 1)) {
+      return VP8SetError(dec, 2, "RIFF: Inconsistent size information.");
+    }
+    buf += 20;
+    buf_size -= 20;
+  }
+
+  // Paragraph 9.1
+  {
+    const uint32_t bits = buf[0] | (buf[1] << 8) | (buf[2] << 16);
+    frm_hdr = &dec->frm_hdr_;
+    frm_hdr->key_frame_ = !(bits & 1);
+    frm_hdr->profile_ = (bits >> 1) & 7;
+    frm_hdr->show_ = (bits >> 4) & 1;
+    frm_hdr->partition_length_ = (bits >> 5);
+    buf += 3;
+    buf_size -= 3;
+  }
+
+  pic_hdr = &dec->pic_hdr_;
+  if (frm_hdr->key_frame_) {
+    // Paragraph 9.2
+    if (buf_size < 7) {
+      return VP8SetError(dec, 2, "cannot parse picture header");
+    }
+    if (buf[0] != 0x9d || buf[1] != 0x01 || buf[2] != 0x2a) {
+      return VP8SetError(dec, 2, "Bad code word");
+    }
+    pic_hdr->width_ = ((buf[4] << 8) | buf[3]) & 0x3fff;
+    pic_hdr->xscale_ = buf[4] >> 6;   // ratio: 1, 5/4 5/3 or 2
+    pic_hdr->height_ = ((buf[6] << 8) | buf[5]) & 0x3fff;
+    pic_hdr->yscale_ = buf[6] >> 6;
+    buf += 7;
+    buf_size -= 7;
+
+    dec->mb_w_ = (pic_hdr->width_ + 15) >> 4;
+    dec->mb_h_ = (pic_hdr->height_ + 15) >> 4;
+    io->width = pic_hdr->width_;
+    io->height = pic_hdr->height_;
+
+    VP8ResetProba(&dec->proba_);
+    ResetSegmentHeader(&dec->segment_hdr_);
+    dec->segment_ = 0;    // default for intra
+  }
+
+  br = &dec->br_;
+  VP8Init(br, buf, buf_size);
+  if (frm_hdr->partition_length_ > buf_size) {
+    return VP8SetError(dec, 2, "bad partition length");
+  }
+  buf += frm_hdr->partition_length_;
+  buf_size -= frm_hdr->partition_length_;
+  if (frm_hdr->key_frame_) {
+    pic_hdr->colorspace_ = VP8Get(br);
+    pic_hdr->clamp_type_ = VP8Get(br);
+  }
+  if (!ParseSegmentHeader(br, &dec->segment_hdr_, &dec->proba_)) {
+    return VP8SetError(dec, 2, "cannot parse segment header");
+  }
+  // Filter specs
+  if (!ParseFilterHeader(br, dec)) {
+    return VP8SetError(dec, 2, "cannot parse filter header");
+  }
+  if (!ParsePartitions(dec, buf, buf_size)) {
+    return VP8SetError(dec, 2, "cannot parse partitions");
+  }
+
+  // quantizer change
+  VP8ParseQuant(dec);
+
+  // Frame buffer marking
+  if (!frm_hdr->key_frame_) {
+    // Paragraph 9.7
+#ifndef ONLY_KEYFRAME_CODE
+    dec->buffer_flags_ = VP8Get(br) << 0;   // update golden
+    dec->buffer_flags_ |= VP8Get(br) << 1;  // update alt ref
+    if (!(dec->buffer_flags_ & 1)) {
+      dec->buffer_flags_ |= VP8GetValue(br, 2) << 2;
+    }
+    if (!(dec->buffer_flags_ & 2)) {
+      dec->buffer_flags_ |= VP8GetValue(br, 2) << 4;
+    }
+    dec->buffer_flags_ |= VP8Get(br) << 6;    // sign bias golden
+    dec->buffer_flags_ |= VP8Get(br) << 7;    // sign bias alt ref
+#else
+    return VP8SetError(dec, 2, "Not a key frame.");
+#endif
+  } else {
+    dec->buffer_flags_ = 0x003 | 0x100;
+  }
+
+  // Paragraph 9.8
+#ifndef ONLY_KEYFRAME_CODE
+  dec->update_proba_ = VP8Get(br);
+  if (!dec->update_proba_) {    // save for later restore
+    dec->proba_saved_ = dec->proba_;
+  }
+  dec->buffer_flags_ &= 1 << 8;
+  dec->buffer_flags_ |=
+      (frm_hdr->key_frame_ || VP8Get(br)) << 8;    // refresh last frame
+#else
+  VP8Get(br);   // just ignore the value of update_proba_
+#endif
+
+  VP8ParseProba(br, dec);
+
+  // sanitized state
+  dec->ready_ = 1;
+  return 1;
+}
+
+//-----------------------------------------------------------------------------
+// Residual decoding (Paragraph 13.2 / 13.3)
+
+static const uint8_t kBands[16 + 1] = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  // extra entry as sentinel
+};
+
+static const uint8_t kCat3[] = { 173, 148, 140, 0 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
+static const uint8_t kCat6[] =
+  { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const uint8_t * const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
+};
+
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
+
+// Returns 1 if there's non-zero coeffs, 0 otherwise
+static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
+                     int ctx, const uint16_t dq[2], int n, int16_t* out) {
+  const uint8_t* p = prob[kBands[n]][ctx];
+  if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
+    return 0;
+  }
+  while (1) {
+    ++n;
+    if (!VP8GetBit(br, p[1])) {
+      p = prob[kBands[n]][0];
+    } else {  // non zero coeff
+      int v, j;
+      if (!VP8GetBit(br, p[2])) {
+        p = prob[kBands[n]][1];
+        v = 1;
+      } else {
+        if (!VP8GetBit(br, p[3])) {
+          if (!VP8GetBit(br, p[4])) {
+            v = 2;
+          } else {
+            v = 3 + VP8GetBit(br, p[5]);
+          }
+        } else {
+          if (!VP8GetBit(br, p[6])) {
+            if (!VP8GetBit(br, p[7])) {
+              v = 5 + VP8GetBit(br, 159);
+            } else {
+              v = 7 + 2 * VP8GetBit(br, 165) + VP8GetBit(br, 145);
+            }
+          } else {
+            const uint8_t* tab;
+            const int bit1 = VP8GetBit(br, p[8]);
+            const int bit0 = VP8GetBit(br, p[9 + bit1]);
+            const int cat = 2 * bit1 + bit0;
+            v = 0;
+            for (tab = kCat3456[cat]; *tab; ++tab) {
+              v += v + VP8GetBit(br, *tab);
+            }
+            v += 3 + (8 << cat);
+          }
+        }
+        p = prob[kBands[n]][2];
+      }
+      j = kZigzag[n - 1];
+      out[j] = VP8GetSigned(br, v) * dq[j > 0];
+      if (n == 16 || !VP8GetBit(br, p[0])) {   // EOB
+        return 1;
+      }
+    }
+    if (n == 16) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+// Table to unpack four bits into four bytes
+static const uint8_t kUnpackTab[16][4] = {
+  {0, 0, 0, 0},  {1, 0, 0, 0},  {0, 1, 0, 0},  {1, 1, 0, 0},
+  {0, 0, 1, 0},  {1, 0, 1, 0},  {0, 1, 1, 0},  {1, 1, 1, 0},
+  {0, 0, 0, 1},  {1, 0, 0, 1},  {0, 1, 0, 1},  {1, 1, 0, 1},
+  {0, 0, 1, 1},  {1, 0, 1, 1},  {0, 1, 1, 1},  {1, 1, 1, 1} };
+
+// Macro to pack four LSB of four bytes into four bits.
+#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \
+    defined(__BIG_ENDIAN__)
+#define PACK_CST 0x08040201U
+#else
+#define PACK_CST 0x01020408U
+#endif
+#define PACK(X, S) ((((*(uint32_t*)(X)) * PACK_CST) & 0xff000000) >> (S))
+
+static int ParseResiduals(VP8Decoder* const dec,
+                          VP8MB* const mb, VP8BitReader* const token_br) {
+  int out_t_nz, out_l_nz, first;
+  ProbaArray ac_prob;
+  const VP8QuantMatrix* q = &dec->dqm_[dec->segment_];
+  int16_t* dst = dec->coeffs_;
+  VP8MB* const left_mb = dec->mb_info_ - 1;
+  uint8_t nz_ac[4], nz_dc[4];
+  uint32_t non_zero_ac = 0;
+  uint32_t non_zero_dc = 0;
+  uint8_t tnz[4], lnz[4];
+  int x, y, ch;
+
+  memset(dst, 0, 384 * sizeof(*dst));
+  if (!dec->is_i4x4_) {    // parse DC
+    int16_t dc[16] = { 0 };
+    const int ctx = mb->dc_nz_ + left_mb->dc_nz_;
+    mb->dc_nz_ = left_mb->dc_nz_ =
+        GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[1],
+                  ctx, q->y2_mat_, 0, dc);
+    first = 1;
+    ac_prob = (ProbaArray)dec->proba_.coeffs_[0];
+    VP8TransformWHT(dc, dst);
+  } else {
+    first = 0;
+    ac_prob = (ProbaArray)dec->proba_.coeffs_[3];
+  }
+
+  memcpy(tnz, kUnpackTab[mb->nz_ & 0xf], sizeof(tnz));
+  memcpy(lnz, kUnpackTab[left_mb->nz_ & 0xf], sizeof(lnz));
+  for (y = 0; y < 4; ++y) {
+    int l = lnz[y];
+
+    for (x = 0; x < 4; ++x) {
+      const int ctx = l + tnz[x];
+      l = GetCoeffs(token_br, ac_prob, ctx,
+                    q->y1_mat_, first, dst);
+      nz_dc[x] = (dst[0] != 0);
+      nz_ac[x] = tnz[x] = l;
+      dst += 16;
+    }
+    lnz[y] = l;
+    non_zero_dc |= PACK(nz_dc, 24 - y * 4);
+    non_zero_ac |= PACK(nz_ac, 24 - y * 4);
+  }
+  out_t_nz = PACK(tnz, 24);
+  out_l_nz = PACK(lnz, 24);
+
+  memcpy(tnz, kUnpackTab[mb->nz_ >> 4], sizeof(tnz));
+  memcpy(lnz, kUnpackTab[left_mb->nz_ >> 4], sizeof(lnz));
+  for (ch = 0; ch < 4; ch += 2) {
+    for (y = 0; y < 2; ++y) {
+      int l = lnz[ch + y];
+      for (x = 0; x < 2; ++x) {
+        const int ctx = l + tnz[ch + x];
+        l = GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2],
+                      ctx, q->uv_mat_, 0, dst);
+        nz_dc[y * 2 + x] = (dst[0] != 0);
+        nz_ac[y * 2 + x] = tnz[ch + x] = l;
+        dst += 16;
+      }
+      lnz[ch + y] = l;
+    }
+    non_zero_dc |= PACK(nz_dc, 8 - ch * 2);
+    non_zero_ac |= PACK(nz_ac, 8 - ch * 2);
+  }
+  out_t_nz |= PACK(tnz, 20);
+  out_l_nz |= PACK(lnz, 20);
+  mb->nz_ = out_t_nz;
+  left_mb->nz_ = out_l_nz;
+
+  dec->non_zero_ac_ = non_zero_ac;
+  dec->non_zero_ = non_zero_ac | non_zero_dc;
+  mb->skip_ = !dec->non_zero_;
+
+  return 1;
+}
+#undef PACK
+
+//-----------------------------------------------------------------------------
+// Main loop
+
+static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
+  int ok = 1;
+  VP8BitReader* const br = &dec->br_;
+  VP8BitReader* token_br;
+
+  for (dec->mb_y_ = 0; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
+    VP8MB* const left = dec->mb_info_ - 1;
+
+    memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
+
+    left->nz_ = 0;
+    left->dc_nz_ = 0;
+    token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
+
+    for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+      VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+
+      // Note: we don't save segment map (yet), as we don't expect
+      // to decode more than 1 keyframe.
+      if (dec->segment_hdr_.update_map_) {
+        // Hardcoded tree parsing
+        dec->segment_ = !VP8GetBit(br, dec->proba_.segments_[0]) ?
+              VP8GetBit(br, dec->proba_.segments_[1]) :
+          2 + VP8GetBit(br, dec->proba_.segments_[2]);
+      }
+      info->skip_ = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
+
+      VP8ParseIntraMode(br, dec);
+
+      if (!info->skip_) {
+        if (!ParseResiduals(dec, info, token_br)) {
+          ok = 0;
+          break;
+        }
+      } else {
+        left->nz_ = info->nz_ = 0;
+        if (!dec->is_i4x4_) {
+          left->dc_nz_ = info->dc_nz_ = 0;
+        }
+        dec->non_zero_ = 0;
+        dec->non_zero_ac_ = 0;
+      }
+      VP8ReconstructBlock(dec);
+
+      // Store data and save block's filtering params
+      VP8StoreBlock(dec);
+    }
+    if (!ok) {
+      break;
+    }
+    VP8FinishRow(dec, io);
+    if (dec->br_.eof_ || token_br->eof_) {
+      ok = 0;
+      break;
+    }
+  }
+
+  // Finish
+#ifndef ONLY_KEYFRAME_CODE
+  if (!dec->update_proba_) {
+    dec->proba_ = dec->proba_saved_;
+  }
+#endif
+
+  return ok;
+}
+
+// Main entry point
+int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
+  if (dec == NULL) {
+    return 0;
+  }
+  if (io == NULL) {
+    return VP8SetError(dec, 2, "NULL VP8Io parameter in VP8Decode().");
+  }
+
+  if (!dec->ready_) {
+    if (!VP8GetHeaders(dec, io)) {
+      return 0;
+    }
+  }
+  assert(dec->ready_);
+
+  // will allocate memory and prepare everything.
+  if (!VP8InitFrame(dec, io)) {
+    VP8Clear(dec);
+    return VP8SetError(dec, 3, "Allocation failed");
+  }
+
+
+  if (io->setup && !io->setup(io)) {
+    VP8Clear(dec);
+    return VP8SetError(dec, 3, "Frame setup failed");
+  }
+
+  // Main decoding loop
+  {
+    const int ret = ParseFrame(dec, io);
+    if (io->teardown) {
+      io->teardown(io);
+    }
+    if (!ret) {
+      VP8Clear(dec);
+      return VP8SetError(dec, 3, "Frame decoding failed");
+    }
+  }
+
+  dec->ready_ = 0;
+  return 1;
+}
+
+void VP8Clear(VP8Decoder* const dec) {
+  if (dec == NULL) {
+    return;
+  }
+  if (dec->mem_) {
+    free(dec->mem_);
+  }
+  dec->mem_ = NULL;
+  dec->mem_size_ = 0;
+  memset(&dec->br_, 0, sizeof(dec->br_));
+  dec->ready_ = 0;
+}
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -0,0 +1,316 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// VP8 decoder: internal header.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DECODE_VP8I_H_
+#define WEBP_DECODE_VP8I_H_
+
+#include <string.h>     // for memcpy()
+#include "bits.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//-----------------------------------------------------------------------------
+// Various defines and enums
+
+#define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames
+
+// intra prediction modes
+enum { B_DC_PRED = 0,   // 4x4 modes
+       B_TM_PRED,
+       B_VE_PRED,
+       B_HE_PRED,
+       B_RD_PRED,
+       B_VR_PRED,
+       B_LD_PRED,
+       B_VL_PRED,
+       B_HD_PRED,
+       B_HU_PRED,
+       NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED,  // = 10
+
+       // Luma16 or UV modes
+       DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
+       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
+       B_PRED = NUM_BMODES,   // refined I4x4 mode
+
+       // special modes
+       B_DC_PRED_NOTOP = 4,
+       B_DC_PRED_NOLEFT = 5,
+       B_DC_PRED_NOTOPLEFT = 6 };
+
+enum { MB_FEATURE_TREE_PROBS = 3,
+       NUM_MB_SEGMENTS = 4,
+       NUM_REF_LF_DELTAS = 4,
+       NUM_MODE_LF_DELTAS = 4,    // I4x4, ZERO, *, SPLIT
+       MAX_NUM_PARTITIONS = 8,
+       // Probabilities
+       NUM_TYPES = 4,
+       NUM_BANDS = 8,
+       NUM_CTX = 3,
+       NUM_PROBAS = 11,
+       NUM_MV_PROBAS = 19 };
+
+// YUV-cache parameters.
+// Constraints are: We need to store one 16x16 block of luma samples (y),
+// and two 8x8 chroma blocks (u/v). These are better be 16-bytes aligned,
+// in order to be SIMD-friendly. We also need to store the top, left and
+// top-left samples (from previously decoded blocks), along with four
+// extra top-right samples for luma (intra4x4 prediction only).
+// One possible layout is, using 32 * (17 + 9) bytes:
+//
+//   .+------   <- only 1 pixel high
+//   .|yyyyt.
+//   .|yyyyt.
+//   .|yyyyt.
+//   .|yyyy..
+//   .+--.+--   <- only 1 pixel high
+//   .|uu.|vv
+//   .|uu.|vv
+//
+// Every character is a 4x4 block, with legend:
+//  '.' = unused
+//  'y' = y-samples   'u' = u-samples     'v' = u-samples
+//  '|' = left sample,   '-' = top sample,    '+' = top-left sample
+//  't' = extra top-right sample for 4x4 modes
+// With this layout, BPS (=Bytes Per Scan-line) is one cacheline size.
+#define BPS       32    // this is the common stride used by yuv[]
+#define YUV_SIZE (BPS * 17 + BPS * 9)
+#define Y_SIZE   (BPS * 17)
+#define Y_OFF    (BPS * 1 + 8)
+#define U_OFF    (Y_OFF + BPS * 16 + BPS)
+#define V_OFF    (U_OFF + 16)
+
+//-----------------------------------------------------------------------------
+// Headers
+
+typedef struct {
+  uint8_t key_frame_;
+  uint8_t profile_;
+  uint8_t show_;
+  uint32_t partition_length_;
+} VP8FrameHeader;
+
+typedef struct {
+  uint16_t width_;
+  uint16_t height_;
+  uint8_t xscale_;
+  uint8_t yscale_;
+  uint8_t colorspace_;   // 0 = YCbCr
+  uint8_t clamp_type_;
+} VP8PictureHeader;
+
+// segment features
+typedef struct {
+  int use_segment_;
+  int update_map_;        // whether to update the segment map or not
+  int absolute_delta_;    // absolute or delta values for quantizer and filter
+  int8_t quantizer_[NUM_MB_SEGMENTS];        // quantization changes
+  int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
+} VP8SegmentHeader;
+
+// Struct collecting all frame-persistent probabilities.
+typedef struct {
+  uint8_t segments_[MB_FEATURE_TREE_PROBS];
+  // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
+  uint8_t coeffs_[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS];
+#ifndef ONLY_KEYFRAME_CODE
+  uint8_t ymode_[4], uvmode_[3];
+  uint8_t mv_[2][NUM_MV_PROBAS];
+#endif
+} VP8Proba;
+
+// Filter parameters
+typedef struct {
+  int simple_;                  // 0=complex, 1=simple
+  int level_;                   // [0..63]
+  int sharpness_;               // [0..7]
+  int use_lf_delta_;
+  int ref_lf_delta_[NUM_REF_LF_DELTAS];
+  int mode_lf_delta_[NUM_MODE_LF_DELTAS];
+} VP8FilterHeader;
+
+//-----------------------------------------------------------------------------
+// Informations about the macroblocks.
+
+typedef struct {
+  // block type
+  uint8_t skip_:1;
+  // filter specs
+  uint8_t f_level_:6;      // filter strength: 0..63
+  uint8_t f_ilevel_:6;     // inner limit: 1..63
+  uint8_t f_inner_:1;      // do inner filtering?
+  // cbp
+  uint8_t nz_;        // non-zero AC/DC coeffs
+  uint8_t dc_nz_;     // non-zero DC coeffs
+} VP8MB;
+
+// Dequantization matrices
+typedef struct {
+  uint16_t y1_mat_[2], y2_mat_[2], uv_mat_[2];    // [DC / AC]
+} VP8QuantMatrix;
+
+//-----------------------------------------------------------------------------
+// VP8Decoder: the main opaque structure handed over to user
+
+struct VP8Decoder {
+  int status_;    // 0 = OK
+  int ready_;     // true if ready to decode a picture with VP8Decode()
+  const char* error_msg_;  // set when status_ is not OK.
+
+  // Main data source
+  VP8BitReader br_;
+
+  // headers
+  VP8FrameHeader   frm_hdr_;
+  VP8PictureHeader pic_hdr_;
+  VP8FilterHeader  filter_hdr_;
+  VP8SegmentHeader segment_hdr_;
+
+  // dimension, in macroblock units.
+  int mb_w_, mb_h_;
+
+  // number of partitions.
+  int num_parts_;
+  // per-partition boolean decoders.
+  VP8BitReader parts_[MAX_NUM_PARTITIONS];
+
+  // buffer refresh flags
+  //   bit 0: refresh Gold, bit 1: refresh Alt
+  //   bit 2-3: copy to Gold, bit 4-5: copy to Alt
+  //   bit 6: Gold sign bias, bit 7: Alt sign bias
+  //   bit 8: refresh last frame
+  uint32_t buffer_flags_;
+
+  // dequantization (one set of DC/AC dequant factor per segment)
+  VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
+
+  // probabilities
+  VP8Proba proba_;
+  int use_skip_proba_;
+  uint8_t skip_p_;
+#ifndef ONLY_KEYFRAME_CODE
+  uint8_t intra_p_, last_p_, golden_p_;
+  VP8Proba proba_saved_;
+  int update_proba_;
+#endif
+
+  // Boundary data cache and persistent buffers.
+  uint8_t* intra_t_;     // top intra modes values: 4 * mb_w_
+  uint8_t  intra_l_[4];  // left intra modes values
+  uint8_t *y_t_;         // top luma samples: 16 * mb_w_
+  uint8_t *u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
+
+  VP8MB* mb_info_;       // contextual macroblock infos (mb_w_ + 1)
+  uint8_t* yuv_b_;       // main block for Y/U/V (size = YUV_SIZE)
+  int16_t* coeffs_;      // 384 coeffs = (16+8+8) * 4*4
+
+  uint8_t* cache_y_;     // macroblock row for storing unfiltered samples
+  uint8_t* cache_u_;
+  uint8_t* cache_v_;
+  int cache_y_stride_;
+  int cache_uv_stride_;
+
+  // main memory chunk for the above data. Persistent.
+  void* mem_;
+  int mem_size_;
+
+  // Per macroblock non-persistent infos.
+  int mb_x_, mb_y_;       // current position, in macroblock units
+  uint8_t is_i4x4_;       // true if intra4x4
+  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
+  uint8_t uvmode_;        // chroma prediction mode
+  uint8_t segment_;       // block's segment
+
+  // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits
+  // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for
+  // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order.
+  // If the bit is set, the 4x4 block contains some non-zero coefficients.
+  uint32_t non_zero_;
+  uint32_t non_zero_ac_;
+
+  // Filtering side-info
+  int filter_type_;                       // 0=off, 1=simple, 2=complex
+  uint8_t filter_levels_[NUM_MB_SEGMENTS];  // precalculated per-segment
+};
+
+//-----------------------------------------------------------------------------
+// internal functions. Not public.
+
+// in vp8.c
+int VP8SetError(VP8Decoder* const dec, int error, const char *msg);
+
+// in tree.c
+void VP8ResetProba(VP8Proba* const proba);
+void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec);
+void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec);
+
+// in quant.c
+void VP8ParseQuant(VP8Decoder* const dec);
+
+// in frame.c
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
+// Predict a block and add residual
+void VP8ReconstructBlock(VP8Decoder* const dec);
+// Store a block, along with filtering params
+void VP8StoreBlock(VP8Decoder* const dec);
+// Finalize and transmit a complete row
+void VP8FinishRow(VP8Decoder* const dec, VP8Io* io);
+
+// in dsp.c
+typedef void (*VP8Idct)(const int16_t* coeffs, uint8_t* dst);
+extern VP8Idct VP8Transform;
+extern VP8Idct VP8TransformUV;
+extern VP8Idct VP8TransformDC;
+extern VP8Idct VP8TransformDCUV;
+extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+
+// *dst is the destination block, with stride BPS. Boundary samples are
+// assumed accessible when needed.
+typedef void (*VP8PredFunc)(uint8_t *dst);
+extern VP8PredFunc VP8PredLuma16[7];
+extern VP8PredFunc VP8PredChroma8[7];
+extern VP8PredFunc VP8PredLuma4[11];
+
+void VP8DspInit();        // must be called before anything using the above
+void VP8DspInitTables();  // needs to be called no matter what.
+
+// simple filter (only for luma)
+typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
+extern VP8SimpleFilterFunc VP8SimpleVFilter16;
+extern VP8SimpleFilterFunc VP8SimpleHFilter16;
+extern VP8SimpleFilterFunc VP8SimpleVFilter16i;  // filter 3 inner edges
+extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
+
+// regular filter (on both macroblock edges and inner edges)
+typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
+                                  int thresh, int ithresh, int hev_t);
+typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
+                                    int thresh, int ithresh, int hev_t);
+// on outter edge
+extern VP8LumaFilterFunc VP8VFilter16;
+extern VP8LumaFilterFunc VP8HFilter16;
+extern VP8ChromaFilterFunc VP8VFilter8;
+extern VP8ChromaFilterFunc VP8HFilter8;
+
+// on inner edge
+extern VP8LumaFilterFunc VP8VFilter16i;   // filtering 3 inner edges altogether
+extern VP8LumaFilterFunc VP8HFilter16i;
+extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
+extern VP8ChromaFilterFunc VP8HFilter8i;
+
+//-----------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DECODE_VP8I_H_
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@ -0,0 +1,579 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Main decoding functions for WEBP images.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "vp8i.h"
+#include "yuv.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define FANCY_UPSCALING   // undefined to remove fancy upscaling support
+
+//-----------------------------------------------------------------------------
+// RIFF layout is:
+//   0ffset  tag
+//   0...3   "RIFF" 4-byte tag
+//   4...7   size of image data (including metadata) starting at offset 8
+//   8...11  "WEBP"   our form-type signature
+//   12..15  "VP8 ": 4-bytes tags, describing the raw video format used
+//   16..19  size of the raw VP8 image data, starting at offset 20
+//   20....  the VP8 bytes
+// There can be extra chunks after the "VP8 " chunk (ICMT, ICOP, ...)
+// All 32-bits sizes are in little-endian order.
+// Note: chunk data must be padded to multiple of 2 in size
+
+static inline uint32_t get_le32(const uint8_t* const data) {
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+// If a RIFF container is detected, validate it and skip over it.
+static uint32_t CheckRIFFHeader(const uint8_t** data_ptr,
+                                uint32_t *data_size_ptr) {
+  uint32_t chunk_size = 0xffffffffu;
+  if (*data_size_ptr >= 10 + 20 && !memcmp(*data_ptr, "RIFF", 4)) {
+    if (memcmp(*data_ptr + 8, "WEBP", 4)) {
+      return 0;  // wrong image file signature
+    } else {
+      const uint32_t riff_size = get_le32(*data_ptr + 4);
+      if (memcmp(*data_ptr + 12, "VP8 ", 4)) {
+        return 0;   // invalid compression format
+      }
+      chunk_size = get_le32(*data_ptr + 16);
+      if ((chunk_size > riff_size + 8) || (chunk_size & 1)) {
+        return 0;  // inconsistent size information.
+      }
+      // We have a IFF container. Skip it.
+      *data_ptr += 20;
+      *data_size_ptr -= 20;
+    }
+    return chunk_size;
+  }
+  return *data_size_ptr;
+}
+
+//-----------------------------------------------------------------------------
+// Fancy upscaling
+
+typedef enum { MODE_RGB = 0, MODE_RGBA = 1,
+               MODE_BGR = 2, MODE_BGRA = 3,
+               MODE_YUV = 4 } CSP_MODE;
+
+#ifdef FANCY_UPSCALING
+
+// Given samples laid out in a square as:
+//  [a b]
+//  [c d]
+// we interpolate u/v as:
+//  ([9*a + 3*b + 3*c +   d    3*a + 9*b + 3*c +   d] + [8 8]) / 16
+//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
+
+// We process u and v together stashed into 32bit (16bit each).
+#define LOAD_UV(u,v) ((u) | ((v) << 16))
+
+#define UPSCALE_FUNC(FUNC_NAME, FUNC, XSTEP)                                   \
+static inline void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,    \
+                             const uint8_t* top_u, const uint8_t* top_v,       \
+                             const uint8_t* cur_u, const uint8_t* cur_v,       \
+                             uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
+  int x;                                                                       \
+  const int last_pixel_pair = (len - 1) >> 1;                                  \
+  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
+  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
+  if (top_y) {                                                                 \
+    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
+    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
+  }                                                                            \
+  if (bottom_y) {                                                              \
+    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
+    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
+  }                                                                            \
+  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
+    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
+    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
+    /* precompute invariant values associated with first and second diagonals*/\
+    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
+    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
+    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
+    if (top_y) {                                                               \
+      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
+      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
+      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
+           top_dst + (2 * x - 1) * XSTEP);                                     \
+      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
+           top_dst + (2 * x - 0) * XSTEP);                                     \
+    }                                                                          \
+    if (bottom_y) {                                                            \
+      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
+      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
+      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
+           bottom_dst + (2 * x - 1) * XSTEP);                                  \
+      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
+           bottom_dst + (2 * x + 0) * XSTEP);                                  \
+    }                                                                          \
+    tl_uv = t_uv;                                                              \
+    l_uv = uv;                                                                 \
+  }                                                                            \
+  if (!(len & 1)) {                                                            \
+    if (top_y) {                                                               \
+      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
+      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
+           top_dst + (len - 1) * XSTEP);                                       \
+    }                                                                          \
+    if (bottom_y) {                                                            \
+      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
+      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
+           bottom_dst + (len - 1) * XSTEP);                                    \
+    }                                                                          \
+  }                                                                            \
+}
+
+// All variants implemented.
+UPSCALE_FUNC(UpscaleRgbLinePair,  VP8YuvToRgb,  3)
+UPSCALE_FUNC(UpscaleBgrLinePair,  VP8YuvToBgr,  3)
+UPSCALE_FUNC(UpscaleRgbaLinePair, VP8YuvToRgba, 4)
+UPSCALE_FUNC(UpscaleBgraLinePair, VP8YuvToBgra, 4)
+
+// Main driver function.
+static inline
+void UpscaleLinePair(const uint8_t* top_y, const uint8_t* bottom_y,
+                     const uint8_t* top_u, const uint8_t* top_v,
+                     const uint8_t* cur_u, const uint8_t* cur_v,
+                     uint8_t* top_dst, uint8_t* bottom_dst, int len,
+                     CSP_MODE mode) {
+  if (mode == MODE_RGB) {
+    UpscaleRgbLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
+                       top_dst, bottom_dst, len);
+  } else if (mode == MODE_BGR) {
+    UpscaleBgrLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
+                       top_dst, bottom_dst, len);
+  } else if (mode == MODE_RGBA) {
+    UpscaleRgbaLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
+                        top_dst, bottom_dst, len);
+  } else {
+    assert(mode == MODE_BGRA);
+    UpscaleBgraLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
+                        top_dst, bottom_dst, len);
+  }
+}
+
+#undef LOAD_UV
+#undef UPSCALE_FUNC
+
+#endif  // FANCY_UPSCALING
+
+//-----------------------------------------------------------------------------
+// Main conversion driver.
+
+typedef struct {
+  uint8_t* output;      // rgb(a) or luma
+  uint8_t *u, *v;
+  uint8_t *top_y, *top_u, *top_v;
+  int stride;           // rgb(a) stride or luma stride
+  int u_stride;
+  int v_stride;
+  CSP_MODE mode;
+} Params;
+
+static void CustomPut(const VP8Io* io) {
+  Params *p = (Params*)io->opaque;
+  const int w = io->width;
+  const int mb_h = io->mb_h;
+  const int uv_w = (w + 1) / 2;
+  assert(!(io->mb_y & 1));
+
+  if (w <= 0 || mb_h <= 0) return;
+
+  if (p->mode == MODE_YUV) {
+    uint8_t* const y_dst = p->output + io->mb_y * p->stride;
+    uint8_t* const u_dst = p->u + (io->mb_y >> 1) * p->u_stride;
+    uint8_t* const v_dst = p->v + (io->mb_y >> 1) * p->v_stride;
+    int j;
+    for (j = 0; j < mb_h; ++j) {
+      memcpy(y_dst + j * p->stride, io->y + j * io->y_stride, w);
+    }
+    for (j = 0; j < (mb_h + 1) / 2; ++j) {
+      memcpy(u_dst + j * p->u_stride, io->u + j * io->uv_stride, uv_w);
+      memcpy(v_dst + j * p->v_stride, io->v + j * io->uv_stride, uv_w);
+    }
+  } else {
+    uint8_t* dst = p->output + io->mb_y * p->stride;
+    if (io->fancy_upscaling) {
+#ifdef FANCY_UPSCALING
+      const uint8_t* cur_y = io->y;
+      const uint8_t* cur_u = io->u;
+      const uint8_t* cur_v = io->v;
+      const uint8_t* top_u = p->top_u;
+      const uint8_t* top_v = p->top_v;
+      int y = io->mb_y;
+      int y_end = io->mb_y + io->mb_h;
+      if (y == 0) {
+        // First line is special cased. We mirror the u/v samples at boundary.
+        UpscaleLinePair(NULL, cur_y, cur_u, cur_v, cur_u, cur_v,
+                        NULL, dst, w, p->mode);
+      } else {
+        // We can finish the left-over line from previous call
+        UpscaleLinePair(p->top_y, cur_y, top_u, top_v, cur_u, cur_v,
+                        dst - p->stride, dst, w, p->mode);
+      }
+      // Loop over each output pairs of row.
+      for (; y + 2 < y_end; y += 2) {
+        top_u = cur_u;
+        top_v = cur_v;
+        cur_u += io->uv_stride;
+        cur_v += io->uv_stride;
+        dst += 2 * p->stride;
+        cur_y += 2 * io->y_stride;
+        UpscaleLinePair(cur_y - io->y_stride, cur_y,
+                        top_u, top_v, cur_u, cur_v,
+                        dst - p->stride, dst, w, p->mode);
+      }
+      // move to last row
+      cur_y += io->y_stride;
+      if (y_end != io->height) {
+        // Save the unfinished samples for next call (as we're not done yet).
+        memcpy(p->top_y, cur_y, w * sizeof(*p->top_y));
+        memcpy(p->top_u, cur_u, uv_w * sizeof(*p->top_u));
+        memcpy(p->top_v, cur_v, uv_w * sizeof(*p->top_v));
+      } else {
+        // Process the very last row of even-sized picture
+        if (!(y_end & 1)) {
+          UpscaleLinePair(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
+                          dst + p->stride, NULL, w, p->mode);
+        }
+      }
+#else
+      assert(0);  // shouldn't happen.
+#endif
+    } else {
+      // Point-sampling U/V upscaler.
+      int j;
+      for (j = 0; j < mb_h; ++j) {
+        const uint8_t* y_src = io->y + j * io->y_stride;
+        int i;
+        for (i = 0; i < w; ++i) {
+          const int y = y_src[i];
+          const int u = io->u[(j / 2) * io->uv_stride + (i / 2)];
+          const int v = io->v[(j / 2) * io->uv_stride + (i / 2)];
+          if (p->mode == MODE_RGB) {
+            VP8YuvToRgb(y, u, v, dst + i * 3);
+          } else if (p->mode == MODE_BGR) {
+            VP8YuvToBgr(y, u, v, dst + i * 3);
+          } else if (p->mode == MODE_RGBA) {
+            VP8YuvToRgba(y, u, v, dst + i * 4);
+          } else {
+            VP8YuvToBgra(y, u, v, dst + i * 4);
+          }
+        }
+        dst += p->stride;
+      }
+    }
+  }
+}
+
+//-----------------------------------------------------------------------------
+
+static int CustomSetup(VP8Io* io) {
+#ifdef FANCY_UPSCALING
+  Params *p = (Params*)io->opaque;
+  p->top_y = p->top_u = p->top_v = NULL;
+  if (p->mode != MODE_YUV) {
+    const int uv_width = (io->width + 1) >> 1;
+    p->top_y = (uint8_t*)malloc(io->width + 2 * uv_width);
+    if (p->top_y == NULL) {
+      return 0;   // memory error.
+    }
+    p->top_u = p->top_y + io->width;
+    p->top_v = p->top_u + uv_width;
+    io->fancy_upscaling = 1;  // activate fancy upscaling
+  }
+#endif
+  return 1;
+}
+
+static void CustomTeardown(const VP8Io* io) {
+#ifdef FANCY_UPSCALING
+  Params *p = (Params*)io->opaque;
+  if (p->top_y) {
+    free(p->top_y);
+    p->top_y = p->top_u = p->top_v = NULL;
+  }
+#endif
+}
+
+//-----------------------------------------------------------------------------
+// "Into" variants
+
+static uint8_t* DecodeInto(CSP_MODE mode,
+                           const uint8_t* data, uint32_t data_size,
+                           Params* params, int output_size,
+                           int output_u_size, int output_v_size) {
+  VP8Decoder* dec = VP8New();
+  VP8Io io;
+  int ok = 1;
+
+  if (dec == NULL) {
+    return NULL;
+  }
+
+  VP8InitIo(&io);
+  io.data = data;
+  io.data_size = data_size;
+
+  params->mode = mode;
+  io.opaque = params;
+  io.put = CustomPut;
+  io.setup = CustomSetup;
+  io.teardown = CustomTeardown;
+
+  if (!VP8GetHeaders(dec, &io)) {
+    VP8Delete(dec);
+    return NULL;
+  }
+  // check output buffers
+
+  ok &= (params->stride * io.height <= output_size);
+  if (mode == MODE_RGB || mode == MODE_BGR) {
+    ok &= (params->stride >= io.width * 3);
+  } else if (mode == MODE_RGBA || mode == MODE_BGRA) {
+    ok &= (params->stride >= io.width * 4);
+  } else {
+    // some extra checks for U/V
+    const int u_size = params->u_stride * ((io.height + 1) / 2);
+    const int v_size = params->v_stride * ((io.height + 1) / 2);
+    ok &= (params->stride >= io.width);
+    ok &= (params->u_stride >= (io.width + 1) / 2) &&
+          (params->v_stride >= (io.width + 1) / 2);
+    ok &= (u_size <= output_u_size && v_size <= output_v_size);
+  }
+  if (!ok) {
+    VP8Delete(dec);
+    return NULL;
+  }
+
+  if (mode != MODE_YUV) {
+    VP8YUVInit();
+  }
+
+  ok = VP8Decode(dec, &io);
+  VP8Delete(dec);
+  return ok ? params->output : NULL;
+}
+
+uint8_t* WebPDecodeRGBInto(const uint8_t* data, uint32_t data_size,
+                           uint8_t* output, int output_size,
+                           int output_stride) {
+  Params params;
+
+  if (output == NULL) {
+    return NULL;
+  }
+
+  params.output = output;
+  params.stride = output_stride;
+  return DecodeInto(MODE_RGB, data, data_size, &params, output_size, 0, 0);
+}
+
+uint8_t* WebPDecodeRGBAInto(const uint8_t* data, uint32_t data_size,
+                            uint8_t* output, int output_size,
+                            int output_stride) {
+  Params params;
+
+  if (output == NULL) {
+    return NULL;
+  }
+
+  params.output = output;
+  params.stride = output_stride;
+  return DecodeInto(MODE_RGBA, data, data_size, &params, output_size, 0, 0);
+}
+
+uint8_t* WebPDecodeBGRInto(const uint8_t* data, uint32_t data_size,
+                           uint8_t* output, int output_size,
+                           int output_stride) {
+  Params params;
+
+  if (output == NULL) {
+    return NULL;
+  }
+
+  params.output = output;
+  params.stride = output_stride;
+  return DecodeInto(MODE_BGR, data, data_size, &params, output_size, 0, 0);
+}
+
+uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size,
+                            uint8_t* output, int output_size,
+                            int output_stride) {
+  Params params;
+
+  if (output == NULL) {
+    return NULL;
+  }
+
+  params.output = output;
+  params.stride = output_stride;
+  return DecodeInto(MODE_BGRA, data, data_size, &params, output_size, 0, 0);
+}
+
+uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size,
+                           uint8_t* luma, int luma_size, int luma_stride,
+                           uint8_t* u, int u_size, int u_stride,
+                           uint8_t* v, int v_size, int v_stride) {
+  Params params;
+
+  if (luma == NULL) {
+    return NULL;
+  }
+
+  params.output = luma;
+  params.stride = luma_stride;
+  params.u = u;
+  params.u_stride = u_stride;
+  params.v = v;
+  params.v_stride = v_stride;
+  return DecodeInto(MODE_YUV, data, data_size, &params,
+                    luma_size, u_size, v_size);
+}
+
+//-----------------------------------------------------------------------------
+
+static uint8_t* Decode(CSP_MODE mode, const uint8_t* data, uint32_t data_size,
+                       int* width, int* height, Params* params_out) {
+  int w, h, stride;
+  int uv_size = 0;
+  int uv_stride = 0;
+  int size;
+  uint8_t* output;
+  Params params = { 0 };
+
+  if (!WebPGetInfo(data, data_size, &w, &h)) {
+    return NULL;
+  }
+  if (width) *width = w;
+  if (height) *height = h;
+
+  // initialize output buffer, now that dimensions are known.
+  stride = (mode == MODE_RGB || mode == MODE_BGR) ? 3 * w
+             : (mode == MODE_RGBA || mode == MODE_BGRA) ? 4 * w
+             : w;
+  size = stride * h;
+
+  if (mode == MODE_YUV) {
+    uv_stride = (w + 1) / 2;
+    uv_size = uv_stride * ((h + 1) / 2);
+  }
+
+  output = (uint8_t*)malloc(size + 2 * uv_size);
+  if (!output) {
+    return NULL;
+  }
+
+  params.output = output;
+  params.stride = stride;
+  if (mode == MODE_YUV) {
+    params.u = output + size;
+    params.u_stride = uv_stride;
+    params.v = output + size + uv_size;
+    params.v_stride = uv_stride;
+  }
+  if (params_out) *params_out = params;
+  return DecodeInto(mode, data, data_size, &params, size, uv_size, uv_size);
+}
+
+uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
+                       int *width, int *height) {
+  return Decode(MODE_RGB, data, data_size, width, height, NULL);
+}
+
+uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
+                        int *width, int *height) {
+  return Decode(MODE_RGBA, data, data_size, width, height, NULL);
+}
+
+uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
+                       int *width, int *height) {
+  return Decode(MODE_BGR, data, data_size, width, height, NULL);
+}
+
+uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
+                        int *width, int *height) {
+  return Decode(MODE_BGRA, data, data_size, width, height, NULL);
+}
+
+uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
+                       int *width, int *height, uint8_t** u, uint8_t** v,
+                       int *stride, int* uv_stride) {
+  Params params;
+  uint8_t* const out = Decode(MODE_YUV, data, data_size,
+                              width, height, &params);
+
+  if (out) {
+    *u = params.u;
+    *v = params.v;
+    *stride = params.stride;
+    *uv_stride = params.u_stride;
+    assert(params.u_stride == params.v_stride);
+  }
+  return out;
+}
+
+//-----------------------------------------------------------------------------
+// WebPGetInfo()
+
+int WebPGetInfo(const uint8_t* data, uint32_t data_size,
+                int *width, int *height) {
+  const uint32_t chunk_size = CheckRIFFHeader(&data, &data_size);
+  if (!chunk_size) {
+    return 0;         // unsupported RIFF header
+  }
+  // Validate raw video data
+  if (data_size < 10) {
+    return 0;         // not enough data
+  }
+  // check signature
+  if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a) {
+    return 0;         // Wrong signature.
+  } else {
+    const uint32_t bits = data[0] | (data[1] << 8) | (data[2] << 16);
+    const int key_frame = !(bits & 1);
+    const int w = ((data[7] << 8) | data[6]) & 0x3fff;
+    const int h = ((data[9] << 8) | data[8]) & 0x3fff;
+
+    if (!key_frame) {   // Not a keyframe.
+      return 0;
+    }
+
+    if (((bits >> 1) & 7) > 3) {
+      return 0;         // unknown profile
+    }
+    if (!((bits >> 4) & 1)) {
+      return 0;         // first frame is invisible!
+    }
+    if (((bits >> 5)) >= chunk_size) { // partition_length
+      return 0;         // inconsistent size information.
+    }
+
+    if (width) {
+      *width = w;
+    }
+    if (height) {
+      *height = h;
+    }
+
+    return 1;
+  }
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/yuv.c
+++ b/src/dec/yuv.c
@ -0,0 +1,46 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// YUV->RGB conversion function
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "yuv.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+enum { YUV_HALF = 1 << (YUV_FIX - 1) };
+
+int16_t VP8kVToR[256], VP8kUToB[256];
+int32_t VP8kVToG[256], VP8kUToG[256];
+uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
+static int done = 0;
+
+void VP8YUVInit() {
+  int i;
+  if (done) {
+    return;
+  }
+  for (i = 0; i < 256; ++i) {
+    VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
+    VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
+    VP8kVToG[i] = -45773 * (i - 128);
+    VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX;
+  }
+  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
+    const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
+    VP8kClip[i - YUV_RANGE_MIN] = (k < 0) ? 0 : (k > 255) ? 255 : k;
+  }
+  done = 1;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/yuv.h
+++ b/src/dec/yuv.h
@ -0,0 +1,66 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// inline YUV->RGB conversion function
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DECODE_YUV_H_
+#define WEBP_DECODE_YUV_H_
+
+#include "webp/decode_vp8.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+enum { YUV_FIX = 16,                // fixed-point precision
+       YUV_RANGE_MIN = -227,        // min value of r/g/b output
+       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
+};
+extern int16_t VP8kVToR[256], VP8kUToB[256];
+extern int32_t VP8kVToG[256], VP8kUToG[256];
+extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
+inline static void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+                               uint8_t* const rgb) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+  rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+}
+
+inline static void VP8YuvToRgba(int y, int u, int v, uint8_t* const rgba) {
+  VP8YuvToRgb(y, u, v, rgba);
+  rgba[3] = 0xff;
+}
+
+inline static void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+                               uint8_t* const bgr) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+}
+
+inline static void VP8YuvToBgra(int y, int u, int v, uint8_t* const bgra) {
+  VP8YuvToBgr(y, u, v, bgra);
+  bgra[3] = 0xff;
+}
+
+// Must be called before everything, to initialize the tables.
+void VP8YUVInit();
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DECODE_YUV_H_