From 73d361dd5f58f7d6f52ab0beaea129dfe8f392d0 Mon Sep 17 00:00:00 2001
From: skal <pascal.massimino@gmail.com>
Date: Mon, 25 Aug 2014 13:16:14 -0700
Subject: [PATCH] introduce VP8EncQuantize2Blocks to quantize two blocks at a
 time

No speed difference for now. We might reorder the instructions better later,
to speed things up.

Change-Id: I1949525a0b329c7fd861b8dbea7db4b23d37709c
---
 src/dsp/dsp.h        |  5 +++++
 src/dsp/enc.c        | 10 ++++++++++
 src/dsp/enc_mips32.c |  9 +++++++++
 src/dsp/enc_neon.c   |  9 +++++++++
 src/dsp/enc_sse2.c   | 10 ++++++++++
 src/enc/quant.c      | 13 +++++++------
 6 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
index 912fab21..5c0be204 100644
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -117,7 +117,12 @@ extern VP8BlockCopy VP8Copy4x4;
 struct VP8Matrix;   // forward declaration
 typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
                                 const struct VP8Matrix* const mtx);
+// Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
+typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32],
+                                  const struct VP8Matrix* const mtx);
+
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
+extern VP8Quantize2Blocks VP8EncQuantize2Blocks;
 
 // specific to 2nd transform:
 typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
diff --git a/src/dsp/enc.c b/src/dsp/enc.c
index e4ea8cb8..3a17c690 100644
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -625,6 +625,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return (last >= 0);
 }
 
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  // Bit #0 carries the first block's result, bit #1 the second block's.
+  const int nz0 = VP8EncQuantizeBlock(in, out, mtx);
+  const int nz1 = VP8EncQuantizeBlock(in + 16, out + 16, mtx);
+  return nz0 | (nz1 << 1);
+}
+
 static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
                             const VP8Matrix* const mtx) {
   int n, last = -1;
@@ -684,6 +692,7 @@ VP8Metric VP8SSE4x4;
 VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
 VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8Quantize2Blocks VP8EncQuantize2Blocks;
 VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;
 
@@ -711,6 +720,7 @@ void VP8EncDspInit(void) {
   VP8TDisto4x4 = Disto4x4;
   VP8TDisto16x16 = Disto16x16;
   VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
   VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
   VP8Copy4x4 = Copy4x4;
 
diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c
index 6acde8a5..acd18fd6 100644
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@@ -237,6 +237,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return 0;
 }
 
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  // Quantize each 16-coeff half in turn: result bit #k is block #k's flag.
+  const int nz0 = QuantizeBlock(in, out, mtx);
+  const int nz1 = QuantizeBlock(in + 16, out + 16, mtx);
+  return nz0 | (nz1 << 1);
+}
+
 #undef QUANTIZE_ONE
 
 // macro for one horizontal pass in Disto4x4 (TTransform)
@@ -756,6 +764,7 @@ void VP8EncDspInitMIPS32(void) {
 #if defined(WEBP_USE_MIPS32)
   VP8ITransform = ITransform;
   VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
   VP8TDisto4x4 = Disto4x4;
   VP8TDisto16x16 = Disto16x16;
   VP8FTransform = FTransform;
diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c
index 5b79c9c2..3ef9fe36 100644
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@@ -1047,6 +1047,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return 0;
 }
 
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  // First block's flag lands in bit #0, second block's flag in bit #1.
+  const int nz0 = QuantizeBlock(in, out, mtx);
+  const int nz1 = QuantizeBlock(in + 16, out + 16, mtx);
+  return nz0 | (nz1 << 1);
+}
+
 #endif   // !WORK_AROUND_GCC
 
 #endif   // WEBP_USE_NEON
@@ -1072,6 +1080,7 @@ void VP8EncDspInitNEON(void) {
   VP8SSE4x4 = SSE4x4;
 #if !defined(WORK_AROUND_GCC)
   VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
 #endif
 #endif   // WEBP_USE_NEON
 }
diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c
index 9958d9f6..928cf837 100644
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -929,6 +929,15 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
   return DoQuantizeBlock(in, out, NULL, mtx);
 }
 
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  // Same sharpening table for both blocks; bit #k holds block #k's result.
+  const uint16_t* const sharpen = &mtx->sharpen_[0];
+  const int nz0 = DoQuantizeBlock(in, out, sharpen, mtx);
+  const int nz1 = DoQuantizeBlock(in + 16, out + 16, sharpen, mtx);
+  return nz0 | (nz1 << 1);
+}
+
 // Forward declaration.
 void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
                               VP8Residual* const res);
@@ -967,6 +976,7 @@ void VP8EncDspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
   VP8CollectHistogram = CollectHistogram;
   VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
   VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
   VP8ITransform = ITransform;
   VP8FTransform = FTransform;
diff --git a/src/enc/quant.c b/src/enc/quant.c
index 9130a416..9b4f4d73 100644
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -746,12 +746,13 @@ static int ReconstructIntra16(VP8EncIterator* const it,
       }
     }
   } else {
-    for (n = 0; n < 16; ++n) {
+    for (n = 0; n < 16; n += 2) {
       // Zero-out the first coeff, so that: a) nz is correct below, and
       // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
-      tmp[n][0] = 0;
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
-      assert(rd->y_ac_levels[n][0] == 0);
+      tmp[n][0] = tmp[n + 1][0] = 0;
+      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
+      assert(rd->y_ac_levels[n + 0][0] == 0);
+      assert(rd->y_ac_levels[n + 1][0] == 0);
     }
   }
 
@@ -816,8 +817,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
       }
     }
   } else {
-    for (n = 0; n < 8; ++n) {
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
+    for (n = 0; n < 8; n += 2) {
+      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
     }
   }