From 57606047ecb05210ef51464473e357c39d9de5bc Mon Sep 17 00:00:00 2001
From: Pascal Massimino <pascal.massimino@gmail.com>
Date: Thu, 4 Dec 2014 09:17:18 +0100
Subject: [PATCH] encoder: switch BPS to 32 instead of 16

this is a first step to unifying encoding/decoding cache stride
and possibly sharing the prediction functions in dsp/

With this layout, there's a little (~7%) space lost with unused samples.
But no speed change was observed.

Change-Id: I016df8cad41bde5088df3579e6ad65d884ee711e
---
 src/dsp/enc_mips32.c      |  25 +++++---
 src/dsp/enc_mips_dsp_r2.c |  13 +++--
 src/enc/quant.c           |  22 +++----
 src/enc/vp8enci.h         | 118 ++++++++++++++------------------------
 4 files changed, 82 insertions(+), 96 deletions(-)

diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c
index c8799c93..2f3fe981 100644
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@@ -769,17 +769,26 @@ extern WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
 #if defined(WEBP_USE_MIPS32)
-  VP8ITransform = ITransform;
+  // TODO(djordje): fix these to use generic BPS instead of hardcoded value 16
+  (void)ITransform;
+  (void)FTransform;
+  (void)Disto4x4;
+  (void)Disto16x16;
+//  VP8ITransform = ITransform;
+//  VP8FTransform = FTransform;
   VP8EncQuantizeBlock = QuantizeBlock;
   VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8FTransform = FTransform;
+//  VP8TDisto4x4 = Disto4x4;
+//  VP8TDisto16x16 = Disto16x16;
 #if !defined(WORK_AROUND_GCC)
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
+  (void)SSE16x16;
+  (void)SSE8x8;
+  (void)SSE16x8;
+  (void)SSE4x4;
+//  VP8SSE16x16 = SSE16x16;
+//  VP8SSE8x8 = SSE8x8;
+//  VP8SSE16x8 = SSE16x8;
+//  VP8SSE4x4 = SSE4x4;
 #endif
 #endif  // WEBP_USE_MIPS32
 }
diff --git a/src/dsp/enc_mips_dsp_r2.c b/src/dsp/enc_mips_dsp_r2.c
index feb8e388..eeac7be4 100644
--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
@@ -318,9 +318,14 @@ extern WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
 #if defined(WEBP_USE_MIPS_DSP_R2)
-  VP8FTransform = FTransform;
-  VP8ITransform = ITransform;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
+  // TODO(djordje): fix these to use generic BPS instead of hardcoded value
+  (void)ITransform;
+  (void)FTransform;
+  (void)Disto4x4;
+  (void)Disto16x16;
+//  VP8FTransform = FTransform;
+//  VP8ITransform = ITransform;
+//  VP8TDisto4x4 = Disto4x4;
+//  VP8TDisto16x16 = Disto16x16;
 #endif  // WEBP_USE_MIPS_DSP_R2
 }
diff --git a/src/enc/quant.c b/src/enc/quant.c
index b107cba8..03cf26a9 100644
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -444,15 +444,12 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
 // Quantize
 
 // Layout:
-// +----+
-// |YYYY| 0
-// |YYYY| 4
-// |YYYY| 8
-// |YYYY| 12
-// +----+
-// |UUVV| 16
-// |UUVV| 20
-// +----+
+// +----+----+
+// |YYYY|UUVV| 0
+// |YYYY|UUVV| 4
+// |YYYY|....| 8
+// |YYYY|....| 12
+// +----+----+
 
 const int VP8Scan[16] = {  // Luma
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
@@ -1069,7 +1066,12 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
   }
   VP8SetIntraUVMode(it, rd->mode_uv);
   AddScore(rd, &rd_best);
-  if (dst != dst0) memcpy(dst0, dst, UV_SIZE);
+  if (dst != dst0) {   // copy 16x8 block if needed
+    int i;
+    for (i = 0; i < 8; ++i) {
+      memcpy(dst0 + i * BPS, dst + i * BPS, 2 * 8 * sizeof(*dst0));
+    }
+  }
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h
index 7194aa6f..2244799a 100644
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -69,64 +69,34 @@ typedef enum {   // Rate-distortion optimization levels
   RD_OPT_TRELLIS_ALL = 3   // trellis-quant for every scoring (much slower)
 } VP8RDLevel;
 
-// YUV-cache parameters. Cache is 16-pixels wide.
-// The original or reconstructed samples can be accessed using VP8Scan[]
+// YUV-cache parameters. Cache is 32-pixels wide.
+// The original or reconstructed samples can be accessed using VP8Scan[].
 // The predicted blocks can be accessed using offsets to yuv_p_ and
-// the arrays VP8*ModeOffsets[];
-//         +----+      YUV Samples area. See VP8Scan[] for accessing the blocks.
-//  Y_OFF  |YYYY| <- original samples  ('yuv_in_')
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-//  U_OFF  |UUVV| V_OFF  (=U_OFF + 8)
-//         |UUVV|
-//         +----+
-//  Y_OFF  |YYYY| <- compressed/decoded samples  ('yuv_out_')
-//         |YYYY|    There are two buffers like this ('yuv_out_'/'yuv_out2_')
-//         |YYYY|
-//         |YYYY|
-//  U_OFF  |UUVV| V_OFF
-//         |UUVV|
-//          x2 (for yuv_out2_)
-//         +----+     Prediction area ('yuv_p_', size = PRED_SIZE)
-// I16DC16 |YYYY|  Intra16 predictions (16x16 block each)
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-// I16TM16 |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-// I16VE16 |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-// I16HE16 |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         +----+  Chroma U/V predictions (16x8 block each)
-// C8DC8   |UUVV|
-//         |UUVV|
-// C8TM8   |UUVV|
-//         |UUVV|
-// C8VE8   |UUVV|
-//         |UUVV|
-// C8HE8   |UUVV|
-//         |UUVV|
-//         +----+  Intra 4x4 predictions (4x4 block each)
-//         |YYYY| I4DC4 I4TM4 I4VE4 I4HE4
-//         |YYYY| I4RD4 I4VR4 I4LD4 I4VL4
-//         |YY..| I4HD4 I4HU4 I4TMP
-//         +----+
-#define BPS       16   // this is the common stride
-#define Y_SIZE   (BPS * 16)
-#define UV_SIZE  (BPS * 8)
-#define YUV_SIZE (Y_SIZE + UV_SIZE)
-#define PRED_SIZE (6 * 16 * BPS + 12 * BPS)
+// the arrays VP8*ModeOffsets[].
+// * YUV Samples area (yuv_in_/yuv_out_/yuv_out2_)
+//   (see VP8Scan[] for accessing the blocks, along with Y_OFF/U_OFF/V_OFF):
+//         +----+----+
+//  Y_OFF  |YYYY|UUVV|
+//  U_OFF  |YYYY|UUVV|
+//  V_OFF  |YYYY|....| <- 25% wasted U/V area
+//         |YYYY|....|
+//         +----+----+
+// * Prediction area ('yuv_p_', size = PRED_SIZE)
+//   Intra16 predictions (16x16 block each, two per row):
+//         |I16DC16|I16TM16|
+//         |I16VE16|I16HE16|
+//   Chroma U/V predictions (16x8 block each, two per row):
+//         |C8DC8|C8TM8|
+//         |C8VE8|C8HE8|
+//   Intra 4x4 predictions (4x4 block each)
+//         |I4DC4 I4TM4 I4VE4 I4HE4|I4RD4 I4VR4 I4LD4 I4VL4|
+//         |I4HD4 I4HU4 I4TMP .....|.......................| <- ~31% wasted
+#define BPS       32   // this is the common stride
+#define YUV_SIZE (BPS * 16)  // 25% lost
+#define PRED_SIZE (32 * BPS + 16 * BPS + 8 * BPS)   // I16+Chroma+I4 preds
 #define Y_OFF    (0)
-#define U_OFF    (Y_SIZE)
-#define V_OFF    (U_OFF + 8)
+#define U_OFF    (16)
+#define V_OFF    (16 + 8)
 #define ALIGN_CST 15
 #define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
 
@@ -138,26 +108,26 @@ extern const int VP8I4ModeOffsets[NUM_BMODES];
 // Layout of prediction blocks
 // intra 16x16
 #define I16DC16 (0 * 16 * BPS)
-#define I16TM16 (1 * 16 * BPS)
-#define I16VE16 (2 * 16 * BPS)
-#define I16HE16 (3 * 16 * BPS)
+#define I16TM16 (I16DC16 + 16)
+#define I16VE16 (1 * 16 * BPS)
+#define I16HE16 (I16VE16 + 16)
 // chroma 8x8, two U/V blocks side by side (hence: 16x8 each)
-#define C8DC8 (4 * 16 * BPS)
-#define C8TM8 (4 * 16 * BPS + 8 * BPS)
-#define C8VE8 (5 * 16 * BPS)
-#define C8HE8 (5 * 16 * BPS + 8 * BPS)
+#define C8DC8 (2 * 16 * BPS)
+#define C8TM8 (C8DC8 + 1 * 16)
+#define C8VE8 (2 * 16 * BPS + 8 * BPS)
+#define C8HE8 (C8VE8 + 1 * 16)
 // intra 4x4
-#define I4DC4 (6 * 16 * BPS +  0)
-#define I4TM4 (6 * 16 * BPS +  4)
-#define I4VE4 (6 * 16 * BPS +  8)
-#define I4HE4 (6 * 16 * BPS + 12)
-#define I4RD4 (6 * 16 * BPS + 4 * BPS +  0)
-#define I4VR4 (6 * 16 * BPS + 4 * BPS +  4)
-#define I4LD4 (6 * 16 * BPS + 4 * BPS +  8)
-#define I4VL4 (6 * 16 * BPS + 4 * BPS + 12)
-#define I4HD4 (6 * 16 * BPS + 8 * BPS +  0)
-#define I4HU4 (6 * 16 * BPS + 8 * BPS +  4)
-#define I4TMP (6 * 16 * BPS + 8 * BPS +  8)
+#define I4DC4 (3 * 16 * BPS +  0)
+#define I4TM4 (I4DC4 +  4)
+#define I4VE4 (I4DC4 +  8)
+#define I4HE4 (I4DC4 + 12)
+#define I4RD4 (I4DC4 + 16)
+#define I4VR4 (I4DC4 + 20)
+#define I4LD4 (I4DC4 + 24)
+#define I4VL4 (I4DC4 + 28)
+#define I4HD4 (3 * 16 * BPS + 4 * BPS)
+#define I4HU4 (I4HD4 + 4)
+#define I4TMP (I4HD4 + 8)
 
 typedef int64_t score_t;     // type used for scores, rate, distortion
 // Note that MAX_COST is not the maximum allowed by sizeof(score_t),