encoder: switch BPS to 32 instead of 16

This is a first step toward unifying the encoding/decoding cache stride and possibly sharing the prediction functions in dsp/. With this layout, a little space (~7%) is lost to unused samples, but no speed change was observed. Change-Id: I016df8cad41bde5088df3579e6ad65d884ee711e
2025-09-20 01:32:03 +02:00 · 2014-12-04 09:17:18 +01:00
parent 1b66bbe998
commit 57606047ec
4 changed files with 82 additions and 96 deletions
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -444,15 +444,12 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
 // Quantize

 // Layout:
-// +----+
-// |YYYY| 0
-// |YYYY| 4
-// |YYYY| 8
-// |YYYY| 12
-// +----+
-// |UUVV| 16
-// |UUVV| 20
-// +----+
+// +----+----+
+// |YYYY|UUVV| 0
+// |YYYY|UUVV| 4
+// |YYYY|....| 8
+// |YYYY|....| 12
+// +----+----+

 const int VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
@@ -1069,7 +1066,12 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
-  if (dst != dst0) memcpy(dst0, dst, UV_SIZE);
+  if (dst != dst0) {   // copy 16x8 block if needed
+    int i;
+    for (i = 0; i < 8; ++i) {
+      memcpy(dst0 + i * BPS, dst + i * BPS, 2 * 8 * sizeof(*dst0));
+    }
+  }
 }

 //------------------------------------------------------------------------------