Merge changes I6eac17e5,I32d2b514

* changes:
  dec_neon: add TM8uv
  dsp: initialize VP8PredChroma8 in VP8DspInit()
2026-04-09 14:22:31 +02:00 · 2014-11-25 15:32:14 -08:00
parent 8e517eca68 9de9074c92
commit f399d30764
3 changed files with 54 additions and 27 deletions
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -463,10 +463,7 @@ const VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
  DC16NoTop, DC16NoLeft, DC16NoTopLeft
 };

-const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
-  DC8uv, TM8uv, VE8uv, HE8uv,
-  DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
-};
+VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];

 //------------------------------------------------------------------------------
 // Edge filtering functions
@@ -721,6 +718,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
  VP8PredLuma4[8] = HD4;
  VP8PredLuma4[9] = HU4;

+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
+
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -1281,33 +1281,48 @@ static void DC4(uint8_t* dst) {    // DC
  }
 }

-static void TM4(uint8_t* dst) {    // TrueMotion
+// TrueMotion (4x4 + 8x8)
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8x8_t TL = vdup_n_u8(dst[-BPS - 1]);  // top-left pixel 'A[-1]'
  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
-  const int16x8_t l0 = ConvertU8ToS16(vld1_u8(dst + 0 * BPS - 1));  // left edge
-  const int16x8_t l1 = ConvertU8ToS16(vld1_u8(dst + 1 * BPS - 1));
-  const int16x8_t l2 = ConvertU8ToS16(vld1_u8(dst + 2 * BPS - 1));
-  const int16x8_t l3 = ConvertU8ToS16(vld1_u8(dst + 3 * BPS - 1));
-  const int16x8_t L0 = vdupq_lane_s16(vget_low_s16(l0), 0);
-  const int16x8_t L1 = vdupq_lane_s16(vget_low_s16(l1), 0);
-  const int16x8_t L2 = vdupq_lane_s16(vget_low_s16(l2), 0);
-  const int16x8_t L3 = vdupq_lane_s16(vget_low_s16(l3), 0);
-  const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
-  const int16x8_t r1 = vaddq_s16(L1, d);
-  const int16x8_t r2 = vaddq_s16(L2, d);
-  const int16x8_t r3 = vaddq_s16(L3, d);
-  // Saturate and store the result.
-  const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
-  const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
-  const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
-  const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
-  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
-  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
-  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
-  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
+  int y;
+  for (y = 0; y < size; y += 4) {
+    // left edge
+    const int16x8_t l0 = ConvertU8ToS16(vld1_u8(dst + 0 * BPS - 1));
+    const int16x8_t l1 = ConvertU8ToS16(vld1_u8(dst + 1 * BPS - 1));
+    const int16x8_t l2 = ConvertU8ToS16(vld1_u8(dst + 2 * BPS - 1));
+    const int16x8_t l3 = ConvertU8ToS16(vld1_u8(dst + 3 * BPS - 1));
+    const int16x8_t L0 = vdupq_lane_s16(vget_low_s16(l0), 0);
+    const int16x8_t L1 = vdupq_lane_s16(vget_low_s16(l1), 0);
+    const int16x8_t L2 = vdupq_lane_s16(vget_low_s16(l2), 0);
+    const int16x8_t L3 = vdupq_lane_s16(vget_low_s16(l3), 0);
+    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
+    const int16x8_t r1 = vaddq_s16(L1, d);
+    const int16x8_t r2 = vaddq_s16(L2, d);
+    const int16x8_t r3 = vaddq_s16(L3, d);
+    // Saturate and store the result.
+    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
+    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
+    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
+    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
+    if (size == 4) {
+      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
+    } else {
+      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
+      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
+      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
+      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
+    }
+    dst += 4 * BPS;
+  }
 }

+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }  // 4x4 TrueMotion
+
 static void VE4(uint8_t* dst) {    // vertical
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
@@ -1371,6 +1386,11 @@ static void LD4(uint8_t* dst) {    // Down-left
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
 }

+//------------------------------------------------------------------------------
+// Chroma
+
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }  // 8x8 TrueMotion
+
 #endif   // WEBP_USE_NEON

 //------------------------------------------------------------------------------
@@ -1407,5 +1427,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8PredLuma4[2] = VE4;
  VP8PredLuma4[4] = RD4;
  VP8PredLuma4[6] = LD4;
+
+  VP8PredChroma8[1] = TM8uv;
 #endif   // WEBP_USE_NEON
 }
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -176,7 +176,7 @@ extern VP8WHT VP8TransformWHT;
 // assumed accessible when needed.
 typedef void (*VP8PredFunc)(uint8_t* dst);
 extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
-extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
 extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];

 // clipping tables (for filtering)