NOTE(review): line breaks and indentation below were reconstructed from a
whitespace-collapsed copy of this patch. Intra-line spacing on context lines
is a best guess, so use a whitespace-tolerant mode if context fails to match
(e.g. `git apply --ignore-whitespace` or `patch -l`). The post-image hash on
the "index" line predates the edits made during review.

diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
index 14f300cf..781b1fae 100644
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -1281,33 +1281,52 @@ static void DC4(uint8_t* dst) { // DC
   }
 }
 
-static void TM4(uint8_t* dst) {  // TrueMotion
+// TrueMotion (4x4 + 8x8)
+// Each output pixel is saturate(L[r] + A[c] - A[-1]), where A is the row above
+// the block, L the column to its left and A[-1] the top-left corner.
+// 'size' must be 4 or 8: rows are produced four at a time and each row is
+// written as either 4 bytes (size == 4) or 8 bytes.
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
   const uint8x8_t TL = vdup_n_u8(dst[-BPS - 1]);  // top-left pixel 'A[-1]'
   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
   const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
-  const int16x8_t l0 = ConvertU8ToS16(vld1_u8(dst + 0 * BPS - 1));  // left edge
-  const int16x8_t l1 = ConvertU8ToS16(vld1_u8(dst + 1 * BPS - 1));
-  const int16x8_t l2 = ConvertU8ToS16(vld1_u8(dst + 2 * BPS - 1));
-  const int16x8_t l3 = ConvertU8ToS16(vld1_u8(dst + 3 * BPS - 1));
-  const int16x8_t L0 = vdupq_lane_s16(vget_low_s16(l0), 0);
-  const int16x8_t L1 = vdupq_lane_s16(vget_low_s16(l1), 0);
-  const int16x8_t L2 = vdupq_lane_s16(vget_low_s16(l2), 0);
-  const int16x8_t L3 = vdupq_lane_s16(vget_low_s16(l3), 0);
-  const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
-  const int16x8_t r1 = vaddq_s16(L1, d);
-  const int16x8_t r2 = vaddq_s16(L2, d);
-  const int16x8_t r3 = vaddq_s16(L3, d);
-  // Saturate and store the result.
-  const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
-  const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
-  const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
-  const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
-  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
-  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
-  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
-  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
+  int y;
+  for (y = 0; y < size; y += 4) {
+    // left edge: 8 bytes are loaded per row but only lane 0 is used below.
+    const int16x8_t l0 = ConvertU8ToS16(vld1_u8(dst + 0 * BPS - 1));
+    const int16x8_t l1 = ConvertU8ToS16(vld1_u8(dst + 1 * BPS - 1));
+    const int16x8_t l2 = ConvertU8ToS16(vld1_u8(dst + 2 * BPS - 1));
+    const int16x8_t l3 = ConvertU8ToS16(vld1_u8(dst + 3 * BPS - 1));
+    const int16x8_t L0 = vdupq_lane_s16(vget_low_s16(l0), 0);
+    const int16x8_t L1 = vdupq_lane_s16(vget_low_s16(l1), 0);
+    const int16x8_t L2 = vdupq_lane_s16(vget_low_s16(l2), 0);
+    const int16x8_t L3 = vdupq_lane_s16(vget_low_s16(l3), 0);
+    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
+    const int16x8_t r1 = vaddq_s16(L1, d);
+    const int16x8_t r2 = vaddq_s16(L2, d);
+    const int16x8_t r3 = vaddq_s16(L3, d);
+    // Saturate and store the result.
+    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
+    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
+    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
+    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
+    if (size == 4) {
+      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
+      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
+    } else {
+      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
+      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
+      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
+      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
+    }
+    dst += 4 * BPS;
+  }
 }
 
+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+
 static void VE4(uint8_t* dst) {  // vertical
   // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
   const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
@@ -1371,6 +1390,11 @@ static void LD4(uint8_t* dst) { // Down-left
   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
 }
 
+//------------------------------------------------------------------------------
+// Chroma
+
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+
 #endif  // WEBP_USE_NEON
 
 //------------------------------------------------------------------------------
@@ -1407,5 +1431,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
   VP8PredLuma4[2] = VE4;
   VP8PredLuma4[4] = RD4;
   VP8PredLuma4[6] = LD4;
+
+  VP8PredChroma8[1] = TM8uv;
 #endif  // WEBP_USE_NEON
 }