diff --git a/src/dsp/enc.c b/src/dsp/enc.c index 20507480..4bef1bab 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -359,8 +359,7 @@ static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst, //------------------------------------------------------------------------------ // luma 4x4 prediction -// TODO: b/366668849 - Restore this condition after Intra4Preds_NEON is fixed. -#if 1 // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 +#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32 #define DST(x, y) dst[(x) + (y) * BPS] #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2)) @@ -551,7 +550,7 @@ static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst, HU4(I4HU4 + dst, top); } -#endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 +#endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32 //------------------------------------------------------------------------------ // Metric @@ -798,9 +797,9 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { VP8EncQuantizeBlockWHT = QuantizeBlock_C; #endif - // TODO: b/366668849 - Move this into the #if after Intra4Preds_NEON is - // fixed. +#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32 VP8EncPredLuma4 = Intra4Preds_C; +#endif #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 VP8EncPredLuma16 = Intra16Preds_C; #endif diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 8b474211..b373245d 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -927,8 +927,7 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32], #if WEBP_AARCH64 -// TODO: b/366668849 - enable Intra4Preds_NEON after fixing overread. -#if 0 +#if BPS == 32 #define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane) \ do { \ uint8x16_t r; \ @@ -1040,7 +1039,7 @@ static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst, vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1)); vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1)); } -#endif // 0 +#endif // BPS == 32 static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) { uint8x16_t a = vdupq_n_u8(value); @@ -1212,8 +1211,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { VP8SSE4x4 = SSE4x4_NEON; #if WEBP_AARCH64 - // TODO: b/366668849 - enable Intra4Preds_NEON after fixing overread. -#if 0 // BPS == 32 +#if BPS == 32 VP8EncPredLuma4 = Intra4Preds_NEON; #endif VP8EncPredLuma16 = Intra16Preds_NEON; diff --git a/src/enc/iterator_enc.c b/src/enc/iterator_enc.c index 4232253c..eb83a327 100644 --- a/src/enc/iterator_enc.c +++ b/src/enc/iterator_enc.c @@ -13,6 +13,7 @@ #include +#include "src/dsp/cpu.h" #include "src/enc/vp8i_enc.h" //------------------------------------------------------------------------------ @@ -425,6 +426,15 @@ void VP8IteratorStartI4(VP8EncIterator* const it) { it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15]; } } +#if WEBP_AARCH64 && BPS == 32 && defined(WEBP_MSAN) + // Intra4Preds_NEON() reads 3 uninitialized bytes from i4_boundary_ when top + // is positioned at offset 29 (VP8TopLeftI4[3]). The values are not used + // meaningfully, but due to limitations in MemorySanitizer related to + // modeling of tbl instructions, a warning will be issued. This can be + // removed if MSan is updated to support the instructions. See + // https://issues.webmproject.org/372109644. + memset(it->i4_boundary_ + sizeof(it->i4_boundary_) - 3, 0xaa, 3); +#endif VP8IteratorNzToBytes(it); // import the non-zero context } diff --git a/src/enc/vp8i_enc.h b/src/enc/vp8i_enc.h index 3a030cc3..b6d00a7c 100644 --- a/src/enc/vp8i_enc.h +++ b/src/enc/vp8i_enc.h @@ -16,6 +16,7 @@ #include // for memcpy() #include "src/dec/common_dec.h" +#include "src/dsp/cpu.h" #include "src/dsp/dsp.h" #include "src/utils/bit_writer_utils.h" #include "src/utils/thread_utils.h" @@ -233,7 +234,11 @@ typedef struct { VP8BitWriter* bw_; // current bit-writer uint8_t* preds_; // intra mode predictors (4x4 blocks) uint32_t* nz_; // non-zero pattern +#if WEBP_AARCH64 && BPS == 32 + uint8_t i4_boundary_[40]; // 32+8 boundary samples needed by intra4x4 +#else uint8_t i4_boundary_[37]; // 32+5 boundary samples needed by intra4x4 +#endif uint8_t* i4_top_; // pointer to the current top boundary sample int i4_; // current intra4x4 mode being tested int top_nz_[9]; // top-non-zero context.