fix overread in Intra4Preds_NEON

Extend VP8EncIterator::i4_boundary_ by 3 bytes to avoid Intra4Preds_NEON
reading deeper into the struct (likely padding) when top is positioned
at offset 29. This data is memset with MSan to prevent a warning due to
its incorrect modeling of tbl instructions.

Prior to:
  169dfbf9 disable Intra4Preds_NEON
there was a mismatch in the preprocessor checks for enabling the
function in NEON and removing the C version; NEON used `BPS == 32` while
the C code was removed unconditionally when building for aarch64. This
patch also normalizes those checks to look for `BPS == 32` and `BPS !=
32` as appropriate.

Bug: b:366668849,webp:372109644
Change-Id: Ic9e6ad4b2d844cb446decd63aec0b2676a89c8d0
This commit is contained in:
James Zern 2024-10-08 12:53:47 -07:00
parent 874069042e
commit 78ed683978
4 changed files with 22 additions and 10 deletions

View File

@ -359,8 +359,7 @@ static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst,
//------------------------------------------------------------------------------
// luma 4x4 prediction
// TODO: b/366668849 - Restore this condition after Intra4Preds_NEON is fixed.
#if 1 // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
@ -551,7 +550,7 @@ static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst,
HU4(I4HU4 + dst, top);
}
#endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
#endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
//------------------------------------------------------------------------------
// Metric
@ -798,9 +797,9 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
VP8EncQuantizeBlockWHT = QuantizeBlock_C;
#endif
// TODO: b/366668849 - Move this into the #if after Intra4Preds_NEON is
// fixed.
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
VP8EncPredLuma4 = Intra4Preds_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
VP8EncPredLuma16 = Intra16Preds_C;
#endif

View File

@ -927,8 +927,7 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
#if WEBP_AARCH64
// TODO: b/366668849 - enable Intra4Preds_NEON after fixing overread.
#if 0
#if BPS == 32
#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane) \
do { \
uint8x16_t r; \
@ -1040,7 +1039,7 @@ static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
}
#endif // 0
#endif // BPS == 32
static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
uint8x16_t a = vdupq_n_u8(value);
@ -1212,8 +1211,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8SSE4x4 = SSE4x4_NEON;
#if WEBP_AARCH64
// TODO: b/366668849 - enable Intra4Preds_NEON after fixing overread.
#if 0 // BPS == 32
#if BPS == 32
VP8EncPredLuma4 = Intra4Preds_NEON;
#endif
VP8EncPredLuma16 = Intra16Preds_NEON;

View File

@ -13,6 +13,7 @@
#include <string.h>
#include "src/dsp/cpu.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
@ -425,6 +426,15 @@ void VP8IteratorStartI4(VP8EncIterator* const it) {
it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15];
}
}
#if WEBP_AARCH64 && BPS == 32 && defined(WEBP_MSAN)
// Intra4Preds_NEON() reads 3 uninitialized bytes from i4_boundary_ when top
// is positioned at offset 29 (VP8TopLeftI4[3]). The values are not used
// meaningfully, but due to limitations in MemorySanitizer related to
// modeling of tbl instructions, a warning will be issued. This can be
// removed if MSan is updated to support the instructions. See
// https://issues.webmproject.org/372109644.
memset(it->i4_boundary_ + sizeof(it->i4_boundary_) - 3, 0xaa, 3);
#endif
VP8IteratorNzToBytes(it); // import the non-zero context
}

View File

@ -16,6 +16,7 @@
#include <string.h> // for memcpy()
#include "src/dec/common_dec.h"
#include "src/dsp/cpu.h"
#include "src/dsp/dsp.h"
#include "src/utils/bit_writer_utils.h"
#include "src/utils/thread_utils.h"
@ -233,7 +234,11 @@ typedef struct {
VP8BitWriter* bw_; // current bit-writer
uint8_t* preds_; // intra mode predictors (4x4 blocks)
uint32_t* nz_; // non-zero pattern
#if WEBP_AARCH64 && BPS == 32
uint8_t i4_boundary_[40]; // 32+8 boundary samples needed by intra4x4
#else
uint8_t i4_boundary_[37]; // 32+5 boundary samples needed by intra4x4
#endif
uint8_t* i4_top_; // pointer to the current top boundary sample
int i4_; // current intra4x4 mode being tested
int top_nz_[9]; // top-non-zero context.