diff --git a/src/dsp/dec_wasm.c b/src/dsp/dec_wasm.c index e13e7446..7f2ab94e 100644 --- a/src/dsp/dec_wasm.c +++ b/src/dsp/dec_wasm.c @@ -32,6 +32,12 @@ static WEBP_INLINE uint8x16 get_16_bytes(uint8_t* dst) { return a; } +static WEBP_INLINE uint8x16 get_8_bytes(uint8_t* dst) { + uint8x16 a; + memcpy(&a, dst, 8); + return a; +} + static WEBP_INLINE uint8x16 splat_uint8(uint32_t val) { uint8x16 a; a[0] = val; @@ -120,6 +126,83 @@ static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples Put16(0x80, dst); } +//------------------------------------------------------------------------------ +// Chroma + +static WEBP_INLINE uint32_t add_horizontal_8(uint8_t* dst) { + const uint8x16 zero = (uint8x16){0}; + const uint8x16 a = get_8_bytes(dst); + const uint16x8 _a_lbw = (uint16x8)__builtin_shufflevector( + a, zero, 0, 16, 1, 16, 2, 16, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16); + const uint16x8 _a_hbw = (uint16x8)__builtin_shufflevector( + a, zero, 4, 16, 5, 16, 6, 16, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16); + const uint16x8 sum_a = _a_lbw + _a_hbw; + const uint16x8 sum_b = (uint16x8)__builtin_shufflevector( + sum_a, sum_a, 2, 3, 2, 3, 2, 3, 2, 3); + const uint16x8 sum_c = sum_a + sum_b; + const uint16x8 sum_d = (uint16x8)__builtin_shufflevector( + sum_c, sum_c, 1, 1, 1, 1, 1, 1, 1, 1); + const uint16x8 sum_e = sum_c + sum_d; + return sum_e[0] & 0xffff; +} + +static void VE8uv(uint8_t* dst) { // vertical + const uint8x16 top = get_8_bytes(dst - BPS); + int j; + for (j = 0; j < 8; ++j) { + memcpy(dst + j * BPS, &top, 8); + } +} + +static void HE8uv(uint8_t* dst) { // horizontal + int j; + for (j = 8; j > 0; --j) { + const uint8x16 values = splat_uint8(dst[-1]); + memcpy(dst, &values, 8); + dst += BPS; + } +} + +// helper for chroma-DC predictions +static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) { + int j; + const uint8x16 values = splat_uint8(v); + for (j = 0; j < 8; ++j) { + memcpy(dst + j * BPS, &values, 8); + } +} + +static void DC8uv(uint8_t* dst) { // DC + int left = 0; + int j; + const uint32_t sum = add_horizontal_8(dst - BPS); + for (j = 0; j < 8; ++j) { + left += dst[-1 + j * BPS]; + } + { + const int DC = sum + left + 8; + Put8x8uv(DC >> 4, dst); + } +} + +static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples + const uint32_t DC = 4 + add_horizontal_8(dst - BPS); + Put8x8uv(DC >> 3, dst); +} + +static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples + int dc0 = 4; + int i; + for (i = 0; i < 8; ++i) { + dc0 += dst[-1 + i * BPS]; + } + Put8x8uv(dc0 >> 3, dst); +} + +static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing + Put8x8uv(0x80, dst); +} + //------------------------------------------------------------------------------ // Entry point @@ -132,6 +215,13 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitWASM(void) { VP8PredLuma16[4] = DC16NoTop; VP8PredLuma16[5] = DC16NoLeft; VP8PredLuma16[6] = DC16NoTopLeft; + + VP8PredChroma8[0] = DC8uv; + VP8PredChroma8[2] = VE8uv; + VP8PredChroma8[3] = HE8uv; + VP8PredChroma8[4] = DC8uvNoTop; + VP8PredChroma8[5] = DC8uvNoLeft; + VP8PredChroma8[6] = DC8uvNoTopLeft; } #else // !WEBP_USE_WASM