diff --git a/src/dsp/lossless_sse2.c b/src/dsp/lossless_sse2.c
index 1536a475..6ff30bcb 100644
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@@ -99,6 +99,14 @@ static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
   *avg = _mm_sub_epi8(avg1, one);
 }
 
+static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
+  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+  const __m128i sum = _mm_add_epi16(A1, A0);
+  return _mm_srli_epi16(sum, 1);
+}
+
 static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
   __m128i output;
   Average2_uint32(a0, a1, &output);
@@ -106,20 +114,25 @@ static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
 }
 
 static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
-  const __m128i A1 = _mm_cvtsi32_si128(a1);
-  __m128i output, avg1;
-  Average2_uint32(a0, a2, &avg1);
-  Average2_m128i(&avg1, &A1, &output);
-  return _mm_cvtsi128_si32(output);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i avg1 = Average2_uint32_16(a0, a2);
+  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+  const __m128i sum = _mm_add_epi16(avg1, A1);
+  const __m128i avg2 = _mm_srli_epi16(sum, 1);
+  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
+  const uint32_t output = _mm_cvtsi128_si32(A2);
+  return output;
 }
 
 static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, uint32_t a2,
                                      uint32_t a3) {
-  __m128i avg1, avg2, avg3;
-  Average2_uint32(a0, a1, &avg1);
-  Average2_uint32(a2, a3, &avg2);
-  Average2_m128i(&avg1, &avg2, &avg3);
-  return _mm_cvtsi128_si32(avg3);
+  const __m128i avg1 = Average2_uint32_16(a0, a1);
+  const __m128i avg2 = Average2_uint32_16(a2, a3);
+  const __m128i sum = _mm_add_epi16(avg2, avg1);
+  const __m128i avg3 = _mm_srli_epi16(sum, 1);
+  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
+  const uint32_t output = _mm_cvtsi128_si32(A0);
+  return output;
 }
 
 static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {