From 9904e365a84b7d7fd3c0bbc90dc5cb899301eb37 Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 8 Apr 2015 18:43:46 -0700
Subject: [PATCH] dsp/dec_sse2: DC8uv / DC8uvNoLeft speedup

use psadbw to perform top row summation; left remains in C as repacking
it into a vector to apply the same operation is too costly.

DC8uv: ~19% faster
DC8uvNoLeft: ~12% faster

Change-Id: I707c4f6177a65b5d1f2d3deeca87d2bb740185e2
---
 src/dsp/dec_sse2.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/dsp/dec_sse2.c b/src/dsp/dec_sse2.c
index 9af0f2d0..d99acaa0 100644
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@@ -1208,21 +1208,26 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
 }
 
 static void DC8uv(uint8_t* dst) { // DC
-  int dc0 = 8;
-  int i;
-  for (i = 0; i < 8; ++i) {
-    dc0 += dst[i - BPS] + dst[-1 + i * BPS];
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  const __m128i sum = _mm_sad_epu8(top, zero);
+  int left = 0;
+  int j;
+  for (j = 0; j < 8; ++j) {
+    left += dst[-1 + j * BPS];
+  }
+  {
+    const int DC = _mm_cvtsi128_si32(sum) + left + 8;
+    Put8x8uv(DC >> 4, dst);
   }
-  Put8x8uv(dc0 >> 4, dst);
 }
 
 static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
-  int dc0 = 4;
-  int i;
-  for (i = 0; i < 8; ++i) {
-    dc0 += dst[i - BPS];
-  }
-  Put8x8uv(dc0 >> 3, dst);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  const __m128i sum = _mm_sad_epu8(top, zero);
+  const int DC = _mm_cvtsi128_si32(sum) + 4;
+  Put8x8uv(DC >> 3, dst);
 }
 
 static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples