From f262d6120ede2d1088ab1a4712b6db7cbcfee0a1 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Mon, 11 May 2015 22:20:45 +0000 Subject: [PATCH] speed-up SetResidualSSE2 (was unnecessarily complicated) Before: VP8SetResidualCoeffs: checksum = 1127918 elapsed = 475 ms. Change-Id: Ia54bef86c45f9f474622ff16e594bf1da4f67ebd After: VP8SetResidualCoeffs: checksum = 1127918 elapsed = 404 ms. --- src/dsp/cost_sse2.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/dsp/cost_sse2.c b/src/dsp/cost_sse2.c index 5fca4b4a..0cb1c1fa 100644 --- a/src/dsp/cost_sse2.c +++ b/src/dsp/cost_sse2.c @@ -24,24 +24,21 @@ static void SetResidualCoeffsSSE2(const int16_t* const coeffs, VP8Residual* const res) { - const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs); + const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0)); const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8)); - // Use SSE to compare 8 values with a single instruction. + // Use SSE2 to compare 16 values with a single instruction. const __m128i zero = _mm_setzero_si128(); - const __m128i m0 = _mm_cmpeq_epi16(c0, zero); - const __m128i m1 = _mm_cmpeq_epi16(c1, zero); - // Get the comparison results as a bitmask, consisting of two times 16 bits: - // two identical bits for each result. Concatenate both bitmasks to get a - // single 32 bit value. Negate the mask to get the position of entries that - // are not equal to zero. We don't need to mask out least significant bits - // according to res->first, since coeffs[0] is 0 if res->first > 0 - const uint32_t mask = - ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0)); + const __m128i m0 = _mm_packs_epi16(c0, c1); + const __m128i m1 = _mm_cmpeq_epi8(m0, zero); + // Get the comparison results as a bitmask into 16bits. Negate the mask to get + // the position of entries that are not equal to zero. We don't need to mask + // out least significant bits according to res->first, since coeffs[0] is 0 + // if res->first > 0. + const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1); // The position of the most significant non-zero bit indicates the position of - // the last non-zero value. Divide the result by two because __movemask_epi8 - // operates on 8 bit values instead of 16 bit values. + // the last non-zero value. assert(res->first == 0 || coeffs[0] == 0); - res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1; + res->last = mask ? BitsLog2Floor(mask) : -1; res->coeffs = coeffs; }