Speed up SetResidualCoeffsSSE2

(was unnecessarily complicated)

Before:
VP8SetResidualCoeffs: checksum = 1127918   elapsed = 475 ms.

Change-Id: Ia54bef86c45f9f474622ff16e594bf1da4f67ebd
After:
VP8SetResidualCoeffs: checksum = 1127918   elapsed = 404 ms.
This commit is contained in:
Pascal Massimino 2015-05-11 22:20:45 +00:00 committed by James Zern
parent bf46d0acff
commit f262d6120e

View File

@@ -24,24 +24,21 @@
static void SetResidualCoeffsSSE2(const int16_t* const coeffs, static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
VP8Residual* const res) { VP8Residual* const res) {
const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs); const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8)); const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
// Use SSE to compare 8 values with a single instruction. // Use SSE2 to compare 16 values with a single instruction.
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i m0 = _mm_cmpeq_epi16(c0, zero); const __m128i m0 = _mm_packs_epi16(c0, c1);
const __m128i m1 = _mm_cmpeq_epi16(c1, zero); const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
// Get the comparison results as a bitmask, consisting of two times 16 bits: // Get the comparison results as a bitmask into 16bits. Negate the mask to get
// two identical bits for each result. Concatenate both bitmasks to get a // the position of entries that are not equal to zero. We don't need to mask
// single 32 bit value. Negate the mask to get the position of entries that // out least significant bits according to res->first, since coeffs[0] is 0
// are not equal to zero. We don't need to mask out least significant bits // if res->first > 0.
// according to res->first, since coeffs[0] is 0 if res->first > 0 const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
const uint32_t mask =
~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0));
// The position of the most significant non-zero bit indicates the position of // The position of the most significant non-zero bit indicates the position of
// the last non-zero value. Divide the result by two because __movemask_epi8 // the last non-zero value.
// operates on 8 bit values instead of 16 bit values.
assert(res->first == 0 || coeffs[0] == 0); assert(res->first == 0 || coeffs[0] == 0);
res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1; res->last = mask ? BitsLog2Floor(mask) : -1;
res->coeffs = coeffs; res->coeffs = coeffs;
} }