speed-up SetResidualSSE2

(was unnecessarily complicated)

Before:
VP8SetResidualCoeffs: checksum = 1127918   elapsed = 475 ms.

Change-Id: Ia54bef86c45f9f474622ff16e594bf1da4f67ebd
After:
VP8SetResidualCoeffs: checksum = 1127918   elapsed = 404 ms.
This commit is contained in:
Pascal Massimino 2015-05-11 22:20:45 +00:00 committed by James Zern
parent bf46d0acff
commit f262d6120e

View File

@ -24,24 +24,21 @@
static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
VP8Residual* const res) {
const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
// Use SSE to compare 8 values with a single instruction.
// Use SSE2 to compare 16 values with a single instruction.
const __m128i zero = _mm_setzero_si128();
const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
// Get the comparison results as a bitmask, consisting of two times 16 bits:
// two identical bits for each result. Concatenate both bitmasks to get a
// single 32 bit value. Negate the mask to get the position of entries that
// are not equal to zero. We don't need to mask out least significant bits
// according to res->first, since coeffs[0] is 0 if res->first > 0
const uint32_t mask =
~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0));
const __m128i m0 = _mm_packs_epi16(c0, c1);
const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
// Get the comparison results as a bitmask into 16bits. Negate the mask to get
// the position of entries that are not equal to zero. We don't need to mask
// out least significant bits according to res->first, since coeffs[0] is 0
// if res->first > 0.
const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
// The position of the most significant non-zero bit indicates the position of
// the last non-zero value. Divide the result by two because __movemask_epi8
// operates on 8 bit values instead of 16 bit values.
// the last non-zero value.
assert(res->first == 0 || coeffs[0] == 0);
res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
res->last = mask ? BitsLog2Floor(mask) : -1;
res->coeffs = coeffs;
}