Optimize VP8SetResidualCoeffs.

Reduces WebP lossy encoding time by 5%.

Change-Id: Ia4a2fab0a887aaaf7841ce6d9ee16270d3e15489
This commit is contained in:
skal
2014-06-03 06:44:04 +02:00
parent ac591cf22e
commit 6679f8996f
6 changed files with 107 additions and 44 deletions

View File

@ -17,7 +17,9 @@
#include <stdlib.h> // for abs()
#include <emmintrin.h>
#include "../enc/cost.h"
#include "../enc/vp8enci.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
// Quite useful macro for debugging. Left here for convenience.
@ -929,6 +931,33 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
return DoQuantizeBlock(in, out, 0, &mtx->sharpen_[0], mtx);
}
// Forward declaration.
void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
                              VP8Residual* const res);

// Stores 'coeffs' in 'res' and sets res->last to the index of the last
// non-zero coefficient, ignoring entries located before res->first.
// res->last is set to -1 when no such coefficient exists.
// Exactly 16 values are read from 'coeffs' (two unaligned 128-bit loads).
void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
                              VP8Residual* const res) {
  const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
  // Use SSE2 to compare 8 values with a single instruction.
  const __m128i zero = _mm_setzero_si128();
  const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
  const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
  // Get the comparison results as bitmasks of 16 bits each: two identical bits
  // for each 16-bit comparison result. _mm_movemask_epi8() returns a signed
  // 'int', so convert to unsigned *before* shifting: left-shifting an 'int'
  // that has bit 15 set (which happens whenever coeffs[15] == 0) by 16 is
  // undefined behavior.
  const uint32_t zeros_lo = (uint32_t)_mm_movemask_epi8(m0);
  const uint32_t zeros_hi = (uint32_t)_mm_movemask_epi8(m1);
  // Discards the least significant bits according to res->first (two bits per
  // coefficient, hence the extra shift by one).
  const uint32_t first_mask = -(1U << (res->first << 1));
  // Concatenate both bitmasks to get a single 32 bit value and negate it to
  // get the position of the entries that are not equal to zero.
  const uint32_t mask = ~((zeros_hi << 16) | zeros_lo) & first_mask;
  // The position of the most significant non-zero bit indicates the position of
  // the last non-zero value. Divide the result by two because
  // _mm_movemask_epi8() operates on 8 bit values instead of 16 bit values.
  res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
  res->coeffs = coeffs;
}
#endif // WEBP_USE_SSE2
//------------------------------------------------------------------------------