mirror of
https://github.com/webmproject/libwebp.git
synced 2025-02-19 02:12:52 +01:00
faster CombinedShannonEntropy_SSE2
optimized for sparse histograms Change-Id: I54412f5f8fc53d2598964a5be91f6c54ece3f21b
This commit is contained in:
parent
5bd2704e30
commit
fae416179e
@ -232,79 +232,55 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
|
||||
//------------------------------------------------------------------------------
|
||||
// Entropy
|
||||
|
||||
// Checks whether the X or Y contribution is worth computing and adding.
|
||||
// Used in loop unrolling.
|
||||
#define ANALYZE_X_OR_Y(x_or_y, j) \
|
||||
do { \
|
||||
if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
|
||||
} while (0)
|
||||
// TODO(https://crbug.com/webp/499): this function produces different results
|
||||
// from the C code due to use of double/float resulting in output differences
|
||||
// when compared to -noasm.
|
||||
#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
|
||||
|
||||
// Checks whether the X + Y contribution is worth computing and adding.
|
||||
// Used in loop unrolling.
|
||||
#define ANALYZE_XY(j) \
|
||||
do { \
|
||||
if (tmp[j] != 0) { \
|
||||
retval -= VP8LFastSLog2(tmp[j]); \
|
||||
ANALYZE_X_OR_Y(X, j); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#if !(defined(__i386__) || defined(_M_IX86))
|
||||
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
|
||||
int i;
|
||||
double retval = 0.;
|
||||
int sumX, sumXY;
|
||||
int32_t tmp[4];
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
// Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
|
||||
__m128i sumXY_128 = zero;
|
||||
__m128i sumX_128 = zero;
|
||||
int sumX = 0, sumXY = 0;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
for (i = 0; i < 256; i += 4) {
|
||||
const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
|
||||
const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
|
||||
|
||||
// Check if any X is non-zero: this actually provides a speedup as X is
|
||||
// usually sparse.
|
||||
if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
|
||||
const __m128i xy_128 = _mm_add_epi32(x, y);
|
||||
sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
|
||||
|
||||
sumX_128 = _mm_add_epi32(sumX_128, x);
|
||||
|
||||
// Analyze the different X + Y.
|
||||
_mm_storeu_si128((__m128i*)tmp, xy_128);
|
||||
|
||||
ANALYZE_XY(0);
|
||||
ANALYZE_XY(1);
|
||||
ANALYZE_XY(2);
|
||||
ANALYZE_XY(3);
|
||||
} else {
|
||||
// X is fully 0, so only deal with Y.
|
||||
sumXY_128 = _mm_add_epi32(sumXY_128, y);
|
||||
|
||||
ANALYZE_X_OR_Y(Y, 0);
|
||||
ANALYZE_X_OR_Y(Y, 1);
|
||||
ANALYZE_X_OR_Y(Y, 2);
|
||||
ANALYZE_X_OR_Y(Y, 3);
|
||||
for (i = 0; i < 256; i += 16) {
|
||||
const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i + 0));
|
||||
const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i + 0));
|
||||
const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i + 4));
|
||||
const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i + 4));
|
||||
const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i + 8));
|
||||
const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i + 8));
|
||||
const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12));
|
||||
const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12));
|
||||
const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1),
|
||||
_mm_packs_epi32(x2, x3));
|
||||
const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1),
|
||||
_mm_packs_epi32(y2, y3));
|
||||
const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero));
|
||||
int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
|
||||
while (my) {
|
||||
const int32_t j = BitsCtz(my);
|
||||
int xy;
|
||||
if ((mx >> j) & 1) {
|
||||
const int x = X[i + j];
|
||||
sumXY += x;
|
||||
retval -= VP8LFastSLog2(x);
|
||||
}
|
||||
xy = X[i + j] + Y[i + j];
|
||||
sumX += xy;
|
||||
retval -= VP8LFastSLog2(xy);
|
||||
my &= my - 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Sum up sumX_128 to get sumX.
|
||||
_mm_storeu_si128((__m128i*)tmp, sumX_128);
|
||||
sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];
|
||||
|
||||
// Sum up sumXY_128 to get sumXY.
|
||||
_mm_storeu_si128((__m128i*)tmp, sumXY_128);
|
||||
sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];
|
||||
|
||||
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
|
||||
return (float)retval;
|
||||
}
|
||||
#endif // !(defined(__i386__) || defined(_M_IX86))
|
||||
|
||||
#undef ANALYZE_X_OR_Y
|
||||
#undef ANALYZE_XY
|
||||
#else
|
||||
|
||||
#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC // won't be faster
|
||||
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
@ -662,10 +638,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
|
||||
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
|
||||
VP8LAddVector = AddVector_SSE2;
|
||||
VP8LAddVectorEq = AddVectorEq_SSE2;
|
||||
// TODO(https://crbug.com/webp/499): this function produces different results
|
||||
// from the C code due to use of double/float resulting in output differences
|
||||
// when compared to -noasm.
|
||||
#if !(defined(__i386__) || defined(_M_IX86))
|
||||
#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC)
|
||||
VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
|
||||
#endif
|
||||
VP8LVectorMismatch = VectorMismatch_SSE2;
|
||||
|
Loading…
x
Reference in New Issue
Block a user