Provide an SSE2 implementation of CombinedShannonEntropy.

CombinedShannonEntropy accounts for 30% of the time spent in lossless compression.
This implementation speeds up the overall process by 2 to 3%.

Change-Id: I04a71743284c38814fd0726034d51a02b1b6ba8f
This commit is contained in:
Vincent Rabaud
2015-12-11 15:12:19 +01:00
parent 04507dc91f
commit 2835089d6a
3 changed files with 86 additions and 6 deletions

View File

@ -250,6 +250,80 @@ static void HistogramAdd(const VP8LHistogram* const a,
}
}
//------------------------------------------------------------------------------
// Entropy
// Subtracts VP8LFastSLog2(count) from 'acc' unless 'count' is zero.
// Kept as a macro so the four lanes of a group can be unrolled by hand.
#define SUBTRACT_SLOG2_IF_NONZERO(acc, count)          \
  do {                                                 \
    if ((count) != 0) (acc) -= VP8LFastSLog2(count);   \
  } while (0)

// Handles one lane of a group in which X is not entirely zero: when the
// combined count 'xy_count' is non-zero, subtracts its VP8LFastSLog2 from
// 'acc' and then does the same for 'x_count' (if that one is non-zero too).
// The X term is deliberately only looked at when the combined count is
// non-zero.
#define SUBTRACT_SLOG2_XY_LANE(acc, xy_count, x_count) \
  do {                                                 \
    if ((xy_count) != 0) {                             \
      (acc) -= VP8LFastSLog2(xy_count);                \
      SUBTRACT_SLOG2_IF_NONZERO(acc, x_count);         \
    }                                                  \
  } while (0)

// SSE2 version of the combined entropy of the 256-entry tables X and X + Y.
// The result is
//   VP8LFastSLog2(sum(X)) + VP8LFastSLog2(sum(X + Y))
//     - sum_i VP8LFastSLog2(X[i]) - sum_i VP8LFastSLog2(X[i] + Y[i])
// where zero entries are skipped in the per-entry sums. Entries are processed
// in groups of four 32-bit lanes; the accumulation is done in double precision
// and the final value is returned as a float.
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
  const __m128i kZero = _mm_setzero_si128();
  // Per-lane running totals of X and of X + Y; the four lanes of each are
  // folded into a scalar once the main loop is done.
  __m128i x_total_128 = kZero;
  __m128i xy_total_128 = kZero;
  int32_t lanes[4];  // scratch area used to read individual lanes back
  double entropy = 0.;
  int x_total, xy_total;
  int base;

  for (base = 0; base < 256; base += 4) {
    const __m128i x = _mm_loadu_si128((const __m128i*)(X + base));
    const __m128i y = _mm_loadu_si128((const __m128i*)(Y + base));
    // All 16 mask bits are set only when each of the four X lanes equals 0.
    // Testing for that pays off because X is usually sparse (original
    // author's observation).
    const int x_is_all_zero =
        (_mm_movemask_epi8(_mm_cmpeq_epi32(x, kZero)) == 0xFFFF);

    if (x_is_all_zero) {
      // Nothing to add to the X total, and X + Y is just Y for this group.
      xy_total_128 = _mm_add_epi32(xy_total_128, y);
      SUBTRACT_SLOG2_IF_NONZERO(entropy, Y[base + 0]);
      SUBTRACT_SLOG2_IF_NONZERO(entropy, Y[base + 1]);
      SUBTRACT_SLOG2_IF_NONZERO(entropy, Y[base + 2]);
      SUBTRACT_SLOG2_IF_NONZERO(entropy, Y[base + 3]);
    } else {
      const __m128i xy = _mm_add_epi32(x, y);
      xy_total_128 = _mm_add_epi32(xy_total_128, xy);
      x_total_128 = _mm_add_epi32(x_total_128, x);
      // Spill X + Y so that each lane can be inspected individually.
      _mm_storeu_si128((__m128i*)lanes, xy);
      SUBTRACT_SLOG2_XY_LANE(entropy, lanes[0], X[base + 0]);
      SUBTRACT_SLOG2_XY_LANE(entropy, lanes[1], X[base + 1]);
      SUBTRACT_SLOG2_XY_LANE(entropy, lanes[2], X[base + 2]);
      SUBTRACT_SLOG2_XY_LANE(entropy, lanes[3], X[base + 3]);
    }
  }

  // Horizontal sum of the X totals.
  _mm_storeu_si128((__m128i*)lanes, x_total_128);
  x_total = lanes[3] + lanes[2] + lanes[1] + lanes[0];

  // Horizontal sum of the X + Y totals.
  _mm_storeu_si128((__m128i*)lanes, xy_total_128);
  xy_total = lanes[3] + lanes[2] + lanes[1] + lanes[0];

  entropy += VP8LFastSLog2(x_total) + VP8LFastSLog2(xy_total);
  return (float)entropy;
}

#undef SUBTRACT_SLOG2_XY_LANE
#undef SUBTRACT_SLOG2_IF_NONZERO
//------------------------------------------------------------------------------
// Entry point
@ -261,6 +335,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
VP8LCollectColorRedTransforms = CollectColorRedTransforms;
VP8LHistogramAdd = HistogramAdd;
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
}
#else // !WEBP_USE_SSE2