Switch ExtraCost to ints and implement it in SSE.

The histograms count the occurrences of len/dist in entropy images.
Those (at most (1<<14) by (1<<14)) are sub-sampled by at least
MIN_HUFFMAN_BITS == 2, hence at most 24 bits in a histogram value.
At most, we multiply by 19 (because the longest histogram is of
size 40 and we do 40>>1, cf code) for the bit cost. So it all fits
in 32 bits.

Change-Id: Ife24b035f54794851ff31f2fac07901f724c6d7f
This commit is contained in:
Vincent Rabaud 2023-05-31 15:41:43 +02:00
parent 15b365083d
commit 828b4ce062
5 changed files with 94 additions and 37 deletions

View File

@ -182,8 +182,8 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
// -----------------------------------------------------------------------------
// Huffman-cost related functions.
typedef float (*VP8LCostFunc)(const uint32_t* population, int length);
typedef float (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length);
typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
int length);
typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
const int Y[256]);

View File

@ -636,20 +636,25 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
//------------------------------------------------------------------------------
static float ExtraCost_C(const uint32_t* population, int length) {
static uint32_t ExtraCost_C(const uint32_t* population, int length) {
int i;
float cost = 0.f;
for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
uint32_t cost = population[4] + population[5];
assert(length % 2 == 0);
for (i = 2; i < length / 2 - 1; ++i) {
cost += i * (population[2 * i + 2] + population[2 * i + 3]);
}
return cost;
}
static float ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
int length) {
int i;
float cost = 0.f;
for (i = 2; i < length - 2; ++i) {
const int xy = X[i + 2] + Y[i + 2];
cost += (i >> 1) * xy;
uint32_t cost = X[4] + Y[4] + X[5] + Y[5];
assert(length % 2 == 0);
for (i = 2; i < length / 2 - 1; ++i) {
const int xy0 = X[2 * i + 2] + Y[2 * i + 2];
const int xy1 = X[2 * i + 3] + Y[2 * i + 3];
cost += i * (xy0 + xy1);
}
return cost;
}

View File

@ -103,8 +103,8 @@ static float FastLog2Slow_MIPS32(uint32_t v) {
// cost += i * *(pop + 1);
// pop += 2;
// }
// return (float)cost;
static float ExtraCost_MIPS32(const uint32_t* const population, int length) {
// return cost;
static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) {
int i, temp0, temp1;
const uint32_t* pop = &population[4];
const uint32_t* const LoopEnd = &population[length];
@ -130,7 +130,7 @@ static float ExtraCost_MIPS32(const uint32_t* const population, int length) {
: "memory", "hi", "lo"
);
return (float)((int64_t)temp0 << 32 | temp1);
return ((int64_t)temp0 << 32 | temp1);
}
// C version of this function:
@ -148,8 +148,8 @@ static float ExtraCost_MIPS32(const uint32_t* const population, int length) {
// pX += 2;
// pY += 2;
// }
// return (float)cost;
static float ExtraCostCombined_MIPS32(const uint32_t* const X,
// return cost;
static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X,
const uint32_t* const Y, int length) {
int i, temp0, temp1, temp2, temp3;
const uint32_t* pX = &X[4];
@ -183,7 +183,7 @@ static float ExtraCostCombined_MIPS32(const uint32_t* const X,
: "memory", "hi", "lo"
);
return (float)((int64_t)temp0 << 32 | temp1);
return ((int64_t)temp0 << 32 | temp1);
}
#define HUFFMAN_COST_PASS \

View File

@ -18,8 +18,53 @@
#include <smmintrin.h>
#include "src/dsp/lossless.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
//------------------------------------------------------------------------------
// Cost operations.
static WEBP_INLINE uint32_t HorizontalSum_SSE41(__m128i cost) {
cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 8));
cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 4));
return _mm_cvtsi128_si32(cost);
}
static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) {
int i;
__m128i cost = _mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]);
assert(length % 8 == 0);
for (i = 8; i + 8 <= length; i += 8) {
const int j = (i - 2) >> 1;
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j);
const __m128i a2 = _mm_hadd_epi32(a0, a1);
const __m128i mul = _mm_mullo_epi32(a2, w);
cost = _mm_add_epi32(mul, cost);
}
return HorizontalSum_SSE41(cost);
}
static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a,
const uint32_t* const b, int length) {
int i;
__m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]),
_mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4]));
assert(length % 8 == 0);
for (i = 8; i + 8 <= length; i += 8) {
const int j = (i - 2) >> 1;
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j);
const __m128i a2 = _mm_hadd_epi32(a0, a1);
const __m128i b2 = _mm_hadd_epi32(b0, b1);
const __m128i mul = _mm_mullo_epi32(_mm_add_epi32(a2, b2), w);
cost = _mm_add_epi32(mul, cost);
}
return HorizontalSum_SSE41(cost);
}
//------------------------------------------------------------------------------
// Subtract-Green Transform
@ -44,6 +89,9 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
//------------------------------------------------------------------------------
// Color Transform
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
@ -143,6 +191,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
extern void VP8LEncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
VP8LExtraCost = ExtraCost_SSE41;
VP8LExtraCostCombined = ExtraCostCombined_SSE41;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;

View File

@ -358,15 +358,17 @@ static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X,
// Estimates the Entropy + Huffman + other block overhead size cost.
float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
return
PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_),
NULL, &p->is_used_[0])
+ PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1])
+ PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2])
+ PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3])
+ PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, &p->is_used_[4])
+ VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
+ VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
return PopulationCost(p->literal_,
VP8LHistogramNumCodes(p->palette_code_bits_), NULL,
&p->is_used_[0]) +
PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1]) +
PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2]) +
PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) +
PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL,
&p->is_used_[4]) +
(float)VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES) +
(float)VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
}
// -----------------------------------------------------------------------------
@ -381,7 +383,7 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
*cost += GetCombinedEntropy(a->literal_, b->literal_,
VP8LHistogramNumCodes(palette_code_bits),
a->is_used_[0], b->is_used_[0], 0);
*cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
*cost += (float)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
b->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES);
if (*cost > cost_threshold) return 0;
@ -417,8 +419,8 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
*cost +=
GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
a->is_used_[4], b->is_used_[4], 0);
*cost +=
VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
*cost += (float)VP8LExtraCostCombined(a->distance_, b->distance_,
NUM_DISTANCE_CODES);
if (*cost > cost_threshold) return 0;
return 1;
@ -506,11 +508,11 @@ static void UpdateHistogramCost(VP8LHistogram* const h) {
PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
const float distance_cost =
PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
(float)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
h->literal_cost_ =
PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
(float)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
h->red_cost_ =
PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
h->blue_cost_ =