mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-27 06:08:21 +01:00
Switch ExtraCost to ints and implement it in SSE.
The histograms count the occurrences of len/dist in entropy images. Those images (at most (1<<14) by (1<<14) pixels, i.e. 28 bits) are sub-sampled by at least MIN_HUFFMAN_BITS == 2 bits in each dimension, hence at most 28 - 2*2 = 24 bits in a histogram value. For the bit cost, we multiply by at most 19 (because the longest histogram is of size 40 and we do 40>>1, cf. the code), which is less than 1<<5. So it all fits in 32 bits.

Change-Id: Ife24b035f54794851ff31f2fac07901f724c6d7f
This commit is contained in:
parent
15b365083d
commit
828b4ce062
@ -182,9 +182,9 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
|
|||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
// Huffman-cost related functions.
|
// Huffman-cost related functions.
|
||||||
|
|
||||||
typedef float (*VP8LCostFunc)(const uint32_t* population, int length);
|
typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length);
|
||||||
typedef float (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
|
typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
|
||||||
int length);
|
int length);
|
||||||
typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
|
typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
|
||||||
const int Y[256]);
|
const int Y[256]);
|
||||||
|
|
||||||
|
@ -636,20 +636,25 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
|
|||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
static float ExtraCost_C(const uint32_t* population, int length) {
|
static uint32_t ExtraCost_C(const uint32_t* population, int length) {
|
||||||
int i;
|
int i;
|
||||||
float cost = 0.f;
|
uint32_t cost = population[4] + population[5];
|
||||||
for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
|
assert(length % 2 == 0);
|
||||||
|
for (i = 2; i < length / 2 - 1; ++i) {
|
||||||
|
cost += i * (population[2 * i + 2] + population[2 * i + 3]);
|
||||||
|
}
|
||||||
return cost;
|
return cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
static float ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
|
static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
|
||||||
int length) {
|
int length) {
|
||||||
int i;
|
int i;
|
||||||
float cost = 0.f;
|
uint32_t cost = X[4] + Y[4] + X[5] + Y[5];
|
||||||
for (i = 2; i < length - 2; ++i) {
|
assert(length % 2 == 0);
|
||||||
const int xy = X[i + 2] + Y[i + 2];
|
for (i = 2; i < length / 2 - 1; ++i) {
|
||||||
cost += (i >> 1) * xy;
|
const int xy0 = X[2 * i + 2] + Y[2 * i + 2];
|
||||||
|
const int xy1 = X[2 * i + 3] + Y[2 * i + 3];
|
||||||
|
cost += i * (xy0 + xy1);
|
||||||
}
|
}
|
||||||
return cost;
|
return cost;
|
||||||
}
|
}
|
||||||
|
@ -103,8 +103,8 @@ static float FastLog2Slow_MIPS32(uint32_t v) {
|
|||||||
// cost += i * *(pop + 1);
|
// cost += i * *(pop + 1);
|
||||||
// pop += 2;
|
// pop += 2;
|
||||||
// }
|
// }
|
||||||
// return (float)cost;
|
// return cost;
|
||||||
static float ExtraCost_MIPS32(const uint32_t* const population, int length) {
|
static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) {
|
||||||
int i, temp0, temp1;
|
int i, temp0, temp1;
|
||||||
const uint32_t* pop = &population[4];
|
const uint32_t* pop = &population[4];
|
||||||
const uint32_t* const LoopEnd = &population[length];
|
const uint32_t* const LoopEnd = &population[length];
|
||||||
@ -130,7 +130,7 @@ static float ExtraCost_MIPS32(const uint32_t* const population, int length) {
|
|||||||
: "memory", "hi", "lo"
|
: "memory", "hi", "lo"
|
||||||
);
|
);
|
||||||
|
|
||||||
return (float)((int64_t)temp0 << 32 | temp1);
|
return ((int64_t)temp0 << 32 | temp1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// C version of this function:
|
// C version of this function:
|
||||||
@ -148,9 +148,9 @@ static float ExtraCost_MIPS32(const uint32_t* const population, int length) {
|
|||||||
// pX += 2;
|
// pX += 2;
|
||||||
// pY += 2;
|
// pY += 2;
|
||||||
// }
|
// }
|
||||||
// return (float)cost;
|
// return cost;
|
||||||
static float ExtraCostCombined_MIPS32(const uint32_t* const X,
|
static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X,
|
||||||
const uint32_t* const Y, int length) {
|
const uint32_t* const Y, int length) {
|
||||||
int i, temp0, temp1, temp2, temp3;
|
int i, temp0, temp1, temp2, temp3;
|
||||||
const uint32_t* pX = &X[4];
|
const uint32_t* pX = &X[4];
|
||||||
const uint32_t* pY = &Y[4];
|
const uint32_t* pY = &Y[4];
|
||||||
@ -183,7 +183,7 @@ static float ExtraCostCombined_MIPS32(const uint32_t* const X,
|
|||||||
: "memory", "hi", "lo"
|
: "memory", "hi", "lo"
|
||||||
);
|
);
|
||||||
|
|
||||||
return (float)((int64_t)temp0 << 32 | temp1);
|
return ((int64_t)temp0 << 32 | temp1);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HUFFMAN_COST_PASS \
|
#define HUFFMAN_COST_PASS \
|
||||||
|
@ -18,8 +18,53 @@
|
|||||||
#include <smmintrin.h>
|
#include <smmintrin.h>
|
||||||
#include "src/dsp/lossless.h"
|
#include "src/dsp/lossless.h"
|
||||||
|
|
||||||
// For sign-extended multiplying constants, pre-shifted by 5:
|
//------------------------------------------------------------------------------
|
||||||
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
|
// Cost operations.
|
||||||
|
|
||||||
|
static WEBP_INLINE uint32_t HorizontalSum_SSE41(__m128i cost) {
|
||||||
|
cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 8));
|
||||||
|
cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 4));
|
||||||
|
return _mm_cvtsi128_si32(cost);
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) {
|
||||||
|
int i;
|
||||||
|
__m128i cost = _mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]);
|
||||||
|
assert(length % 8 == 0);
|
||||||
|
|
||||||
|
for (i = 8; i + 8 <= length; i += 8) {
|
||||||
|
const int j = (i - 2) >> 1;
|
||||||
|
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
|
||||||
|
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
|
||||||
|
const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j);
|
||||||
|
const __m128i a2 = _mm_hadd_epi32(a0, a1);
|
||||||
|
const __m128i mul = _mm_mullo_epi32(a2, w);
|
||||||
|
cost = _mm_add_epi32(mul, cost);
|
||||||
|
}
|
||||||
|
return HorizontalSum_SSE41(cost);
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a,
|
||||||
|
const uint32_t* const b, int length) {
|
||||||
|
int i;
|
||||||
|
__m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]),
|
||||||
|
_mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4]));
|
||||||
|
assert(length % 8 == 0);
|
||||||
|
|
||||||
|
for (i = 8; i + 8 <= length; i += 8) {
|
||||||
|
const int j = (i - 2) >> 1;
|
||||||
|
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
|
||||||
|
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
|
||||||
|
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
|
||||||
|
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
|
||||||
|
const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j);
|
||||||
|
const __m128i a2 = _mm_hadd_epi32(a0, a1);
|
||||||
|
const __m128i b2 = _mm_hadd_epi32(b0, b1);
|
||||||
|
const __m128i mul = _mm_mullo_epi32(_mm_add_epi32(a2, b2), w);
|
||||||
|
cost = _mm_add_epi32(mul, cost);
|
||||||
|
}
|
||||||
|
return HorizontalSum_SSE41(cost);
|
||||||
|
}
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Subtract-Green Transform
|
// Subtract-Green Transform
|
||||||
@ -44,6 +89,9 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
|
|||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Color Transform
|
// Color Transform
|
||||||
|
|
||||||
|
// For sign-extended multiplying constants, pre-shifted by 5:
|
||||||
|
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
|
||||||
|
|
||||||
#define MK_CST_16(HI, LO) \
|
#define MK_CST_16(HI, LO) \
|
||||||
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
|
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
|
||||||
|
|
||||||
@ -143,6 +191,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
|
|||||||
extern void VP8LEncDspInitSSE41(void);
|
extern void VP8LEncDspInitSSE41(void);
|
||||||
|
|
||||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
|
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
|
||||||
|
VP8LExtraCost = ExtraCost_SSE41;
|
||||||
|
VP8LExtraCostCombined = ExtraCostCombined_SSE41;
|
||||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
|
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
|
||||||
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
|
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
|
||||||
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
|
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
|
||||||
|
@ -358,15 +358,17 @@ static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X,
|
|||||||
|
|
||||||
// Estimates the Entropy + Huffman + other block overhead size cost.
|
// Estimates the Entropy + Huffman + other block overhead size cost.
|
||||||
float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
|
float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
|
||||||
return
|
return PopulationCost(p->literal_,
|
||||||
PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_),
|
VP8LHistogramNumCodes(p->palette_code_bits_), NULL,
|
||||||
NULL, &p->is_used_[0])
|
&p->is_used_[0]) +
|
||||||
+ PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1])
|
PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1]) +
|
||||||
+ PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2])
|
PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2]) +
|
||||||
+ PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3])
|
PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) +
|
||||||
+ PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, &p->is_used_[4])
|
PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL,
|
||||||
+ VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
|
&p->is_used_[4]) +
|
||||||
+ VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
|
(float)VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES,
|
||||||
|
NUM_LENGTH_CODES) +
|
||||||
|
(float)VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
|
||||||
}
|
}
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
@ -381,9 +383,9 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
|
|||||||
*cost += GetCombinedEntropy(a->literal_, b->literal_,
|
*cost += GetCombinedEntropy(a->literal_, b->literal_,
|
||||||
VP8LHistogramNumCodes(palette_code_bits),
|
VP8LHistogramNumCodes(palette_code_bits),
|
||||||
a->is_used_[0], b->is_used_[0], 0);
|
a->is_used_[0], b->is_used_[0], 0);
|
||||||
*cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
|
*cost += (float)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
|
||||||
b->literal_ + NUM_LITERAL_CODES,
|
b->literal_ + NUM_LITERAL_CODES,
|
||||||
NUM_LENGTH_CODES);
|
NUM_LENGTH_CODES);
|
||||||
if (*cost > cost_threshold) return 0;
|
if (*cost > cost_threshold) return 0;
|
||||||
|
|
||||||
if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
|
if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
|
||||||
@ -417,8 +419,8 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
|
|||||||
*cost +=
|
*cost +=
|
||||||
GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
|
GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
|
||||||
a->is_used_[4], b->is_used_[4], 0);
|
a->is_used_[4], b->is_used_[4], 0);
|
||||||
*cost +=
|
*cost += (float)VP8LExtraCostCombined(a->distance_, b->distance_,
|
||||||
VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
|
NUM_DISTANCE_CODES);
|
||||||
if (*cost > cost_threshold) return 0;
|
if (*cost > cost_threshold) return 0;
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
@ -506,11 +508,11 @@ static void UpdateHistogramCost(VP8LHistogram* const h) {
|
|||||||
PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
|
PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
|
||||||
const float distance_cost =
|
const float distance_cost =
|
||||||
PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
|
PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
|
||||||
VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
|
(float)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
|
||||||
const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
|
const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
|
||||||
h->literal_cost_ =
|
h->literal_cost_ =
|
||||||
PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
|
PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
|
||||||
VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
|
(float)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
|
||||||
h->red_cost_ =
|
h->red_cost_ =
|
||||||
PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
|
PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
|
||||||
h->blue_cost_ =
|
h->blue_cost_ =
|
||||||
|
Loading…
Reference in New Issue
Block a user