diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h index de60d95d..0bf10a1a 100644 --- a/src/dsp/lossless.h +++ b/src/dsp/lossless.h @@ -182,9 +182,9 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16]; // ----------------------------------------------------------------------------- // Huffman-cost related functions. -typedef float (*VP8LCostFunc)(const uint32_t* population, int length); -typedef float (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y, - int length); +typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length); +typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y, + int length); typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256], const int Y[256]); diff --git a/src/dsp/lossless_enc.c b/src/dsp/lossless_enc.c index cde12806..997d56c2 100644 --- a/src/dsp/lossless_enc.c +++ b/src/dsp/lossless_enc.c @@ -636,20 +636,25 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits, //------------------------------------------------------------------------------ -static float ExtraCost_C(const uint32_t* population, int length) { +static uint32_t ExtraCost_C(const uint32_t* population, int length) { int i; - float cost = 0.f; - for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2]; + uint32_t cost = population[4] + population[5]; + assert(length % 2 == 0); + for (i = 2; i < length / 2 - 1; ++i) { + cost += i * (population[2 * i + 2] + population[2 * i + 3]); + } return cost; } -static float ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, - int length) { +static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, + int length) { int i; - float cost = 0.f; - for (i = 2; i < length - 2; ++i) { - const int xy = X[i + 2] + Y[i + 2]; - cost += (i >> 1) * xy; + uint32_t cost = X[4] + Y[4] + X[5] + Y[5]; + assert(length % 2 == 0); + for (i = 2; i < length / 2 - 1; ++i) { + const int xy0 = X[2 * i + 2] + Y[2 * i + 2]; + const int xy1 = X[2 * i + 3] + Y[2 * i + 3]; + cost += i * (xy0 + xy1); } return cost; } diff --git a/src/dsp/lossless_enc_mips32.c b/src/dsp/lossless_enc_mips32.c index 639f7866..e10f12da 100644 --- a/src/dsp/lossless_enc_mips32.c +++ b/src/dsp/lossless_enc_mips32.c @@ -103,8 +103,8 @@ static float FastLog2Slow_MIPS32(uint32_t v) { // cost += i * *(pop + 1); // pop += 2; // } -// return (float)cost; -static float ExtraCost_MIPS32(const uint32_t* const population, int length) { +// return cost; +static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) { int i, temp0, temp1; const uint32_t* pop = &population[4]; const uint32_t* const LoopEnd = &population[length]; @@ -130,7 +130,7 @@ static float ExtraCost_MIPS32(const uint32_t* const population, int length) { : "memory", "hi", "lo" ); - return (float)((int64_t)temp0 << 32 | temp1); + return ((int64_t)temp0 << 32 | temp1); } // C version of this function: @@ -148,9 +148,9 @@ static float ExtraCost_MIPS32(const uint32_t* const population, int length) { // pX += 2; // pY += 2; // } -// return (float)cost; -static float ExtraCostCombined_MIPS32(const uint32_t* const X, - const uint32_t* const Y, int length) { +// return cost; +static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X, + const uint32_t* const Y, int length) { int i, temp0, temp1, temp2, temp3; const uint32_t* pX = &X[4]; const uint32_t* pY = &Y[4]; @@ -183,7 +183,7 @@ static float ExtraCostCombined_MIPS32(const uint32_t* const X, : "memory", "hi", "lo" ); - return (float)((int64_t)temp0 << 32 | temp1); + return ((int64_t)temp0 << 32 | temp1); } #define HUFFMAN_COST_PASS \ diff --git a/src/dsp/lossless_enc_sse41.c b/src/dsp/lossless_enc_sse41.c index ad358a6f..7ab83c26 100644 --- a/src/dsp/lossless_enc_sse41.c +++ b/src/dsp/lossless_enc_sse41.c @@ -18,8 +18,53 @@ #include #include "src/dsp/lossless.h" -// For sign-extended multiplying constants, pre-shifted by 5: -#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) +//------------------------------------------------------------------------------ +// Cost operations. + +static WEBP_INLINE uint32_t HorizontalSum_SSE41(__m128i cost) { + cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 8)); + cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 4)); + return _mm_cvtsi128_si32(cost); +} + +static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) { + int i; + __m128i cost = _mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]); + assert(length % 8 == 0); + + for (i = 8; i + 8 <= length; i += 8) { + const int j = (i - 2) >> 1; + const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); + const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j); + const __m128i a2 = _mm_hadd_epi32(a0, a1); + const __m128i mul = _mm_mullo_epi32(a2, w); + cost = _mm_add_epi32(mul, cost); + } + return HorizontalSum_SSE41(cost); +} + +static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a, + const uint32_t* const b, int length) { + int i; + __m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]), + _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4])); + assert(length % 8 == 0); + + for (i = 8; i + 8 <= length; i += 8) { + const int j = (i - 2) >> 1; + const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); + const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]); + const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]); + const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j); + const __m128i a2 = _mm_hadd_epi32(a0, a1); + const __m128i b2 = _mm_hadd_epi32(b0, b1); + const __m128i mul = _mm_mullo_epi32(_mm_add_epi32(a2, b2), w); + cost = _mm_add_epi32(mul, cost); + } + return HorizontalSum_SSE41(cost); +} //------------------------------------------------------------------------------ // Subtract-Green Transform @@ -44,6 +89,9 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data, //------------------------------------------------------------------------------ // Color Transform +// For sign-extended multiplying constants, pre-shifted by 5: +#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) + #define MK_CST_16(HI, LO) \ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) @@ -143,6 +191,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride, extern void VP8LEncDspInitSSE41(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) { + VP8LExtraCost = ExtraCost_SSE41; + VP8LExtraCostCombined = ExtraCostCombined_SSE41; VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41; VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41; VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41; diff --git a/src/enc/histogram_enc.c b/src/enc/histogram_enc.c index 8418def2..9e1d18f1 100644 --- a/src/enc/histogram_enc.c +++ b/src/enc/histogram_enc.c @@ -358,15 +358,17 @@ static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X, // Estimates the Entropy + Huffman + other block overhead size cost. float VP8LHistogramEstimateBits(VP8LHistogram* const p) { - return - PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), - NULL, &p->is_used_[0]) - + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1]) - + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2]) - + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) - + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, &p->is_used_[4]) - + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES) - + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES); + return PopulationCost(p->literal_, + VP8LHistogramNumCodes(p->palette_code_bits_), NULL, + &p->is_used_[0]) + + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1]) + + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2]) + + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) + + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, + &p->is_used_[4]) + + (float)VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, + NUM_LENGTH_CODES) + + (float)VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES); } // ----------------------------------------------------------------------------- @@ -381,9 +383,9 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a, *cost += GetCombinedEntropy(a->literal_, b->literal_, VP8LHistogramNumCodes(palette_code_bits), a->is_used_[0], b->is_used_[0], 0); - *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES, - b->literal_ + NUM_LITERAL_CODES, - NUM_LENGTH_CODES); + *cost += (float)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES, + b->literal_ + NUM_LITERAL_CODES, + NUM_LENGTH_CODES); if (*cost > cost_threshold) return 0; if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM && @@ -417,8 +419,8 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a, *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES, a->is_used_[4], b->is_used_[4], 0); - *cost += - VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES); + *cost += (float)VP8LExtraCostCombined(a->distance_, b->distance_, + NUM_DISTANCE_CODES); if (*cost > cost_threshold) return 0; return 1; @@ -506,11 +508,11 @@ static void UpdateHistogramCost(VP8LHistogram* const h) { PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]); const float distance_cost = PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) + - VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES); + (float)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES); const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_); h->literal_cost_ = PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) + - VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES); + (float)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES); h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]); h->blue_cost_ =