mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-27 06:08:21 +01:00
1-2% faster quantization in SSE2
The C version is a bit faster too (sub-1% faster on ARM).
Change-Id: I077262042f1d0937aba1ecf15174f2c51bf6cd97
This commit is contained in:
parent
b2fbc36c26
commit
0235d5e44b
@ -632,16 +632,17 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
for (; n < 16; ++n) {
|
||||
const int j = kZigzag[n];
|
||||
const int sign = (in[j] < 0);
|
||||
const int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
|
||||
const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
|
||||
if (coeff > mtx->zthresh_[j]) {
|
||||
const int Q = mtx->q_[j];
|
||||
const int iQ = mtx->iq_[j];
|
||||
const int B = mtx->bias_[j];
|
||||
out[n] = QUANTDIV(coeff, iQ, B);
|
||||
if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
|
||||
if (sign) out[n] = -out[n];
|
||||
in[j] = out[n] * Q;
|
||||
if (out[n]) last = n;
|
||||
const uint32_t Q = mtx->q_[j];
|
||||
const uint32_t iQ = mtx->iq_[j];
|
||||
const uint32_t B = mtx->bias_[j];
|
||||
int level = QUANTDIV(coeff, iQ, B);
|
||||
if (level > MAX_LEVEL) level = MAX_LEVEL;
|
||||
if (sign) level = -level;
|
||||
in[j] = level * Q;
|
||||
out[n] = level;
|
||||
if (level) last = n;
|
||||
} else {
|
||||
out[n] = 0;
|
||||
in[j] = 0;
|
||||
@ -656,17 +657,18 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
|
||||
for (n = 0; n < 16; ++n) {
|
||||
const int j = kZigzag[n];
|
||||
const int sign = (in[j] < 0);
|
||||
const int coeff = sign ? -in[j] : in[j];
|
||||
const uint32_t coeff = sign ? -in[j] : in[j];
|
||||
assert(mtx->sharpen_[j] == 0);
|
||||
if (coeff > mtx->zthresh_[j]) {
|
||||
const int Q = mtx->q_[j];
|
||||
const int iQ = mtx->iq_[j];
|
||||
const int B = mtx->bias_[j];
|
||||
out[n] = QUANTDIV(coeff, iQ, B);
|
||||
if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
|
||||
if (sign) out[n] = -out[n];
|
||||
in[j] = out[n] * Q;
|
||||
if (out[n]) last = n;
|
||||
const uint32_t Q = mtx->q_[j];
|
||||
const uint32_t iQ = mtx->iq_[j];
|
||||
const uint32_t B = mtx->bias_[j];
|
||||
int level = QUANTDIV(coeff, iQ, B);
|
||||
if (level > MAX_LEVEL) level = MAX_LEVEL;
|
||||
if (sign) level = -level;
|
||||
in[j] = level * Q;
|
||||
out[n] = level;
|
||||
if (level) last = n;
|
||||
} else {
|
||||
out[n] = 0;
|
||||
in[j] = 0;
|
||||
|
@ -804,9 +804,11 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
|
||||
// Quantization
|
||||
//
|
||||
|
||||
// Simple quantization
|
||||
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
||||
int n, const VP8Matrix* const mtx) {
|
||||
#define QFIX2 0
|
||||
static WEBP_INLINE int QuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
int n, int shift,
|
||||
const uint16_t* const sharpen,
|
||||
const VP8Matrix* const mtx) {
|
||||
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128i coeff0, coeff8;
|
||||
@ -818,18 +820,14 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
||||
// we can use _mm_load_si128 instead of _mm_loadu_si128.
|
||||
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
|
||||
__m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
|
||||
const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
|
||||
const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
|
||||
const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
|
||||
const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
|
||||
const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
|
||||
const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
|
||||
const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
|
||||
const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
|
||||
|
||||
// sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative)
|
||||
const __m128i sign0 = _mm_srai_epi16(in0, 15);
|
||||
const __m128i sign8 = _mm_srai_epi16(in8, 15);
|
||||
// extract sign(in) (0x0000 if positive, 0xffff if negative)
|
||||
const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
|
||||
const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
|
||||
|
||||
// coeff = abs(in) = (in ^ sign) - sign
|
||||
coeff0 = _mm_xor_si128(in0, sign0);
|
||||
@ -838,36 +836,39 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
||||
coeff8 = _mm_sub_epi16(coeff8, sign8);
|
||||
|
||||
// coeff = abs(in) + sharpen
|
||||
coeff0 = _mm_add_epi16(coeff0, sharpen0);
|
||||
coeff8 = _mm_add_epi16(coeff8, sharpen8);
|
||||
if (sharpen != NULL) {
|
||||
const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&sharpen[0]);
|
||||
const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&sharpen[8]);
|
||||
coeff0 = _mm_add_epi16(coeff0, sharpen0);
|
||||
coeff8 = _mm_add_epi16(coeff8, sharpen8);
|
||||
}
|
||||
|
||||
// out = (coeff * iQ + B) >> QFIX;
|
||||
// out = (coeff * iQ + B) >> (QFIX + QFIX2 - shift)
|
||||
{
|
||||
// doing calculations with 32b precision (QFIX=17)
|
||||
// out = (coeff * iQ)
|
||||
__m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
|
||||
__m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
|
||||
__m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
|
||||
__m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
|
||||
const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
|
||||
const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
|
||||
const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
|
||||
const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
|
||||
__m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
|
||||
__m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
|
||||
__m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
|
||||
__m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
|
||||
// expand bias from 16b to 32b
|
||||
__m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
|
||||
__m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
|
||||
__m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
|
||||
__m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
|
||||
// out = (coeff * iQ + B)
|
||||
const __m128i bias_00 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
|
||||
const __m128i bias_04 = _mm_loadu_si128((__m128i*)&mtx->bias_[4]);
|
||||
const __m128i bias_08 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
|
||||
const __m128i bias_12 = _mm_loadu_si128((__m128i*)&mtx->bias_[12]);
|
||||
out_00 = _mm_add_epi32(out_00, bias_00);
|
||||
out_04 = _mm_add_epi32(out_04, bias_04);
|
||||
out_08 = _mm_add_epi32(out_08, bias_08);
|
||||
out_12 = _mm_add_epi32(out_12, bias_12);
|
||||
// out = (coeff * iQ + B) >> QFIX;
|
||||
out_00 = _mm_srai_epi32(out_00, QFIX);
|
||||
out_04 = _mm_srai_epi32(out_04, QFIX);
|
||||
out_08 = _mm_srai_epi32(out_08, QFIX);
|
||||
out_12 = _mm_srai_epi32(out_12, QFIX);
|
||||
// out = QUANTDIV(coeff, iQ, B, QFIX + QFIX2 - shift)
|
||||
out_00 = _mm_srai_epi32(out_00, QFIX + QFIX2 - shift);
|
||||
out_04 = _mm_srai_epi32(out_04, QFIX + QFIX2 - shift);
|
||||
out_08 = _mm_srai_epi32(out_08, QFIX + QFIX2 - shift);
|
||||
out_12 = _mm_srai_epi32(out_12, QFIX + QFIX2 - shift);
|
||||
|
||||
// pack result as 16b
|
||||
out0 = _mm_packs_epi32(out_00, out_04);
|
||||
@ -916,19 +917,18 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
||||
}
|
||||
|
||||
// detect if all 'out' values are zeroes or not
|
||||
{
|
||||
int32_t tmp[4];
|
||||
_mm_storeu_si128((__m128i*)tmp, packed_out);
|
||||
if (n) {
|
||||
tmp[0] &= ~0xff;
|
||||
}
|
||||
return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
|
||||
}
|
||||
if (n) packed_out = _mm_srli_si128(packed_out, 1); // ignore DC for n == 1
|
||||
return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
|
||||
}
|
||||
|
||||
// SSE2 entry point for simple quantization of one 16-coefficient block.
// Thin wrapper: forwards 'in', 'out', 'n' and 'mtx' unchanged to the shared
// inline QuantizeBlock() helper, passing shift = 0 and the matrix's own
// sharpen_ table as the sharpening input.
// Returns whatever QuantizeBlock() returns.
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
||||
int n, const VP8Matrix* const mtx) {
|
||||
// shift == 0; sharpening values come from mtx->sharpen_.
return QuantizeBlock(in, out, n, 0, &mtx->sharpen_[0], mtx);
|
||||
}
|
||||
|
||||
static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
return QuantizeBlockSSE2(in, out, 0, mtx);
|
||||
return QuantizeBlock(in, out, 0, 0, &mtx->sharpen_[0], mtx);
|
||||
}
|
||||
|
||||
#endif // WEBP_USE_SSE2
|
||||
|
@ -592,13 +592,13 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
|
||||
// traverse trellis.
|
||||
for (n = first; n <= last; ++n) {
|
||||
const int j = kZigzag[n];
|
||||
const int Q = mtx->q_[j];
|
||||
const int iQ = mtx->iq_[j];
|
||||
const int B = BIAS(0x00); // neutral bias
|
||||
const uint32_t Q = mtx->q_[j];
|
||||
const uint32_t iQ = mtx->iq_[j];
|
||||
const uint32_t B = BIAS(0x00); // neutral bias
|
||||
// note: it's important to take sign of the _original_ coeff,
|
||||
// so we don't have to consider level < 0 afterward.
|
||||
const int sign = (in[j] < 0);
|
||||
const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
|
||||
const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
|
||||
int level0 = QUANTDIV(coeff0, iQ, B);
|
||||
if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
|
||||
|
||||
|
@ -166,8 +166,8 @@ typedef int64_t score_t; // type used for scores, rate, distortion
|
||||
#define BIAS(b) ((b) << (QFIX - 8))
|
||||
// Fun fact: this is the _only_ line where we're actually being lossy and
|
||||
// discarding bits.
|
||||
static WEBP_INLINE int QUANTDIV(int n, int iQ, int B) {
|
||||
return (n * iQ + B) >> QFIX;
|
||||
static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
|
||||
return (int)((n * iQ + B) >> QFIX);
|
||||
}
|
||||
|
||||
// size of histogram used by CollectHistogram.
|
||||
@ -236,8 +236,8 @@ typedef struct {
|
||||
typedef struct VP8Matrix {
|
||||
uint16_t q_[16]; // quantizer steps
|
||||
uint16_t iq_[16]; // reciprocals, fixed point.
|
||||
uint16_t bias_[16]; // rounding bias
|
||||
uint16_t zthresh_[16]; // value under which a coefficient is zeroed
|
||||
uint32_t bias_[16]; // rounding bias
|
||||
uint32_t zthresh_[16]; // value below which a coefficient is zeroed
|
||||
uint16_t sharpen_[16]; // frequency boosters for slight sharpening
|
||||
} VP8Matrix;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user