mirror of https://github.com/webmproject/libwebp.git (synced 2024-12-28 14:38:21 +01:00)
~30% encoding speedup: use NEON for QuantizeBlock()
also revamped the signature to avoid having to pass the 'first' parameter

Change-Id: Ief9af1747dcfb5db0700b595d0073cebd57542a5

This commit is contained in: parent f758af6b73, commit 869eaf6c60
@@ -88,7 +88,7 @@ extern VP8BlockCopy VP8Copy4x4;
 // Quantization
 struct VP8Matrix;   // forward declaration
 typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
-                                int n, const struct VP8Matrix* const mtx);
+                                const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
 
 // specific to 2nd transform:
@@ -600,9 +600,10 @@ static const uint8_t kZigzag[16] = {
 
 // Simple quantization
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         int n, const VP8Matrix* const mtx) {
+                         const VP8Matrix* const mtx) {
   int last = -1;
-  for (; n < 16; ++n) {
+  int n;
+  for (n = 0; n < 16; ++n) {
     const int j = kZigzag[n];
     const int sign = (in[j] < 0);
     const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
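Note: the hunk above only shows the absolute value and the sharpening add; the rest of the per-coefficient math (not visible here) is a fixed-point multiply by the inverse quantizer, which the NEON comments and the QFIX == 17 assertion further down make explicit. The following is a self-contained scalar sketch of that step with illustrative names and values, not the actual loop body (the real code also has a zero-threshold early-out via mtx->zthresh_, omitted here):

#include <stdint.h>
#include <stdio.h>

#define QFIX      17     /* fixed-point precision, cf. the NEON assert below */
#define MAX_LEVEL 2047   /* clamping value, cf. _mm_set1_epi16(MAX_LEVEL) in the SSE2 hunk */

/* Quantize one coefficient: level = min(((|v| + sharpen) * iq + bias) >> QFIX, MAX_LEVEL),
 * then restore the sign and compute the dequantized value (level * q). */
static int quantize_coeff(int16_t v, uint16_t q, uint16_t iq,
                          uint32_t bias, uint16_t sharpen, int16_t* const dequant) {
  const int sign = (v < 0);
  const uint32_t coeff = (uint32_t)(sign ? -v : v) + sharpen;
  int level = (int)((coeff * iq + bias) >> QFIX);
  if (level > MAX_LEVEL) level = MAX_LEVEL;
  if (sign) level = -level;
  *dequant = (int16_t)(level * q);   /* value written back into in[] */
  return level;
}

int main(void) {
  int16_t dq;
  /* q = 40, iq = (1 << QFIX) / q, bias = half a quantization step */
  const int level = quantize_coeff(-123, 40, (1u << QFIX) / 40, 1u << (QFIX - 1), 0, &dq);
  printf("level=%d dequant=%d\n", level, dq);   /* prints level=-3 dequant=-120 */
  return 0;
}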
@@ -186,8 +186,7 @@ static void ITransformMIPS32(const uint8_t* ref, const int16_t* in,
     "sh      %[level],      "#N"(%[pout])               \n\t"
 
 static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16],
-                               int n, const VP8Matrix* const mtx) {
-  int last;
+                               const VP8Matrix* const mtx) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   int sign, coeff, level, i;
   int max_level = MAX_LEVEL;
@@ -201,9 +200,7 @@ static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16],
   const uint32_t* ppbias = &mtx->bias_[0];
 
   __asm__ volatile(
-    "bnez    %[n],      1f                               \n\t"
     QUANTIZE_ONE( 0,  0,  0)
-    "1:                                                  \n\t"
     QUANTIZE_ONE( 2,  4,  2)
     QUANTIZE_ONE( 8, 16,  4)
     QUANTIZE_ONE(16, 32,  6)
@@ -225,7 +222,7 @@ static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16],
       [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [sign]"=&r"(sign), [coeff]"=&r"(coeff),
       [level]"=&r"(level)
-    : [n]"r"(n), [pout]"r"(pout), [ppin]"r"(ppin),
+    : [pout]"r"(pout), [ppin]"r"(ppin),
       [ppiq]"r"(ppiq), [max_level]"r"(max_level),
       [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
       [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
@@ -233,14 +230,10 @@ static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16],
   );
 
   // moved out from macro to increase possibility for earlier breaking
-  last = -1;
-  for (i = 15; i >= n; i--) {
-    if (out[i]) {
-      last = i;
-      break;
-    }
+  for (i = 15; i >= 0; i--) {
+    if (out[i]) return 1;
   }
-  return (last >= 0);
+  return 0;
 }
 
 #undef QUANTIZE_ONE
@@ -17,10 +17,10 @@
 
 #define USE_INTRINSICS   // use intrinsics when possible
 
-
+#include <assert.h>
 #include <arm_neon.h>
 
 #include "../enc/vp8enci.h"
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@@ -766,6 +766,72 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 
 #undef LOAD_LANE_32b
 
+//------------------------------------------------------------------------------
+
+// Compilation with gcc4.6.3 is problematic for now. Disable this function then.
+#if (__GNUC__ <= 4 && __GNUC_MINOR__ < 8)
+#define SKIP_QUANTIZE
+#endif
+
+#if !defined(SKIP_QUANTIZE)
+
+static int16x8_t Quantize(int16_t* const in,
+                          const VP8Matrix* const mtx, int offset) {
+  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
+  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
+  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
+  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
+  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
+
+  const int16x8_t a = vld1q_s16(in + offset);                 // in
+  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));   // coeff = abs(in)
+  const int16x8_t sign = vshrq_n_s16(a, 15);                  // sign
+  const uint16x8_t c = vaddq_u16(b, sharp);                   // + sharpen
+  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
+  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
+  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
+  const uint32x4_t m3 = vhaddq_u32(m1, bias1);   // (coeff * iQ + bias) >> 1
+  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
+                                     vshrn_n_u32(m3, 16));    // QFIX=17 = 16+1
+  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
+  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
+  const int16x8_t c3 = vsubq_s16(c2, sign);                   // restore sign
+  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
+  vst1q_s16(in + offset, c4);
+  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
+  return c3;
+}
+
+static const uint8_t kShuffles[4][8] = {
+  {  0,  1,  2,  3,  8,  9, 16, 17 },
+  { 10, 11,  4,  5,  6,  7, 12, 13 },
+  { 18, 19, 24, 25, 26, 27, 20, 21 },
+  { 14, 15, 22, 23, 28, 29, 30, 31 }
+};
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {
+  const int16x8_t out0 = Quantize(in, mtx, 0);
+  const int16x8_t out1 = Quantize(in, mtx, 8);
+  const uint8x8x4_t all_out = {{
+      vreinterpret_u8_s16(vget_low_s16(out0)),
+      vreinterpret_u8_s16(vget_high_s16(out0)),
+      vreinterpret_u8_s16(vget_low_s16(out1)),
+      vreinterpret_u8_s16(vget_high_s16(out1)) }};
+  // Zigzag reordering
+  vst1_u8((uint8_t*)(out +  0), vtbl4_u8(all_out, vld1_u8(kShuffles[0])));
+  vst1_u8((uint8_t*)(out +  4), vtbl4_u8(all_out, vld1_u8(kShuffles[1])));
+  vst1_u8((uint8_t*)(out +  8), vtbl4_u8(all_out, vld1_u8(kShuffles[2])));
+  vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+  // test zeros
+  if (*(uint64_t*)(out +  0) != 0) return 1;
+  if (*(uint64_t*)(out +  4) != 0) return 1;
+  if (*(uint64_t*)(out +  8) != 0) return 1;
+  if (*(uint64_t*)(out + 12) != 0) return 1;
+  return 0;
+}
+
+#endif   // SKIP_QUANTIZE
+
 #endif   // WEBP_USE_NEON
 
 //------------------------------------------------------------------------------
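Two details of the new NEON path are easy to misread. First, vhaddq_u32 adds its operands at full precision and then halves, and the following vshrn_n_u32(., 16) narrows with a further shift by 16, so the pair implements the (coeff * iq + bias) >> QFIX division with QFIX == 17, which is exactly what the assert guards. Second, kShuffles is the zigzag order of the C fallback's kZigzag table rewritten as byte indices into the 32 bytes produced by the two Quantize() calls (two bytes per int16 coefficient), so the vtbl4_u8 lookups store 'out' already in zigzag order. A standalone sanity-check sketch of both claims (illustrative code, not part of libwebp; the kZigzag values below are the standard VP8 scan order):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
static const uint8_t kShuffles[4][8] = {
  {  0,  1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};

int main(void) {
  int i;
  /* (1) halving-add followed by a narrowing shift of 16 == a single shift by QFIX = 17 */
  for (i = 0; i < 1000; ++i) {
    const uint32_t m = 4000u * (uint32_t)i;        /* stands in for coeff * iq */
    const uint32_t bias = 96u * (uint32_t)i;       /* stands in for mtx->bias_[j] */
    const uint32_t halved = (m + bias) >> 1;       /* vhaddq_u32 */
    assert((halved >> 16) == ((m + bias) >> 17));  /* vshrn_n_u32(., 16) */
  }
  /* (2) shuffle byte pair (2*k, 2*k+1) selects int16 coefficient kZigzag[...] */
  for (i = 0; i < 16; ++i) {
    assert(kShuffles[i / 4][2 * (i % 4) + 0] == 2 * kZigzag[i] + 0);
    assert(kShuffles[i / 4][2 * (i % 4) + 1] == 2 * kZigzag[i] + 1);
  }
  printf("ok\n");
  return 0;
}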
@@ -788,5 +854,8 @@ void VP8EncDspInitNEON(void) {
   VP8SSE8x8 = SSE8x8;
   VP8SSE4x4 = SSE4x4;
 #endif
+#if !defined(SKIP_QUANTIZE)
+  VP8EncQuantizeBlock = QuantizeBlock;
+#endif
 #endif   // WEBP_USE_NEON
 }
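For readers unfamiliar with how these per-architecture variants get picked up: VP8EncQuantizeBlock is a function pointer (see the first hunk), and the DSP init routines overwrite it at startup, as this hunk shows for NEON; the SKIP_QUANTIZE guard simply leaves the portable default in place when the NEON version is compiled out for old gcc. A minimal, self-contained sketch of that dispatch pattern, using made-up names rather than the real libwebp symbols:

#include <stdio.h>

typedef int (*QuantizeFn)(int level);

static int QuantizeC(int level)    { return level / 4; }   /* portable fallback */
static int QuantizeNEON(int level) { return level / 4; }   /* platform-specific drop-in */

static QuantizeFn Quantize = QuantizeC;   /* default binding */

static void DspInit(void) {
#if defined(__ARM_NEON__) && !defined(SKIP_QUANTIZE)
  Quantize = QuantizeNEON;   /* same signature, so call sites never change */
#endif
}

int main(void) {
  DspInit();
  printf("%d\n", Quantize(100));   /* 25, whichever implementation was picked */
  return 0;
}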
@@ -805,7 +805,7 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 
 #define QFIX2 0
 static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
-                                       int n, int shift,
+                                       int shift,
                                        const uint16_t* const sharpen,
                                        const VP8Matrix* const mtx) {
   const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
@@ -916,18 +916,17 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
   }
 
   // detect if all 'out' values are zeroes or not
-  if (n) packed_out = _mm_srli_si128(packed_out, 1);  // ignore DC for n == 1
   return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
 }
 
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         int n, const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, n, 0, &mtx->sharpen_[0], mtx);
+                         const VP8Matrix* const mtx) {
+  return DoQuantizeBlock(in, out, 0, &mtx->sharpen_[0], mtx);
 }
 
 static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
                             const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, 0, 0, &mtx->sharpen_[0], mtx);
+  return DoQuantizeBlock(in, out, 0, &mtx->sharpen_[0], mtx);
 }
 
 #endif   // WEBP_USE_SSE2
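The only behavioural change in the SSE2 path is dropping the 'if (n) packed_out = _mm_srli_si128(packed_out, 1)' fix-up: with 'n' gone, the DC byte always takes part in the all-zero test, and callers that want DC ignored now clear that coefficient before calling (see the ReconstructIntra16 hunk below). The detection idiom itself is unchanged; a small standalone illustration of it (illustrative values, not libwebp code):

#include <emmintrin.h>
#include <stdio.h>

/* returns 1 if any of the 16 bytes of 'v' is non-zero, the same idiom as the
 * 'packed_out' test above */
static int AnyNonZero(__m128i v) {
  const __m128i zero = _mm_setzero_si128();
  return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xffff;
}

int main(void) {
  const __m128i a = _mm_setzero_si128();
  const __m128i b = _mm_insert_epi16(a, 1, 3);      /* set a single 16-bit lane */
  printf("%d %d\n", AnyNonZero(a), AnyNonZero(b));  /* prints 0 1 */
  return 0;
}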
@@ -746,7 +746,8 @@ static int ReconstructIntra16(VP8EncIterator* const it,
     }
   } else {
     for (n = 0; n < 16; ++n) {
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
+      tmp[n][0] = 0;   // so that nz is correct below
+      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
     }
   }
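Why the extra 'tmp[n][0] = 0;' is enough: in 16x16 intra mode the DC coefficients are coded separately through the second (WHT) transform, and the old code excluded them from AC quantization by passing first == 1. With the new signature the whole block is always quantized, so the DC term is cleared instead; that keeps both y_ac_levels and the per-block non-zero flag 'nz' free of any DC contribution, provided a zero input coefficient quantizes back to a zero level. A quick sanity check of that assumption (illustrative matrix values; it also assumes the DC sharpening term is zero and that bias_ < 1 << QFIX, neither of which is shown in this diff):

#include <assert.h>
#include <stdint.h>

#define QFIX 17

int main(void) {
  const uint32_t coeff = 0;                    /* |tmp[n][0]| after clearing */
  const uint32_t iq = 3276, bias = 1u << 16;   /* illustrative matrix values */
  assert(((coeff * iq + bias) >> QFIX) == 0);  /* the DC level stays 0 */
  return 0;
}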
@@ -777,7 +778,7 @@ static int ReconstructIntra4(VP8EncIterator* const it,
     nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                               dqm->lambda_trellis_i4_);
   } else {
-    nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
+    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
   }
   VP8ITransform(ref, tmp, yuv_out, 0);
   return nz;
@@ -812,7 +813,7 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
     }
   } else {
     for (n = 0; n < 8; ++n) {
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
+      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
     }
   }