From 869eaf6c602dc041f3d1a8bb0bc0e29386d1ae89 Mon Sep 17 00:00:00 2001 From: skal Date: Mon, 7 Apr 2014 18:02:25 +0200 Subject: [PATCH] ~30% encoding speedup: use NEON for QuantizeBlock() also revamped the signature to avoid having to pass the 'first' parameter Change-Id: Ief9af1747dcfb5db0700b595d0073cebd57542a5 --- src/dsp/dsp.h | 2 +- src/dsp/enc.c | 5 ++-- src/dsp/enc_mips32.c | 17 ++++------- src/dsp/enc_neon.c | 71 +++++++++++++++++++++++++++++++++++++++++++- src/dsp/enc_sse2.c | 9 +++--- src/enc/quant.c | 7 +++-- 6 files changed, 87 insertions(+), 24 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 4f0666a1..41d48a88 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -88,7 +88,7 @@ extern VP8BlockCopy VP8Copy4x4; // Quantization struct VP8Matrix; // forward declaration typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16], - int n, const struct VP8Matrix* const mtx); + const struct VP8Matrix* const mtx); extern VP8QuantizeBlock VP8EncQuantizeBlock; // specific to 2nd transform: diff --git a/src/dsp/enc.c b/src/dsp/enc.c index a38135e8..66f3a6a9 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -600,9 +600,10 @@ static const uint8_t kZigzag[16] = { // Simple quantization static int QuantizeBlock(int16_t in[16], int16_t out[16], - int n, const VP8Matrix* const mtx) { + const VP8Matrix* const mtx) { int last = -1; - for (; n < 16; ++n) { + int n; + for (n = 0; n < 16; ++n) { const int j = kZigzag[n]; const int sign = (in[j] < 0); const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j]; diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c index a03441ba..15ef0112 100755 --- a/src/dsp/enc_mips32.c +++ b/src/dsp/enc_mips32.c @@ -186,8 +186,7 @@ static void ITransformMIPS32(const uint8_t* ref, const int16_t* in, "sh %[level], "#N"(%[pout]) \n\t" static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16], - int n, const VP8Matrix* const mtx) { - int last; + const VP8Matrix* const mtx) { int temp0, temp1, temp2, temp3, temp4, temp5; int sign, coeff, level, i; int max_level = MAX_LEVEL; @@ -201,9 +200,7 @@ static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16], const uint32_t* ppbias = &mtx->bias_[0]; __asm__ volatile( - "bnez %[n], 1f \n\t" QUANTIZE_ONE( 0, 0, 0) - "1: \n\t" QUANTIZE_ONE( 2, 4, 2) QUANTIZE_ONE( 8, 16, 4) QUANTIZE_ONE(16, 32, 6) @@ -225,7 +222,7 @@ static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16], [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [sign]"=&r"(sign), [coeff]"=&r"(coeff), [level]"=&r"(level) - : [n]"r"(n), [pout]"r"(pout), [ppin]"r"(ppin), + : [pout]"r"(pout), [ppin]"r"(ppin), [ppiq]"r"(ppiq), [max_level]"r"(max_level), [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh), [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq) @@ -233,14 +230,10 @@ static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16], ); // moved out from macro to increase possibility for earlier breaking - last = -1; - for (i = 15; i >= n; i--) { - if (out[i]) { - last = i; - break; - } + for (i = 15; i >= 0; i--) { + if (out[i]) return 1; } - return (last >= 0); + return 0; } #undef QUANTIZE_ONE diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 280caa8d..a3507925 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -17,10 +17,10 @@ #define USE_INTRINSICS // use intrinsics when possible +#include #include #include "../enc/vp8enci.h" - //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) @@ -766,6 +766,72 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { #undef LOAD_LANE_32b +//------------------------------------------------------------------------------ + +// Compilation with gcc4.6.3 is problematic for now. Disable this function then. +#if (__GNUC__ <= 4 && __GNUC_MINOR__ < 8) +#define SKIP_QUANTIZE +#endif + +#if !defined(SKIP_QUANTIZE) + +static int16x8_t Quantize(int16_t* const in, + const VP8Matrix* const mtx, int offset) { + const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); + const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); + const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); + const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); + const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); + + const int16x8_t a = vld1q_s16(in + offset); // in + const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) + const int16x8_t sign = vshrq_n_s16(a, 15); // sign + const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen + const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); + const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); + const uint32x4_t m2 = vhaddq_u32(m0, bias0); + const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 + const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), + vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 + const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); + const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); + const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign + const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q)); + vst1q_s16(in + offset, c4); + assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 + return c3; +} + +static const uint8_t kShuffles[4][8] = { + { 0, 1, 2, 3, 8, 9, 16, 17 }, + { 10, 11, 4, 5, 6, 7, 12, 13 }, + { 18, 19, 24, 25, 26, 27, 20, 21 }, + { 14, 15, 22, 23, 28, 29, 30, 31 } +}; + +static int QuantizeBlock(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + const int16x8_t out0 = Quantize(in, mtx, 0); + const int16x8_t out1 = Quantize(in, mtx, 8); + const uint8x8x4_t all_out = {{ + vreinterpret_u8_s16(vget_low_s16(out0)), + vreinterpret_u8_s16(vget_high_s16(out0)), + vreinterpret_u8_s16(vget_low_s16(out1)), + vreinterpret_u8_s16(vget_high_s16(out1)) }}; + // Zigzag reordering + vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0]))); + vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1]))); + vst1_u8((uint8_t*)(out + 8), vtbl4_u8(all_out, vld1_u8(kShuffles[2]))); + vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); + // test zeros + if (*(uint64_t*)(out + 0) != 0) return 1; + if (*(uint64_t*)(out + 4) != 0) return 1; + if (*(uint64_t*)(out + 8) != 0) return 1; + if (*(uint64_t*)(out + 12) != 0) return 1; + return 0; +} +#endif // SKIP_QUANTIZE + #endif // WEBP_USE_NEON //------------------------------------------------------------------------------ @@ -788,5 +854,8 @@ void VP8EncDspInitNEON(void) { VP8SSE8x8 = SSE8x8; VP8SSE4x4 = SSE4x4; #endif +#if !defined(SKIP_QUANTIZE) + VP8EncQuantizeBlock = QuantizeBlock; +#endif #endif // WEBP_USE_NEON } diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index 0e9fd181..ecebb4b3 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -805,7 +805,7 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, #define QFIX2 0 static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], - int n, int shift, + int shift, const uint16_t* const sharpen, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); @@ -916,18 +916,17 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], } // detect if all 'out' values are zeroes or not - if (n) packed_out = _mm_srli_si128(packed_out, 1); // ignore DC for n == 1 return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); } static int QuantizeBlock(int16_t in[16], int16_t out[16], - int n, const VP8Matrix* const mtx) { - return DoQuantizeBlock(in, out, n, 0, &mtx->sharpen_[0], mtx); + const VP8Matrix* const mtx) { + return DoQuantizeBlock(in, out, 0, &mtx->sharpen_[0], mtx); } static int QuantizeBlockWHT(int16_t in[16], int16_t out[16], const VP8Matrix* const mtx) { - return DoQuantizeBlock(in, out, 0, 0, &mtx->sharpen_[0], mtx); + return DoQuantizeBlock(in, out, 0, &mtx->sharpen_[0], mtx); } #endif // WEBP_USE_SSE2 diff --git a/src/enc/quant.c b/src/enc/quant.c index 609e3910..c8cdc160 100644 --- a/src/enc/quant.c +++ b/src/enc/quant.c @@ -746,7 +746,8 @@ static int ReconstructIntra16(VP8EncIterator* const it, } } else { for (n = 0; n < 16; ++n) { - nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n; + tmp[n][0] = 0; // so that nz is correct below + nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n; } } @@ -777,7 +778,7 @@ static int ReconstructIntra4(VP8EncIterator* const it, nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_, dqm->lambda_trellis_i4_); } else { - nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_); + nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_); } VP8ITransform(ref, tmp, yuv_out, 0); return nz; @@ -812,7 +813,7 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, } } else { for (n = 0; n < 8; ++n) { - nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n; + nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n; } }