From f9bbc2a034aa1943364881c4eadbf68433002350 Mon Sep 17 00:00:00 2001 From: skal <pascal.massimino@gmail.com> Date: Tue, 8 Oct 2013 22:05:38 +0200 Subject: [PATCH] Special-case sparse transform If the number of non-zero coeffs is <= 3, use a simplified transform for luma. Change-Id: I78a1252704228d21720d4bc1221252c84338d9c8 --- src/dec/frame.c | 70 ++++++++++++++++++++------------------ src/dec/vp8.c | 36 ++++++++++---------- src/dec/vp8i.h | 15 ++++---- src/dsp/dec.c | 24 +++++++++++++ src/dsp/dec_sse2.c | 85 +++++++++++++++++++++++++++++++++++++--------- src/dsp/dsp.h | 1 + 6 files changed, 159 insertions(+), 72 deletions(-) diff --git a/src/dec/frame.c b/src/dec/frame.c index 9051b567..b8a56fd0 100644 --- a/src/dec/frame.c +++ b/src/dec/frame.c @@ -544,6 +544,34 @@ static void Copy32b(uint8_t* dst, uint8_t* src) { memcpy(dst, src, 4); } +static void DoTransform(uint32_t bits, const int16_t* const src, + uint8_t* const dst) { + switch (bits >> 30) { + case 3: + VP8Transform(src, dst, 0); + break; + case 2: + VP8TransformAC3(src, dst); + break; + case 1: + VP8TransformDC(src, dst); + break; + default: + break; + } +} + +static void DoUVTransform(uint32_t bits, const int16_t* const src, + uint8_t* const dst) { + if (bits & 0xff) { // any non-zero coeff at all? + if (bits & 0xaa) { // any non-zero AC coefficient? 
+ VP8TransformUV(src, dst); // note we don't use the AC3 variant for U/V + } else { + VP8TransformDCUV(src, dst); + } + } +} + void VP8ReconstructBlock(const VP8Decoder* const dec) { int j; uint8_t* const y_dst = dec->yuv_b_ + Y_OFF; @@ -578,6 +606,7 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) { // bring top samples into the cache VP8TopSamples* const top_yuv = dec->yuv_t_ + dec->mb_x_; const int16_t* const coeffs = block->coeffs_; + uint32_t bits = block->non_zero_y_; int n; if (dec->mb_y_ > 0) { @@ -595,7 +624,6 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) { // predict and add residuals if (block->is_i4x4_) { // 4x4 uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16); - uint32_t bits = (block->non_zero_ & 0xffff) | (block->non_zero_ac_ << 16); if (dec->mb_y_ > 0) { if (dec->mb_x_ >= dec->mb_w_ - 1) { // on rightmost border @@ -608,53 +636,29 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) { top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0]; // predict and add residuals for all 4x4 blocks in turn. 
- for (n = 0; n < 16; ++n, bits <<= 1) { + for (n = 0; n < 16; ++n, bits <<= 2) { uint8_t* const dst = y_dst + kScan[n]; VP8PredLuma4[block->imodes_[n]](dst); - if (bits & (1UL << 31)) { - VP8Transform(coeffs + n * 16, dst, 0); - } else if (bits & (1UL << 15)) { // only DC is present - VP8TransformDC(coeffs + n * 16, dst); - } + DoTransform(bits, coeffs + n * 16, dst); } } else { // 16x16 const int pred_func = CheckMode(dec->mb_x_, dec->mb_y_, block->imodes_[0]); - uint32_t bits = (block->non_zero_ & 0xffff) | (block->non_zero_ac_ << 16); VP8PredLuma16[pred_func](y_dst); - if (bits & 0xffff) { - for (n = 0; n < 16; ++n, bits <<= 1) { - uint8_t* const dst = y_dst + kScan[n]; - if (bits & (1UL << 31)) { - VP8Transform(coeffs + n * 16, dst, 0); - } else if (bits & (1UL << 15)) { // only DC is present - VP8TransformDC(coeffs + n * 16, dst); - } + if (bits != 0) { + for (n = 0; n < 16; ++n, bits <<= 2) { + DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]); } } } { // Chroma + const uint32_t bits_uv = block->non_zero_uv_; const int pred_func = CheckMode(dec->mb_x_, dec->mb_y_, block->uvmode_); VP8PredChroma8[pred_func](u_dst); VP8PredChroma8[pred_func](v_dst); - - if (block->non_zero_ & 0x0f0000) { // chroma-U - const int16_t* const u_coeffs = coeffs + 16 * 16; - if (block->non_zero_ac_ & 0x0f0000) { - VP8TransformUV(u_coeffs, u_dst); - } else { - VP8TransformDCUV(u_coeffs, u_dst); - } - } - if (block->non_zero_ & 0xf00000) { // chroma-V - const int16_t* const v_coeffs = coeffs + 20 * 16; - if (block->non_zero_ac_ & 0xf00000) { - VP8TransformUV(v_coeffs, v_dst); - } else { - VP8TransformDCUV(v_coeffs, v_dst); - } - } + DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst); + DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst); } // stash away top samples for next block diff --git a/src/dec/vp8.c b/src/dec/vp8.c index c1cd1813..6612d137 100644 --- a/src/dec/vp8.c +++ b/src/dec/vp8.c @@ -509,8 +509,8 @@ static int ParseResiduals(VP8Decoder* const dec, int16_t* dst = 
block->coeffs_; VP8MB* const left_mb = dec->mb_info_ - 1; uint8_t tnz, lnz; - uint32_t non_zero_ac = 0; - uint32_t non_zero_dc = 0; + uint32_t non_zero_y = 0; + uint32_t non_zero_uv = 0; int x, y, ch; uint32_t out_t_nz, out_l_nz; int first; @@ -539,26 +539,27 @@ static int ParseResiduals(VP8Decoder* const dec, lnz = left_mb->nz_ & 0x0f; for (y = 0; y < 4; ++y) { int l = lnz & 1; - uint32_t nz_dc = 0, nz_ac = 0; + uint32_t nz_coeffs = 0; for (x = 0; x < 4; ++x) { const int ctx = l + (tnz & 1); const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst); l = (nz > first); tnz = (tnz >> 1) | (l << 7); - nz_dc = (nz_dc << 1) | (dst[0] != 0); - nz_ac = (nz_ac << 1) | (nz > 1); + nz_coeffs <<= 2; + if (nz > 3) nz_coeffs |= 3; + else if (nz > 1) nz_coeffs |= 2; + else if (dst[0] != 0) nz_coeffs |= 1; dst += 16; } tnz >>= 4; lnz = (lnz >> 1) | (l << 7); - non_zero_dc = (non_zero_dc << 4) | nz_dc; - non_zero_ac = (non_zero_ac << 4) | nz_ac; + non_zero_y = (non_zero_y << 8) | nz_coeffs; } out_t_nz = tnz; out_l_nz = lnz >> 4; for (ch = 0; ch < 4; ch += 2) { - uint32_t nz_dc = 0, nz_ac = 0; + uint32_t nz_coeffs = 0; tnz = mb->nz_ >> (4 + ch); lnz = left_mb->nz_ >> (4 + ch); for (y = 0; y < 2; ++y) { @@ -568,25 +569,26 @@ static int ParseResiduals(VP8Decoder* const dec, const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst); l = (nz > 0); tnz = (tnz >> 1) | (l << 3); - nz_dc = (nz_dc << 1) | (dst[0] != 0); - nz_ac = (nz_ac << 1) | (nz > 1); + nz_coeffs <<= 2; + if (nz > 3) nz_coeffs |= 3; + else if (nz > 1) nz_coeffs |= 2; + else if (dst[0] != 0) nz_coeffs |= 1; dst += 16; } tnz >>= 2; lnz = (lnz >> 1) | (l << 5); } // Note: we don't really need the per-4x4 details for U/V blocks. 
- non_zero_dc |= (nz_dc & 0x0f) << (16 + 2 * ch); - non_zero_ac |= (nz_ac & 0x0f) << (16 + 2 * ch); + non_zero_uv |= nz_coeffs << (4 * ch); out_t_nz |= (tnz << 4) << ch; out_l_nz |= (lnz & 0xf0) << ch; } mb->nz_ = out_t_nz; left_mb->nz_ = out_l_nz; - block->non_zero_ac_ = non_zero_ac; - block->non_zero_ = non_zero_ac | non_zero_dc; - return !block->non_zero_; // will be used for further optimization + block->non_zero_y_ = non_zero_y; + block->non_zero_uv_ = non_zero_uv; + return !(non_zero_y | non_zero_uv); // will be used for further optimization } //------------------------------------------------------------------------------ @@ -621,8 +623,8 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) { if (!block->is_i4x4_) { left->nz_dc_ = mb->nz_dc_ = 0; } - block->non_zero_ = 0; - block->non_zero_ac_ = 0; + block->non_zero_y_ = 0; + block->non_zero_uv_ = 0; } if (dec->filter_type_ > 0) { // store filter info diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h index ba1e819c..e2c08c4e 100644 --- a/src/dec/vp8i.h +++ b/src/dec/vp8i.h @@ -184,12 +184,15 @@ typedef struct { uint8_t is_i4x4_; // true if intra4x4 uint8_t imodes_[16]; // one 16x16 mode (#0) or sixteen 4x4 modes uint8_t uvmode_; // chroma prediction mode - // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits - // for luma (bits #15->#0), then 4 bits for chroma-u (#19->#16) and 4 bits for - // chroma-v (#23->#20), each corresponding to one 4x4 block in decoding order. - // If the bit is set, the 4x4 block contains some non-zero coefficients. - uint32_t non_zero_; - uint32_t non_zero_ac_; + // bit-wise info about the content of each sub-4x4 blocks (in decoding order). + // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to: + // code=0 -> no coefficient + // code=1 -> only DC + // code=2 -> first three coefficients are non-zero + // code=3 -> more than three coefficients are non-zero + // This allows to call specialized transform functions. 
+ uint32_t non_zero_y_; + uint32_t non_zero_uv_; } VP8MBData; // Persistent information needed by the parallel processing diff --git a/src/dsp/dec.c b/src/dsp/dec.c index 2fbd6b1a..be4c8b53 100644 --- a/src/dsp/dec.c +++ b/src/dsp/dec.c @@ -61,6 +61,14 @@ static WEBP_INLINE uint8_t clip_8b(int v) { #define STORE(x, y, v) \ dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3)) +#define STORE2(y, dc, d, c) do { \ + const int DC = (dc); \ + STORE(0, y, DC + (d)); \ + STORE(1, y, DC + (c)); \ + STORE(2, y, DC - (c)); \ + STORE(3, y, DC - (d)); \ +} while (0) + static const int kC1 = 20091 + (1 << 16); static const int kC2 = 35468; #define MUL(a, b) (((a) * (b)) >> 16) @@ -103,7 +111,21 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { dst += BPS; } } + +// Simplified transform when only in[0], in[1] and in[4] are non-zero +static void TransformAC3(const int16_t* in, uint8_t* dst) { + const int a = in[0] + 4; + const int c4 = MUL(in[4], kC2); + const int d4 = MUL(in[4], kC1); + const int c1 = MUL(in[1], kC2); + const int d1 = MUL(in[1], kC1); + STORE2(0, a + d4, d1, c1); + STORE2(1, a + c4, d1, c1); + STORE2(2, a - c4, d1, c1); + STORE2(3, a - d4, d1, c1); +} #undef MUL +#undef STORE2 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { TransformOne(in, dst); @@ -679,6 +701,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride, //------------------------------------------------------------------------------ VP8DecIdct2 VP8Transform; +VP8DecIdct VP8TransformAC3; VP8DecIdct VP8TransformUV; VP8DecIdct VP8TransformDC; VP8DecIdct VP8TransformDCUV; @@ -706,6 +729,7 @@ void VP8DspInit(void) { VP8TransformUV = TransformUV; VP8TransformDC = TransformDC; VP8TransformDCUV = TransformDCUV; + VP8TransformAC3 = TransformAC3; VP8VFilter16 = VFilter16; VP8HFilter16 = HFilter16; diff --git a/src/dsp/dec_sse2.c b/src/dsp/dec_sse2.c index 6be94678..45c10de1 100644 --- a/src/dsp/dec_sse2.c +++ b/src/dsp/dec_sse2.c @@ -20,6 +20,10 @@ extern "C" { 
#if defined(WEBP_USE_SSE2) +// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C +// one it seems => disable it by default. Uncomment the following to enable: +// #define USE_TRANSFORM_AC3 + #include <emmintrin.h> #include "../dec/vp8i.h" @@ -201,16 +205,16 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { __m128i dst0, dst1, dst2, dst3; if (do_two) { // Load eight bytes/pixels per line. - dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); - dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); - dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); - dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); + dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS)); + dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS)); + dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS)); + dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS)); } else { // Load four bytes/pixels per line. - dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]); - dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]); - dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]); - dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]); + dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS)); + dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS)); + dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS)); + dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS)); } // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); dst1 = _mm_unpacklo_epi8(dst1, zero); dst2 = _mm_unpacklo_epi8(dst2, zero); dst3 = _mm_unpacklo_epi8(dst3, zero); @@ -230,20 +234,66 @@ // Store the results. if (do_two) { // Store eight bytes/pixels per line. - _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0); - _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1); - _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2); - _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3); + _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0); + _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1); + _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2); + _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3); } else { // Store four bytes/pixels per line. 
- *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0); - *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1); - *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2); - *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3); + *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0); + *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1); + *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2); + *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3); } } } +#if defined(USE_TRANSFORM_AC3) +#define MUL(a, b) (((a) * (b)) >> 16) +static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) { + static const int kC1 = 20091 + (1 << 16); + static const int kC2 = 35468; + const __m128i A = _mm_set1_epi16(in[0] + 4); + const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2)); + const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1)); + const int c1 = MUL(in[1], kC2); + const int d1 = MUL(in[1], kC1); + const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1); + const __m128i B = _mm_adds_epi16(A, CD); + const __m128i m0 = _mm_adds_epi16(B, d4); + const __m128i m1 = _mm_adds_epi16(B, c4); + const __m128i m2 = _mm_subs_epi16(B, c4); + const __m128i m3 = _mm_subs_epi16(B, d4); + const __m128i zero = _mm_setzero_si128(); + // Load the source pixels. + __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS)); + __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS)); + __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS)); + __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS)); + // Convert to 16b. + dst0 = _mm_unpacklo_epi8(dst0, zero); + dst1 = _mm_unpacklo_epi8(dst1, zero); + dst2 = _mm_unpacklo_epi8(dst2, zero); + dst3 = _mm_unpacklo_epi8(dst3, zero); + // Add the inverse transform. + dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3)); + dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3)); + dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3)); + dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3)); + // Unsigned saturate to 8b. 
+ dst0 = _mm_packus_epi16(dst0, dst0); + dst1 = _mm_packus_epi16(dst1, dst1); + dst2 = _mm_packus_epi16(dst2, dst2); + dst3 = _mm_packus_epi16(dst3, dst3); + // Store the results. + *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0); + *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1); + *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2); + *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3); +} +#undef MUL +#endif // USE_TRANSFORM_AC3 + //------------------------------------------------------------------------------ // Loop Filter (Paragraph 15) @@ -888,6 +938,9 @@ extern void VP8DspInitSSE2(void); void VP8DspInitSSE2(void) { #if defined(WEBP_USE_SSE2) VP8Transform = TransformSSE2; +#if defined(USE_TRANSFORM_AC3) + VP8TransformAC3 = TransformAC3SSE2; +#endif VP8VFilter16 = VFilter16SSE2; VP8HFilter16 = HFilter16SSE2; diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 11d1d11f..d197eeb6 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -103,6 +103,7 @@ typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst); // when doing two transforms, coeffs is actually int16_t[2][16]. typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two); extern VP8DecIdct2 VP8Transform; +extern VP8DecIdct VP8TransformAC3; extern VP8DecIdct VP8TransformUV; extern VP8DecIdct VP8TransformDC; extern VP8DecIdct VP8TransformDCUV;