diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index cbac82d8..84423b17 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -144,6 +144,7 @@ typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out); typedef void (*VP8WHT)(const int16_t* in, int16_t* out); extern VP8Idct VP8ITransform; extern VP8Fdct VP8FTransform; +extern VP8Fdct VP8FTransform2; // performs two transforms at a time extern VP8WHT VP8FTransformWHT; // Predictions // *dst is the destination block. *top and *left can be NULL. diff --git a/src/dsp/enc.c b/src/dsp/enc.c index 7775e2df..95e63f89 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -177,6 +177,11 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { } } +static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { + VP8FTransform(src, ref, out); + VP8FTransform(src + 4, ref + 4, out + 16); +} + static void FTransformWHT(const int16_t* in, int16_t* out) { // input is 12b signed int32_t tmp[16]; @@ -704,6 +709,7 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) { VP8CHisto VP8CollectHistogram; VP8Idct VP8ITransform; VP8Fdct VP8FTransform; +VP8Fdct VP8FTransform2; VP8WHT VP8FTransformWHT; VP8Intra4Preds VP8EncPredLuma4; VP8IntraPreds VP8EncPredLuma16; @@ -740,6 +746,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) { VP8CollectHistogram = CollectHistogram; VP8ITransform = ITransform; VP8FTransform = FTransform; + VP8FTransform2 = FTransform2; VP8FTransformWHT = FTransformWHT; VP8EncPredLuma4 = Intra4Preds; VP8EncPredLuma16 = Intra16Preds; diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index 4e19a324..b874225c 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -274,136 +274,193 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, } } -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { - const __m128i zero = _mm_setzero_si128(); - const __m128i seven = _mm_set1_epi16(7); +static void FTransformPass1(const __m128i* const in01, + const __m128i* const in23, + __m128i* const out01, + __m128i* const out32) { const __m128i k937 = _mm_set1_epi32(937); const __m128i k1812 = _mm_set1_epi32(1812); - const __m128i k51000 = _mm_set1_epi32(51000); - const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); - const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, - 5352, 2217, 5352, 2217); - const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, - 2217, -5352, 2217, -5352); + const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8); const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8); const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, 2217, 5352, 2217, 5352); const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, -5352, 2217, -5352, 2217); + + // *in01 = 00 01 10 11 02 03 12 13 + // *in23 = 20 21 30 31 22 23 32 33 + const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1)); + // 00 01 10 11 03 02 13 12 + // 20 21 30 31 23 22 33 32 + const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); + const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); + // 00 01 10 11 20 21 30 31 + // 03 02 13 12 23 22 33 32 + const __m128i a01 = _mm_add_epi16(s01, s32); + const __m128i a32 = _mm_sub_epi16(s01, s32); + // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] + // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] + + const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ] + const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ] + const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); + const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); + const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); + const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); + const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9); + const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9); + const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); + const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); + const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1... + const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3 + const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); + *out01 = _mm_unpacklo_epi32(s_lo, s_hi); + *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. +} + +static void FTransformPass2(const __m128i* const v01, const __m128i* const v32, + int16_t* out) { + const __m128i zero = _mm_setzero_si128(); + const __m128i seven = _mm_set1_epi16(7); + const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, + 5352, 2217, 5352, 2217); + const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, + 2217, -5352, 2217, -5352); + const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); + const __m128i k51000 = _mm_set1_epi32(51000); + + // Same operations are done on the (0,3) and (1,2) pairs. + // a0 = v0 + v3 + // a1 = v1 + v2 + // a3 = v0 - v3 + // a2 = v1 - v2 + const __m128i a01 = _mm_add_epi16(*v01, *v32); + const __m128i a32 = _mm_sub_epi16(*v01, *v32); + const __m128i a11 = _mm_unpackhi_epi64(a01, a01); + const __m128i a22 = _mm_unpackhi_epi64(a32, a32); + const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); + + // d0 = (a0 + a1 + 7) >> 4; + // d2 = (a0 - a1 + 7) >> 4; + const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); + const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); + const __m128i d0 = _mm_srai_epi16(c0, 4); + const __m128i d2 = _mm_srai_epi16(c2, 4); + + // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) + // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) + const __m128i b23 = _mm_unpacklo_epi16(a22, a32); + const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); + const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); + const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); + const __m128i d3 = _mm_add_epi32(c3, k51000); + const __m128i e1 = _mm_srai_epi32(d1, 16); + const __m128i e3 = _mm_srai_epi32(d3, 16); + const __m128i f1 = _mm_packs_epi32(e1, e1); + const __m128i f3 = _mm_packs_epi32(e3, e3); + // f1 = f1 + (a3 != 0); + // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the + // desired (0, 1), we add one earlier through k12000_plus_one. + // -> f1 = f1 + 1 - (a3 == 0) + const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); + + const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); + const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); + _mm_storeu_si128((__m128i*)&out[0], d0_g1); + _mm_storeu_si128((__m128i*)&out[8], d2_f3); +} + +static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { + const __m128i zero = _mm_setzero_si128(); + + // Load src and convert to 16b. + const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); + const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); + const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); + const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); + const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); + const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); + const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); + const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); + // Load ref and convert to 16b. + const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); + const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); + const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); + const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); + const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); + const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); + const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); + const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); + // Compute difference. -> 00 01 02 03 00 00 00 00 + const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); + const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); + const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); + const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); + + // Unpack and shuffle + // 00 01 02 03 0 0 0 0 + // 10 11 12 13 0 0 0 0 + // 20 21 22 23 0 0 0 0 + // 30 31 32 33 0 0 0 0 + const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); + const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); __m128i v01, v32; - - // Difference between src and ref and initial transpose. - { - // Load src and convert to 16b. - const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); - const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); - const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); - const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); - const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); - const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); - const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); - const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); - // Load ref and convert to 16b. - const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); - const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); - const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); - const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); - const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); - const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); - const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); - const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); - // Compute difference. -> 00 01 02 03 00 00 00 00 - const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); - const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); - const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); - const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); - - - // Unpack and shuffle - // 00 01 02 03 0 0 0 0 - // 10 11 12 13 0 0 0 0 - // 20 21 22 23 0 0 0 0 - // 30 31 32 33 0 0 0 0 - const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); - const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); - // 00 01 10 11 02 03 12 13 - // 20 21 30 31 22 23 32 33 - const __m128i shuf01_p = - _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i shuf23_p = - _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1)); - // 00 01 10 11 03 02 13 12 - // 20 21 30 31 23 22 33 32 - const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); - const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); - // 00 01 10 11 20 21 30 31 - // 03 02 13 12 23 22 33 32 - const __m128i a01 = _mm_add_epi16(s01, s32); - const __m128i a32 = _mm_sub_epi16(s01, s32); - // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] - // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] - - const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ] - const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ] - const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); - const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); - const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); - const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); - const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9); - const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9); - const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); - const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); - const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1... - const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3 - const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); - v01 = _mm_unpacklo_epi32(s_lo, s_hi); - v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. - } + // First pass + FTransformPass1(&shuf01, &shuf23, &v01, &v32); // Second pass - { - // Same operations are done on the (0,3) and (1,2) pairs. - // a0 = v0 + v3 - // a1 = v1 + v2 - // a3 = v0 - v3 - // a2 = v1 - v2 - const __m128i a01 = _mm_add_epi16(v01, v32); - const __m128i a32 = _mm_sub_epi16(v01, v32); - const __m128i a11 = _mm_unpackhi_epi64(a01, a01); - const __m128i a22 = _mm_unpackhi_epi64(a32, a32); - const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); + FTransformPass2(&v01, &v32, out); +} - // d0 = (a0 + a1 + 7) >> 4; - // d2 = (a0 - a1 + 7) >> 4; - const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); - const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); - const __m128i d0 = _mm_srai_epi16(c0, 4); - const __m128i d2 = _mm_srai_epi16(c2, 4); +static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { + const __m128i zero = _mm_setzero_si128(); - // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) - // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) - const __m128i b23 = _mm_unpacklo_epi16(a22, a32); - const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); - const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); - const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); - const __m128i d3 = _mm_add_epi32(c3, k51000); - const __m128i e1 = _mm_srai_epi32(d1, 16); - const __m128i e3 = _mm_srai_epi32(d3, 16); - const __m128i f1 = _mm_packs_epi32(e1, e1); - const __m128i f3 = _mm_packs_epi32(e3, e3); - // f1 = f1 + (a3 != 0); - // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the - // desired (0, 1), we add one earlier through k12000_plus_one. - // -> f1 = f1 + 1 - (a3 == 0) - const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); + // Load src and convert to 16b. + const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); + const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); + const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); + const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); + const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); + const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); + const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); + const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); + // Load ref and convert to 16b. + const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); + const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); + const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); + const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); + const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); + const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); + const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); + const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); + // Compute difference. -> 00 01 02 03 00' 01' 02' 03' + const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); + const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); + const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); + const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); - const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); - const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); - _mm_storeu_si128((__m128i*)&out[0], d0_g1); - _mm_storeu_si128((__m128i*)&out[8], d2_f3); - } + // Unpack and shuffle + // 00 01 02 03 0 0 0 0 + // 10 11 12 13 0 0 0 0 + // 20 21 22 23 0 0 0 0 + // 30 31 32 33 0 0 0 0 + const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1); + const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3); + const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1); + const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3); + __m128i v01l, v32l; + __m128i v01h, v32h; + + // First pass + FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l); + FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h); + + // Second pass + FTransformPass2(&v01l, &v32l, out + 0); + FTransformPass2(&v01h, &v32h, out + 16); } static void FTransformWHT(const int16_t* in, int16_t* out) { @@ -1392,6 +1449,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) { VP8EncQuantizeBlockWHT = QuantizeBlockWHT; VP8ITransform = ITransform; VP8FTransform = FTransform; + VP8FTransform2 = FTransform2; VP8FTransformWHT = FTransformWHT; VP8SSE16x16 = SSE16x16; VP8SSE16x8 = SSE16x8; diff --git a/src/enc/quant.c b/src/enc/quant.c index 17b78ec4..002c326b 100644 --- a/src/enc/quant.c +++ b/src/enc/quant.c @@ -723,8 +723,8 @@ static int ReconstructIntra16(VP8EncIterator* const it, int n; int16_t tmp[16][16], dc_tmp[16]; - for (n = 0; n < 16; ++n) { - VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]); + for (n = 0; n < 16; n += 2) { + VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]); } VP8FTransformWHT(tmp[0], dc_tmp); nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24; @@ -797,8 +797,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, int n; int16_t tmp[8][16]; - for (n = 0; n < 8; ++n) { - VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]); + for (n = 0; n < 8; n += 2) { + VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]); } if (DO_TRELLIS_UV && it->do_trellis_) { int ch, x, y;