introduce FTransform2 to perform two transforms at a time.

FTransform goes from ~12.0% to 11.5% of total CPU time.

Change-Id: Ibcb23155324f4fd8b235563f80668531c781f624
2025-07-18 23:09:52 +02:00 · 2015-05-18 21:06:15 -07:00
parent aa6065aedd
commit ac76801159
4 changed files with 185 additions and 119 deletions
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -144,6 +144,7 @@ typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
 typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
 extern VP8Idct VP8ITransform;
 extern VP8Fdct VP8FTransform;
 extern VP8Fdct VP8FTransform2;   // performs two transforms at a time
 extern VP8WHT VP8FTransformWHT;
 // Predictions
 // *dst is the destination block. *top and *left can be NULL.
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -177,6 +177,11 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  }
 }
 // Plain-C version of the "two transforms at a time" entry point.
 // Runs the single-block forward transform twice through the VP8FTransform
 // function pointer: once on (src, ref, out), and once with src/ref advanced
 // by 4 and out advanced by 16 int16_t values.
 static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  // First block.
  VP8FTransform(src, ref, out);
  // Second block: 4 bytes further in src/ref, 16 coefficients further in out.
  VP8FTransform(src + 4, ref + 4, out + 16);
 }
 static void FTransformWHT(const int16_t* in, int16_t* out) {
  // input is 12b signed
  int32_t tmp[16];
@ -704,6 +709,7 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) {
 VP8CHisto VP8CollectHistogram;
 VP8Idct VP8ITransform;
 VP8Fdct VP8FTransform;
 VP8Fdct VP8FTransform2;
 VP8WHT VP8FTransformWHT;
 VP8Intra4Preds VP8EncPredLuma4;
 VP8IntraPreds VP8EncPredLuma16;
@ -740,6 +746,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
  VP8CollectHistogram = CollectHistogram;
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
  VP8FTransform2 = FTransform2;
  VP8FTransformWHT = FTransformWHT;
  VP8EncPredLuma4 = Intra4Preds;
  VP8EncPredLuma16 = Intra16Preds;
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@ -274,136 +274,193 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
  }
 }
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransformPass1(const __m128i* const in01,
-  const __m128i zero = _mm_setzero_si128();
+                            const __m128i* const in23,
-  const __m128i seven = _mm_set1_epi16(7);
+                            __m128i* const out01,
                            __m128i* const out32) {
  const __m128i k937 = _mm_set1_epi32(937);
  const __m128i k1812 = _mm_set1_epi32(1812);
-  const __m128i k51000 = _mm_set1_epi32(51000);
+
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);
  const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
  const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
  const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
                                            2217, 5352, 2217, 5352);
  const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
                                            -5352, 2217, -5352, 2217);
  // *in01 = 00 01 10 11 02 03 12 13
  // *in23 = 20 21 30 31 22 23 32 33
  const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1));
  // 00 01 10 11 03 02 13 12
  // 20 21 30 31 23 22 33 32
  const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
  const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
  // 00 01 10 11 20 21 30 31
  // 03 02 13 12 23 22 33 32
  const __m128i a01 = _mm_add_epi16(s01, s32);
  const __m128i a32 = _mm_sub_epi16(s01, s32);
  // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
  // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
  const __m128i tmp0   = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
  const __m128i tmp2   = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
  const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
  const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
  const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
  const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
  const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
  const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
  const __m128i s03    = _mm_packs_epi32(tmp0, tmp2);
  const __m128i s12    = _mm_packs_epi32(tmp1, tmp3);
  const __m128i s_lo   = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
  const __m128i s_hi   = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
  const __m128i v23    = _mm_unpackhi_epi32(s_lo, s_hi);
  *out01 = _mm_unpacklo_epi32(s_lo, s_hi);
  *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
 }
 // Second pass of the forward transform for one 4x4 block.
 // Inputs are the two registers filled in by FTransformPass1():
 //   *v01 = [ v0 | v1 ]  and  *v32 = [ v3 | v2 ]  (four 16b values per half).
 // Output: 16 coefficients written to out[0..15] with two unaligned 128-bit
 // stores, laid out as out[0..3]=d0, out[4..7]=g1, out[8..11]=d2,
 // out[12..15]=f3.
 static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
                            int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k51000 = _mm_set1_epi32(51000);
  // Same operations are done on the (0,3) and (1,2) pairs.
  // a0 = v0 + v3
  // a1 = v1 + v2
  // a3 = v0 - v3
  // a2 = v1 - v2
  const __m128i a01 = _mm_add_epi16(*v01, *v32);
  const __m128i a32 = _mm_sub_epi16(*v01, *v32);
  // Copy the upper 64 bits (a1, resp. a2) into the lower 64 bits so they line
  // up lane-for-lane with a0, resp. a3.
  const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
  const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
  const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
  // d0 = (a0 + a1 + 7) >> 4;
  // d2 = (a0 - a1 + 7) >> 4;
  // Only the lower four lanes of c0/c2/d0/d2 are used by the stores below.
  const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
  const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
  const __m128i d0 = _mm_srai_epi16(c0, 4);
  const __m128i d2 = _mm_srai_epi16(c2, 4);
  // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
  // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
  // Interleave a2 and a3 as 16b pairs so that each _mm_madd_epi16 produces one
  // 32b two-term sum per output coefficient.
  const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
  const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
  const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
  const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
  const __m128i d3 = _mm_add_epi32(c3, k51000);
  const __m128i e1 = _mm_srai_epi32(d1, 16);
  const __m128i e3 = _mm_srai_epi32(d3, 16);
  // Narrow the 32b results back to 16b (signed saturation); the same source is
  // passed twice, so both halves of f1/f3 hold the same four values.
  const __m128i f1 = _mm_packs_epi32(e1, e1);
  const __m128i f3 = _mm_packs_epi32(e3, e3);
  // f1 = f1 + (a3 != 0);
  // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
  // desired (0, 1), we add one earlier through k12000_plus_one.
  // -> f1 = f1 + 1 - (a3 == 0)
  const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
  // Pair up the lower halves: [d0 | g1] and [d2 | f3], then store 8
  // coefficients at a time (unaligned stores, no alignment required of out).
  const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
  const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
  _mm_storeu_si128((__m128i*)&out[0], d0_g1);
  _mm_storeu_si128((__m128i*)&out[8], d2_f3);
 }
 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  // Load src and convert to 16b.
  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
  const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
  const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
  const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
  const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
  const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
  const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
  const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
  // Load ref and convert to 16b.
  const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
  const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
  const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
  const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
  const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
  const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
  const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
  const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
  // Compute difference. -> 00 01 02 03 00 00 00 00
  const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
  const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
  const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
  const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
  // Unpack and shuffle
  // 00 01 02 03   0 0 0 0
  // 10 11 12 13   0 0 0 0
  // 20 21 22 23   0 0 0 0
  // 30 31 32 33   0 0 0 0
  const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
  const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
  __m128i v01, v32;
-
+  // First pass
-  // Difference between src and ref and initial transpose.
+  FTransformPass1(&shuf01, &shuf23, &v01, &v32);
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference. -> 00 01 02 03 00 00 00 00
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
    // Unpack and shuffle
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
    const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
    const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
    // 00 01 10 11 02 03 12 13
    // 20 21 30 31 22 23 32 33
    const __m128i shuf01_p =
        _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i shuf23_p =
        _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
    // 00 01 10 11 03 02 13 12
    // 20 21 30 31 23 22 33 32
    const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
    const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
    // 00 01 10 11 20 21 30 31
    // 03 02 13 12 23 22 33 32
    const __m128i a01 = _mm_add_epi16(s01, s32);
    const __m128i a32 = _mm_sub_epi16(s01, s32);
    // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
    // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
    const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
    const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
    const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
    const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
    const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
    const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
    const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
    const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
    const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
    const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
    const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
    const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
    const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
    v01 = _mm_unpacklo_epi32(s_lo, s_hi);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
  }
  // Second pass
-  {
+  FTransformPass2(&v01, &v32, out);
-    // Same operations are done on the (0,3) and (1,2) pairs.
+}
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
    const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
-    // d0 = (a0 + a1 + 7) >> 4;
+static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
-    // d2 = (a0 - a1 + 7) >> 4;
+  const __m128i zero = _mm_setzero_si128();
    const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
    const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);
-    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+  // Load src and convert to 16b.
-    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
+  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
-    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
+  const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
-    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+  const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
-    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+  const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
-    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
+  const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
-    const __m128i d3 = _mm_add_epi32(c3, k51000);
+  const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
-    const __m128i e1 = _mm_srai_epi32(d1, 16);
+  const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
-    const __m128i e3 = _mm_srai_epi32(d3, 16);
+  const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
-    const __m128i f1 = _mm_packs_epi32(e1, e1);
+  // Load ref and convert to 16b.
-    const __m128i f3 = _mm_packs_epi32(e3, e3);
+  const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
-    // f1 = f1 + (a3 != 0);
+  const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
-    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+  const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
-    // desired (0, 1), we add one earlier through k12000_plus_one.
+  const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
-    // -> f1 = f1 + 1 - (a3 == 0)
+  const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
-    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
+  const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
  const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
  const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
  // Compute difference. -> 00 01 02 03  00' 01' 02' 03'
  const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
  const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
  const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
  const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
-    const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
+  // Unpack and shuffle
-    const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
+  // 00 01 02 03   0 0 0 0
-    _mm_storeu_si128((__m128i*)&out[0], d0_g1);
+  // 10 11 12 13   0 0 0 0
-    _mm_storeu_si128((__m128i*)&out[8], d2_f3);
+  // 20 21 22 23   0 0 0 0
-  }
+  // 30 31 32 33   0 0 0 0
  const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1);
  const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3);
  const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1);
  const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3);
  __m128i v01l, v32l;
  __m128i v01h, v32h;
  // First pass
  FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
  FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
  // Second pass
  FTransformPass2(&v01l, &v32l, out + 0);
  FTransformPass2(&v01h, &v32h, out + 16);
 }
 static void FTransformWHT(const int16_t* in, int16_t* out) {
@ -1392,6 +1449,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
  VP8FTransform2 = FTransform2;
  VP8FTransformWHT = FTransformWHT;
  VP8SSE16x16 = SSE16x16;
  VP8SSE16x8 = SSE16x8;
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@ -723,8 +723,8 @@ static int ReconstructIntra16(VP8EncIterator* const it,
  int n;
  int16_t tmp[16][16], dc_tmp[16];
-  for (n = 0; n < 16; ++n) {
+  for (n = 0; n < 16; n += 2) {
-    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
+    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
@ -797,8 +797,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
  int n;
  int16_t tmp[8][16];
-  for (n = 0; n < 8; ++n) {
+  for (n = 0; n < 8; n += 2) {
-    VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
+    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;