mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 10:25:46 +01:00 
			
		
		
		
	Introduce FTransform2 to perform two transforms at a time.
With this change, FTransform's share of total CPU time drops from ~12.0% to ~11.5%. Change-Id: Ibcb23155324f4fd8b235563f80668531c781f624
This commit is contained in:
		| @@ -144,6 +144,7 @@ typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out); | ||||
| typedef void (*VP8WHT)(const int16_t* in, int16_t* out); | ||||
| extern VP8Idct VP8ITransform; | ||||
| extern VP8Fdct VP8FTransform; | ||||
| extern VP8Fdct VP8FTransform2;   // performs two transforms at a time | ||||
| extern VP8WHT VP8FTransformWHT; | ||||
| // Predictions | ||||
| // *dst is the destination block. *top and *left can be NULL. | ||||
|   | ||||
| @@ -177,6 +177,11 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { | ||||
|   } | ||||
| } | ||||
|  | ||||
| static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { | ||||
|   VP8FTransform(src, ref, out); | ||||
|   VP8FTransform(src + 4, ref + 4, out + 16); | ||||
| } | ||||
|  | ||||
| static void FTransformWHT(const int16_t* in, int16_t* out) { | ||||
|   // input is 12b signed | ||||
|   int32_t tmp[16]; | ||||
| @@ -704,6 +709,7 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) { | ||||
| VP8CHisto VP8CollectHistogram; | ||||
| VP8Idct VP8ITransform; | ||||
| VP8Fdct VP8FTransform; | ||||
| VP8Fdct VP8FTransform2; | ||||
| VP8WHT VP8FTransformWHT; | ||||
| VP8Intra4Preds VP8EncPredLuma4; | ||||
| VP8IntraPreds VP8EncPredLuma16; | ||||
| @@ -740,6 +746,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) { | ||||
|   VP8CollectHistogram = CollectHistogram; | ||||
|   VP8ITransform = ITransform; | ||||
|   VP8FTransform = FTransform; | ||||
|   VP8FTransform2 = FTransform2; | ||||
|   VP8FTransformWHT = FTransformWHT; | ||||
|   VP8EncPredLuma4 = Intra4Preds; | ||||
|   VP8EncPredLuma16 = Intra16Preds; | ||||
|   | ||||
| @@ -274,136 +274,193 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, | ||||
|   } | ||||
| } | ||||
|  | ||||
| static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { | ||||
|   const __m128i zero = _mm_setzero_si128(); | ||||
|   const __m128i seven = _mm_set1_epi16(7); | ||||
| static void FTransformPass1(const __m128i* const in01, | ||||
|                             const __m128i* const in23, | ||||
|                             __m128i* const out01, | ||||
|                             __m128i* const out32) { | ||||
|   const __m128i k937 = _mm_set1_epi32(937); | ||||
|   const __m128i k1812 = _mm_set1_epi32(1812); | ||||
|   const __m128i k51000 = _mm_set1_epi32(51000); | ||||
|   const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); | ||||
|   const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217, | ||||
|                                            5352,  2217, 5352,  2217); | ||||
|   const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, | ||||
|                                            2217, -5352, 2217, -5352); | ||||
|  | ||||
|   const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8); | ||||
|   const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8); | ||||
|   const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, | ||||
|                                             2217, 5352, 2217, 5352); | ||||
|   const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, | ||||
|                                             -5352, 2217, -5352, 2217); | ||||
|  | ||||
|   // *in01 = 00 01 10 11 02 03 12 13 | ||||
|   // *in23 = 20 21 30 31 22 23 32 33 | ||||
|   const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1)); | ||||
|   const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1)); | ||||
|   // 00 01 10 11 03 02 13 12 | ||||
|   // 20 21 30 31 23 22 33 32 | ||||
|   const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); | ||||
|   const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); | ||||
|   // 00 01 10 11 20 21 30 31 | ||||
|   // 03 02 13 12 23 22 33 32 | ||||
|   const __m128i a01 = _mm_add_epi16(s01, s32); | ||||
|   const __m128i a32 = _mm_sub_epi16(s01, s32); | ||||
|   // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] | ||||
|   // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] | ||||
|  | ||||
|   const __m128i tmp0   = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ] | ||||
|   const __m128i tmp2   = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ] | ||||
|   const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); | ||||
|   const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); | ||||
|   const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); | ||||
|   const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); | ||||
|   const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9); | ||||
|   const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9); | ||||
|   const __m128i s03    = _mm_packs_epi32(tmp0, tmp2); | ||||
|   const __m128i s12    = _mm_packs_epi32(tmp1, tmp3); | ||||
|   const __m128i s_lo   = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1... | ||||
|   const __m128i s_hi   = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3 | ||||
|   const __m128i v23    = _mm_unpackhi_epi32(s_lo, s_hi); | ||||
|   *out01 = _mm_unpacklo_epi32(s_lo, s_hi); | ||||
|   *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2.. | ||||
| } | ||||
|  | ||||
| static void FTransformPass2(const __m128i* const v01, const __m128i* const v32, | ||||
|                             int16_t* out) { | ||||
|   const __m128i zero = _mm_setzero_si128(); | ||||
|   const __m128i seven = _mm_set1_epi16(7); | ||||
|   const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217, | ||||
|                                            5352,  2217, 5352,  2217); | ||||
|   const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, | ||||
|                                            2217, -5352, 2217, -5352); | ||||
|   const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); | ||||
|   const __m128i k51000 = _mm_set1_epi32(51000); | ||||
|  | ||||
|   // Same operations are done on the (0,3) and (1,2) pairs. | ||||
|   // a0 = v0 + v3 | ||||
|   // a1 = v1 + v2 | ||||
|   // a3 = v0 - v3 | ||||
|   // a2 = v1 - v2 | ||||
|   const __m128i a01 = _mm_add_epi16(*v01, *v32); | ||||
|   const __m128i a32 = _mm_sub_epi16(*v01, *v32); | ||||
|   const __m128i a11 = _mm_unpackhi_epi64(a01, a01); | ||||
|   const __m128i a22 = _mm_unpackhi_epi64(a32, a32); | ||||
|   const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); | ||||
|  | ||||
|   // d0 = (a0 + a1 + 7) >> 4; | ||||
|   // d2 = (a0 - a1 + 7) >> 4; | ||||
|   const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); | ||||
|   const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); | ||||
|   const __m128i d0 = _mm_srai_epi16(c0, 4); | ||||
|   const __m128i d2 = _mm_srai_epi16(c2, 4); | ||||
|  | ||||
|   // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) | ||||
|   // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) | ||||
|   const __m128i b23 = _mm_unpacklo_epi16(a22, a32); | ||||
|   const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); | ||||
|   const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); | ||||
|   const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); | ||||
|   const __m128i d3 = _mm_add_epi32(c3, k51000); | ||||
|   const __m128i e1 = _mm_srai_epi32(d1, 16); | ||||
|   const __m128i e3 = _mm_srai_epi32(d3, 16); | ||||
|   const __m128i f1 = _mm_packs_epi32(e1, e1); | ||||
|   const __m128i f3 = _mm_packs_epi32(e3, e3); | ||||
|   // f1 = f1 + (a3 != 0); | ||||
|   // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the | ||||
|   // desired (0, 1), we add one earlier through k12000_plus_one. | ||||
|   // -> f1 = f1 + 1 - (a3 == 0) | ||||
|   const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); | ||||
|  | ||||
|   const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); | ||||
|   const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); | ||||
|   _mm_storeu_si128((__m128i*)&out[0], d0_g1); | ||||
|   _mm_storeu_si128((__m128i*)&out[8], d2_f3); | ||||
| } | ||||
|  | ||||
| static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { | ||||
|   const __m128i zero = _mm_setzero_si128(); | ||||
|  | ||||
|   // Load src and convert to 16b. | ||||
|   const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); | ||||
|   const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); | ||||
|   const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); | ||||
|   const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); | ||||
|   const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); | ||||
|   const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); | ||||
|   const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); | ||||
|   const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); | ||||
|   // Load ref and convert to 16b. | ||||
|   const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); | ||||
|   const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); | ||||
|   const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); | ||||
|   const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); | ||||
|   const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); | ||||
|   const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); | ||||
|   const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); | ||||
|   const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); | ||||
|   // Compute difference. -> 00 01 02 03 00 00 00 00 | ||||
|   const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); | ||||
|   const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); | ||||
|   const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); | ||||
|   const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); | ||||
|  | ||||
|   // Unpack and shuffle | ||||
|   // 00 01 02 03   0 0 0 0 | ||||
|   // 10 11 12 13   0 0 0 0 | ||||
|   // 20 21 22 23   0 0 0 0 | ||||
|   // 30 31 32 33   0 0 0 0 | ||||
|   const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); | ||||
|   const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); | ||||
|   __m128i v01, v32; | ||||
|  | ||||
|  | ||||
|   // Difference between src and ref and initial transpose. | ||||
|   { | ||||
|     // Load src and convert to 16b. | ||||
|     const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); | ||||
|     const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); | ||||
|     const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); | ||||
|     const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); | ||||
|     const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); | ||||
|     const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); | ||||
|     const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); | ||||
|     const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); | ||||
|     // Load ref and convert to 16b. | ||||
|     const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); | ||||
|     const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); | ||||
|     const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); | ||||
|     const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); | ||||
|     const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); | ||||
|     const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); | ||||
|     const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); | ||||
|     const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); | ||||
|     // Compute difference. -> 00 01 02 03 00 00 00 00 | ||||
|     const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); | ||||
|     const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); | ||||
|     const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); | ||||
|     const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); | ||||
|  | ||||
|  | ||||
|     // Unpack and shuffle | ||||
|     // 00 01 02 03   0 0 0 0 | ||||
|     // 10 11 12 13   0 0 0 0 | ||||
|     // 20 21 22 23   0 0 0 0 | ||||
|     // 30 31 32 33   0 0 0 0 | ||||
|     const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); | ||||
|     const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); | ||||
|     // 00 01 10 11 02 03 12 13 | ||||
|     // 20 21 30 31 22 23 32 33 | ||||
|     const __m128i shuf01_p = | ||||
|         _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1)); | ||||
|     const __m128i shuf23_p = | ||||
|         _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1)); | ||||
|     // 00 01 10 11 03 02 13 12 | ||||
|     // 20 21 30 31 23 22 33 32 | ||||
|     const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); | ||||
|     const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); | ||||
|     // 00 01 10 11 20 21 30 31 | ||||
|     // 03 02 13 12 23 22 33 32 | ||||
|     const __m128i a01 = _mm_add_epi16(s01, s32); | ||||
|     const __m128i a32 = _mm_sub_epi16(s01, s32); | ||||
|     // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] | ||||
|     // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] | ||||
|  | ||||
|     const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ] | ||||
|     const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ] | ||||
|     const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); | ||||
|     const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); | ||||
|     const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); | ||||
|     const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); | ||||
|     const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9); | ||||
|     const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9); | ||||
|     const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); | ||||
|     const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); | ||||
|     const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1... | ||||
|     const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3 | ||||
|     const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); | ||||
|     v01 = _mm_unpacklo_epi32(s_lo, s_hi); | ||||
|     v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2.. | ||||
|   } | ||||
|   // First pass | ||||
|   FTransformPass1(&shuf01, &shuf23, &v01, &v32); | ||||
|  | ||||
|   // Second pass | ||||
|   { | ||||
|     // Same operations are done on the (0,3) and (1,2) pairs. | ||||
|     // a0 = v0 + v3 | ||||
|     // a1 = v1 + v2 | ||||
|     // a3 = v0 - v3 | ||||
|     // a2 = v1 - v2 | ||||
|     const __m128i a01 = _mm_add_epi16(v01, v32); | ||||
|     const __m128i a32 = _mm_sub_epi16(v01, v32); | ||||
|     const __m128i a11 = _mm_unpackhi_epi64(a01, a01); | ||||
|     const __m128i a22 = _mm_unpackhi_epi64(a32, a32); | ||||
|     const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); | ||||
|   FTransformPass2(&v01, &v32, out); | ||||
| } | ||||
|  | ||||
|     // d0 = (a0 + a1 + 7) >> 4; | ||||
|     // d2 = (a0 - a1 + 7) >> 4; | ||||
|     const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); | ||||
|     const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); | ||||
|     const __m128i d0 = _mm_srai_epi16(c0, 4); | ||||
|     const __m128i d2 = _mm_srai_epi16(c2, 4); | ||||
| static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { | ||||
|   const __m128i zero = _mm_setzero_si128(); | ||||
|  | ||||
|     // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) | ||||
|     // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) | ||||
|     const __m128i b23 = _mm_unpacklo_epi16(a22, a32); | ||||
|     const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); | ||||
|     const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); | ||||
|     const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); | ||||
|     const __m128i d3 = _mm_add_epi32(c3, k51000); | ||||
|     const __m128i e1 = _mm_srai_epi32(d1, 16); | ||||
|     const __m128i e3 = _mm_srai_epi32(d3, 16); | ||||
|     const __m128i f1 = _mm_packs_epi32(e1, e1); | ||||
|     const __m128i f3 = _mm_packs_epi32(e3, e3); | ||||
|     // f1 = f1 + (a3 != 0); | ||||
|     // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the | ||||
|     // desired (0, 1), we add one earlier through k12000_plus_one. | ||||
|     // -> f1 = f1 + 1 - (a3 == 0) | ||||
|     const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); | ||||
|   // Load src and convert to 16b. | ||||
|   const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); | ||||
|   const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); | ||||
|   const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); | ||||
|   const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); | ||||
|   const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); | ||||
|   const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); | ||||
|   const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); | ||||
|   const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); | ||||
|   // Load ref and convert to 16b. | ||||
|   const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); | ||||
|   const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); | ||||
|   const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); | ||||
|   const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); | ||||
|   const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); | ||||
|   const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); | ||||
|   const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); | ||||
|   const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); | ||||
|   // Compute difference. -> 00 01 02 03  00' 01' 02' 03' | ||||
|   const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); | ||||
|   const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); | ||||
|   const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); | ||||
|   const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); | ||||
|  | ||||
|     const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); | ||||
|     const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); | ||||
|     _mm_storeu_si128((__m128i*)&out[0], d0_g1); | ||||
|     _mm_storeu_si128((__m128i*)&out[8], d2_f3); | ||||
|   } | ||||
|   // Unpack and shuffle | ||||
|   // 00 01 02 03   00' 01' 02' 03' | ||||
|   // 10 11 12 13   10' 11' 12' 13' | ||||
|   // 20 21 22 23   20' 21' 22' 23' | ||||
|   // 30 31 32 33   30' 31' 32' 33' | ||||
|   const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1); | ||||
|   const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3); | ||||
|   const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1); | ||||
|   const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3); | ||||
|   __m128i v01l, v32l; | ||||
|   __m128i v01h, v32h; | ||||
|  | ||||
|   // First pass | ||||
|   FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l); | ||||
|   FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h); | ||||
|  | ||||
|   // Second pass | ||||
|   FTransformPass2(&v01l, &v32l, out + 0); | ||||
|   FTransformPass2(&v01h, &v32h, out + 16); | ||||
| } | ||||
|  | ||||
| static void FTransformWHT(const int16_t* in, int16_t* out) { | ||||
| @@ -1392,6 +1449,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) { | ||||
|   VP8EncQuantizeBlockWHT = QuantizeBlockWHT; | ||||
|   VP8ITransform = ITransform; | ||||
|   VP8FTransform = FTransform; | ||||
|   VP8FTransform2 = FTransform2; | ||||
|   VP8FTransformWHT = FTransformWHT; | ||||
|   VP8SSE16x16 = SSE16x16; | ||||
|   VP8SSE16x8 = SSE16x8; | ||||
|   | ||||
| @@ -723,8 +723,8 @@ static int ReconstructIntra16(VP8EncIterator* const it, | ||||
|   int n; | ||||
|   int16_t tmp[16][16], dc_tmp[16]; | ||||
|  | ||||
|   for (n = 0; n < 16; ++n) { | ||||
|     VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]); | ||||
|   for (n = 0; n < 16; n += 2) { | ||||
|     VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]); | ||||
|   } | ||||
|   VP8FTransformWHT(tmp[0], dc_tmp); | ||||
|   nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24; | ||||
| @@ -797,8 +797,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, | ||||
|   int n; | ||||
|   int16_t tmp[8][16]; | ||||
|  | ||||
|   for (n = 0; n < 8; ++n) { | ||||
|     VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]); | ||||
|   for (n = 0; n < 8; n += 2) { | ||||
|     VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]); | ||||
|   } | ||||
|   if (DO_TRELLIS_UV && it->do_trellis_) { | ||||
|     int ch, x, y; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user