mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 18:35:41 +01:00 
			
		
		
		
	faster non-transposing SSE2 4x4 FTransform
1-2% faster. Uses pmaddwd instead of transpose + pmullw. Can possibly be simplified further. Change-Id: I420e148816c4c6ab5e2080c9b1719dbbe6762d4e
This commit is contained in:
		| @@ -21,6 +21,35 @@ extern "C" { | |||||||
|  |  | ||||||
| #include "../enc/vp8enci.h" | #include "../enc/vp8enci.h" | ||||||
|  |  | ||||||
|  | //------------------------------------------------------------------------------ | ||||||
|  | // Quite useful function for debugging. Left here (compiled out) for convenience. | ||||||
|  |  | ||||||
|  | #if 0 | ||||||
|  | #include <stdio.h> | ||||||
|  | // Debug helper: prints the contents of register 'r' to stdout in hexadecimal, | ||||||
|  | // prefixed by 'name' and a tab, and followed by a newline. | ||||||
|  | // 'size' is the lane width in bits: 8, 16 or 32 print sixteen, eight or four | ||||||
|  | // lanes respectively; any other value prints the register as two 64-bit lanes. | ||||||
|  | // Lanes are printed in increasing array-index order of the union view below. | ||||||
|  | static void PrintReg(const __m128i r, const char* const name, int size) { | ||||||
|  |   int n; | ||||||
|  |   // The union gives access to the same 128 bits as arrays of narrower lanes. | ||||||
|  |   union { | ||||||
|  |     __m128i r; | ||||||
|  |     uint8_t i8[16]; | ||||||
|  |     uint16_t i16[8]; | ||||||
|  |     uint32_t i32[4]; | ||||||
|  |     uint64_t i64[2]; | ||||||
|  |   } tmp; | ||||||
|  |   tmp.r = r; | ||||||
|  |   printf("%s\t: ", name); | ||||||
|  |   if (size == 8) { | ||||||
|  |     for (n = 0; n < 16; ++n) printf("%.2x ", tmp.i8[n]); | ||||||
|  |   } else if (size == 16) { | ||||||
|  |     for (n = 0; n < 8; ++n) printf("%.4x ", tmp.i16[n]); | ||||||
|  |   } else if (size == 32) { | ||||||
|  |     for (n = 0; n < 4; ++n) printf("%.8x ", tmp.i32[n]); | ||||||
|  |   } else { | ||||||
|  |     // uint64_t is not 'unsigned long' on every target (e.g. where long is | ||||||
|  |     // 32 bits), so cast to match the 'll' length modifier explicitly. | ||||||
|  |     for (n = 0; n < 2; ++n) { | ||||||
|  |       printf("%.16llx ", (unsigned long long)tmp.i64[n]); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   printf("\n"); | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
| //------------------------------------------------------------------------------ | //------------------------------------------------------------------------------ | ||||||
| // Compute susceptibility based on DCT-coeff histograms: | // Compute susceptibility based on DCT-coeff histograms: | ||||||
| // the higher, the "easier" the macroblock is to compress. | // the higher, the "easier" the macroblock is to compress. | ||||||
| @@ -303,9 +332,15 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, | |||||||
|                                            5352,  2217, 5352,  2217); |                                            5352,  2217, 5352,  2217); | ||||||
|   const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, |   const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, | ||||||
|                                            2217, -5352, 2217, -5352); |                                            2217, -5352, 2217, -5352); | ||||||
|  |   const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8); | ||||||
|  |   const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8); | ||||||
|  |   const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, | ||||||
|  |                                             2217, 5352, 2217, 5352); | ||||||
|  |   const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, | ||||||
|  |                                             -5352, 2217, -5352, 2217); | ||||||
|   __m128i v01, v32; |   __m128i v01, v32; | ||||||
|  |  | ||||||
|  |  | ||||||
|   // Difference between src and ref and initial transpose. |   // Difference between src and ref and initial transpose. | ||||||
|   { |   { | ||||||
|     // Load src and convert to 16b. |     // Load src and convert to 16b. | ||||||
| @@ -326,73 +361,50 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, | |||||||
|     const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); |     const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); | ||||||
|     const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); |     const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); | ||||||
|     const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); |     const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); | ||||||
|     // Compute difference. |     // Compute difference. -> 00 01 02 03 00 00 00 00 | ||||||
|     const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); |     const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); | ||||||
|     const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); |     const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); | ||||||
|     const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); |     const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); | ||||||
|     const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); |     const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); | ||||||
|  |  | ||||||
|     // Transpose. |  | ||||||
|  |     // Unpack and shuffle | ||||||
|     // 00 01 02 03   0 0 0 0 |     // 00 01 02 03   0 0 0 0 | ||||||
|     // 10 11 12 13   0 0 0 0 |     // 10 11 12 13   0 0 0 0 | ||||||
|     // 20 21 22 23   0 0 0 0 |     // 20 21 22 23   0 0 0 0 | ||||||
|     // 30 31 32 33   0 0 0 0 |     // 30 31 32 33   0 0 0 0 | ||||||
|     const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1); |     const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); | ||||||
|     const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3); |     const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); | ||||||
|     // 00 10 01 11   02 12 03 13 |     // 00 01 10 11 02 03 12 13 | ||||||
|     // 20 30 21 31   22 32 23 33 |     // 20 21 30 31 22 23 32 33 | ||||||
|     const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); |     const __m128i shuf01_p = _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1)); | ||||||
|     v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); |     const __m128i shuf23_p = _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1)); | ||||||
|     v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); |     // 00 01 10 11 03 02 13 12 | ||||||
|     // a02 a12 a22 a32   a03 a13 a23 a33 |     // 20 21 30 31 23 22 33 32 | ||||||
|     // a00 a10 a20 a30   a01 a11 a21 a31 |     const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); | ||||||
|     // a03 a13 a23 a33   a02 a12 a22 a32 |     const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); | ||||||
|   } |     // 00 01 10 11 20 21 30 31 | ||||||
|  |     // 03 02 13 12 23 22 33 32 | ||||||
|  |     const __m128i a01 = _mm_add_epi16(s01, s32); | ||||||
|  |     const __m128i a32 = _mm_sub_epi16(s01, s32); | ||||||
|  |     // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] | ||||||
|  |     // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] | ||||||
|  |  | ||||||
|   // First pass and subsequent transpose. |     const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ] | ||||||
|   { |     const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ] | ||||||
|     // Same operations are done on the (0,3) and (1,2) pairs. |     const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); | ||||||
|     // b0 = (a0 + a3) |     const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); | ||||||
|     // b1 = (a1 + a2) |     const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); | ||||||
|     // b3 = (a0 - a3) |     const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); | ||||||
|     // b2 = (a1 - a2) |     const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9); | ||||||
|     const __m128i a01 = _mm_add_epi16(v01, v32); |     const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9); | ||||||
|     const __m128i a32 = _mm_sub_epi16(v01, v32); |     const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); | ||||||
|     const __m128i b11 = _mm_unpackhi_epi64(a01, a01); |     const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); | ||||||
|     const __m128i b22 = _mm_unpackhi_epi64(a32, a32); |     const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1... | ||||||
|  |     const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3 | ||||||
|     // e0 = (b0 + b1) |     const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); | ||||||
|     // e2 = (b0 - b1) |     v01 = _mm_unpacklo_epi32(s_lo, s_hi); | ||||||
|     const __m128i e0 = _mm_add_epi16(a01, b11); |     v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2.. | ||||||
|     const __m128i e2 = _mm_sub_epi16(a01, b11); |  | ||||||
|     // e02 = [e0 | e2] << 3 |  | ||||||
|     const __m128i e0_e2 = _mm_unpacklo_epi64(e0, e2); |  | ||||||
|     const __m128i e02 = _mm_slli_epi16(e0_e2, 3); |  | ||||||
|  |  | ||||||
|     // e1 = (b3 * 5352 + b2 * 2217 + 1812) >> 9 |  | ||||||
|     // e3 = (b3 * 2217 - b2 * 5352 + 937) >> 9 |  | ||||||
|     const __m128i b23 = _mm_unpacklo_epi16(b22, a32); |  | ||||||
|     const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); |  | ||||||
|     const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); |  | ||||||
|     const __m128i d1 = _mm_add_epi32(c1, k1812); |  | ||||||
|     const __m128i d3 = _mm_add_epi32(c3, k937); |  | ||||||
|     const __m128i e1 = _mm_srai_epi32(d1, 9); |  | ||||||
|     const __m128i e3 = _mm_srai_epi32(d3, 9); |  | ||||||
|     const __m128i e13 = _mm_packs_epi32(e1, e3); |  | ||||||
|  |  | ||||||
|     // Transpose. |  | ||||||
|     // 00 01 02 03  20 21 22 23 |  | ||||||
|     // 10 11 12 13  30 31 32 33 |  | ||||||
|     const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13); |  | ||||||
|     const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13); |  | ||||||
|     // 00 10 01 11   02 12 03 13 |  | ||||||
|     // 20 30 21 31   22 32 23 33 |  | ||||||
|     const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); |  | ||||||
|     v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); |  | ||||||
|     v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); |  | ||||||
|     // 02 12 22 32   03 13 23 33 |  | ||||||
|     // 00 10 20 30   01 11 21 31 |  | ||||||
|     // 03 13 23 33   02 12 22 32 |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // Second pass |   // Second pass | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user