mirror of
https://github.com/webmproject/libwebp.git
synced 2024-11-20 04:18:26 +01:00
Small speedup in FTransform.
It removes two _mm_unpacklo_epi32 and two _mm_sub_epi16. Change-Id: Icdf86259f796ba855d1cda5e9c0e99cb396cb351
This commit is contained in:
parent
9dbd4aad77
commit
6e36b51188
@ -292,42 +292,42 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
|
|||||||
|
|
||||||
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||||
const __m128i zero = _mm_setzero_si128();
|
const __m128i zero = _mm_setzero_si128();
|
||||||
|
// Load src.
|
||||||
// Load src and convert to 16b.
|
|
||||||
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
|
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
|
||||||
const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
|
const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
|
||||||
const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
|
const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
|
||||||
const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
|
const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
|
||||||
const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
|
// 00 01 02 03 *
|
||||||
const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
|
// 10 11 12 13 *
|
||||||
const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
|
// 20 21 22 23 *
|
||||||
const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
|
// 30 31 32 33 *
|
||||||
// Load ref and convert to 16b.
|
// Shuffle.
|
||||||
|
const __m128i src_0 = _mm_unpacklo_epi16(src0, src1);
|
||||||
|
const __m128i src_1 = _mm_unpacklo_epi16(src2, src3);
|
||||||
|
// 00 01 10 11 02 03 12 13 * * ...
|
||||||
|
// 20 21 30 31 22 23 32 33 * * ...
|
||||||
|
|
||||||
|
// Load ref.
|
||||||
const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
|
const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
|
||||||
const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
|
const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
|
||||||
const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
|
const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
|
||||||
const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
|
const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
|
||||||
const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
|
const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1);
|
||||||
const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
|
const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3);
|
||||||
const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
|
|
||||||
const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
|
|
||||||
// Compute difference. -> 00 01 02 03 00 00 00 00
|
|
||||||
const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
|
|
||||||
const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
|
|
||||||
const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
|
|
||||||
const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
|
|
||||||
|
|
||||||
// Unpack and shuffle
|
// Convert both to 16 bit.
|
||||||
// 00 01 02 03 0 0 0 0
|
const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero);
|
||||||
// 10 11 12 13 0 0 0 0
|
const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero);
|
||||||
// 20 21 22 23 0 0 0 0
|
const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero);
|
||||||
// 30 31 32 33 0 0 0 0
|
const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero);
|
||||||
const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
|
|
||||||
const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
|
// Compute the difference.
|
||||||
|
const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b);
|
||||||
|
const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b);
|
||||||
__m128i v01, v32;
|
__m128i v01, v32;
|
||||||
|
|
||||||
// First pass
|
// First pass
|
||||||
FTransformPass1(&shuf01, &shuf23, &v01, &v32);
|
FTransformPass1(&row01, &row23, &v01, &v32);
|
||||||
|
|
||||||
// Second pass
|
// Second pass
|
||||||
FTransformPass2(&v01, &v32, out);
|
FTransformPass2(&v01, &v32, out);
|
||||||
|
Loading…
Reference in New Issue
Block a user