mirror of
https://github.com/webmproject/libwebp.git
synced 2025-02-06 20:12:53 +01:00
Introduce FTransform2 to perform two transforms at a time.
FTransform goes from ~12.0% to 11.5% of total CPU time. Change-Id: Ibcb23155324f4fd8b235563f80668531c781f624
This commit is contained in:
parent
aa6065aedd
commit
ac76801159
@ -144,6 +144,7 @@ typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
|
||||
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
|
||||
extern VP8Idct VP8ITransform;
|
||||
extern VP8Fdct VP8FTransform;
|
||||
extern VP8Fdct VP8FTransform2; // performs two transforms at a time
|
||||
extern VP8WHT VP8FTransformWHT;
|
||||
// Predictions
|
||||
// *dst is the destination block. *top and *left can be NULL.
|
||||
|
@ -177,6 +177,11 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
}
|
||||
}
|
||||
|
||||
// Forward-transforms two 4x4 blocks with a single call.
// Both blocks are handed to whatever implementation is currently installed in
// the VP8FTransform function pointer, so this wrapper works with any
// single-block transform.
//   src, ref: start of the first block; the second block starts 4 bytes later.
//   out:      output of the first block goes to out[0..], output of the
//             second block starts at out[16].
static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
VP8FTransform(src, ref, out);
|
||||
// Second block: 4 bytes further in src/ref, 16 coefficients further in out.
VP8FTransform(src + 4, ref + 4, out + 16);
|
||||
}
|
||||
|
||||
static void FTransformWHT(const int16_t* in, int16_t* out) {
|
||||
// input is 12b signed
|
||||
int32_t tmp[16];
|
||||
@ -704,6 +709,7 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) {
|
||||
VP8CHisto VP8CollectHistogram;
|
||||
VP8Idct VP8ITransform;
|
||||
VP8Fdct VP8FTransform;
|
||||
VP8Fdct VP8FTransform2;
|
||||
VP8WHT VP8FTransformWHT;
|
||||
VP8Intra4Preds VP8EncPredLuma4;
|
||||
VP8IntraPreds VP8EncPredLuma16;
|
||||
@ -740,6 +746,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
|
||||
VP8CollectHistogram = CollectHistogram;
|
||||
VP8ITransform = ITransform;
|
||||
VP8FTransform = FTransform;
|
||||
VP8FTransform2 = FTransform2;
|
||||
VP8FTransformWHT = FTransformWHT;
|
||||
VP8EncPredLuma4 = Intra4Preds;
|
||||
VP8EncPredLuma16 = Intra16Preds;
|
||||
|
@ -274,66 +274,24 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
||||
}
|
||||
}
|
||||
|
||||
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i seven = _mm_set1_epi16(7);
|
||||
static void FTransformPass1(const __m128i* const in01,
|
||||
const __m128i* const in23,
|
||||
__m128i* const out01,
|
||||
__m128i* const out32) {
|
||||
const __m128i k937 = _mm_set1_epi32(937);
|
||||
const __m128i k1812 = _mm_set1_epi32(1812);
|
||||
const __m128i k51000 = _mm_set1_epi32(51000);
|
||||
const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
|
||||
const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
|
||||
5352, 2217, 5352, 2217);
|
||||
const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
|
||||
2217, -5352, 2217, -5352);
|
||||
|
||||
const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
|
||||
const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
|
||||
const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
|
||||
2217, 5352, 2217, 5352);
|
||||
const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
|
||||
-5352, 2217, -5352, 2217);
|
||||
__m128i v01, v32;
|
||||
|
||||
|
||||
// Difference between src and ref and initial transpose.
|
||||
{
|
||||
// Load src and convert to 16b.
|
||||
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
|
||||
const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
|
||||
const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
|
||||
const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
|
||||
const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
|
||||
const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
|
||||
const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
|
||||
const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
|
||||
// Load ref and convert to 16b.
|
||||
const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
|
||||
const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
|
||||
const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
|
||||
const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
|
||||
const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
|
||||
const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
|
||||
const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
|
||||
const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
|
||||
// Compute difference. -> 00 01 02 03 00 00 00 00
|
||||
const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
|
||||
const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
|
||||
const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
|
||||
const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
|
||||
|
||||
|
||||
// Unpack and shuffle
|
||||
// 00 01 02 03 0 0 0 0
|
||||
// 10 11 12 13 0 0 0 0
|
||||
// 20 21 22 23 0 0 0 0
|
||||
// 30 31 32 33 0 0 0 0
|
||||
const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
|
||||
const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
|
||||
// 00 01 10 11 02 03 12 13
|
||||
// 20 21 30 31 22 23 32 33
|
||||
const __m128i shuf01_p =
|
||||
_mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
const __m128i shuf23_p =
|
||||
_mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
// *in01 = 00 01 10 11 02 03 12 13
|
||||
// *in23 = 20 21 30 31 22 23 32 33
|
||||
const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
// 00 01 10 11 03 02 13 12
|
||||
// 20 21 30 31 23 22 33 32
|
||||
const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
|
||||
@ -358,19 +316,28 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1...
|
||||
const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3
|
||||
const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
|
||||
v01 = _mm_unpacklo_epi32(s_lo, s_hi);
|
||||
v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
|
||||
*out01 = _mm_unpacklo_epi32(s_lo, s_hi);
|
||||
*out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
|
||||
}
|
||||
|
||||
// Second pass
|
||||
{
|
||||
static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
|
||||
int16_t* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i seven = _mm_set1_epi16(7);
|
||||
const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
|
||||
5352, 2217, 5352, 2217);
|
||||
const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
|
||||
2217, -5352, 2217, -5352);
|
||||
const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
|
||||
const __m128i k51000 = _mm_set1_epi32(51000);
|
||||
|
||||
// Same operations are done on the (0,3) and (1,2) pairs.
|
||||
// a0 = v0 + v3
|
||||
// a1 = v1 + v2
|
||||
// a3 = v0 - v3
|
||||
// a2 = v1 - v2
|
||||
const __m128i a01 = _mm_add_epi16(v01, v32);
|
||||
const __m128i a32 = _mm_sub_epi16(v01, v32);
|
||||
const __m128i a01 = _mm_add_epi16(*v01, *v32);
|
||||
const __m128i a32 = _mm_sub_epi16(*v01, *v32);
|
||||
const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
|
||||
const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
|
||||
const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
|
||||
@ -404,6 +371,96 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
_mm_storeu_si128((__m128i*)&out[0], d0_g1);
|
||||
_mm_storeu_si128((__m128i*)&out[8], d2_f3);
|
||||
}
|
||||
|
||||
// Forward transform of one 4x4 block of differences (src - ref).
// This function only loads the pixels and computes the differences; the
// transform itself is done by FTransformPass1() and FTransformPass2().
// Every row is fetched with an 8-byte load, but only the first four
// differences of each row are fed to the transform.
// NOTE(review): this presumes 8 readable bytes per row in src and ref, and
// that BPS (defined outside this view) is the row stride -- confirm.
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
// Load src (8 bytes per row) and convert to 16b.
|
||||
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
|
||||
const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
|
||||
const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
|
||||
const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
|
||||
const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
|
||||
const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
|
||||
const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
|
||||
const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
|
||||
// Load ref (8 bytes per row) and convert to 16b.
|
||||
const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
|
||||
const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
|
||||
const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
|
||||
const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
|
||||
const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
|
||||
const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
|
||||
const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
|
||||
const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
|
||||
// Compute difference. -> 00 01 02 03 x x x x
// ('x' lanes hold the differences of columns 4..7, which are not used here.)
|
||||
const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
|
||||
const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
|
||||
const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
|
||||
const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
|
||||
|
||||
// Unpack and shuffle
|
||||
// 00 01 02 03 x x x x
|
||||
// 10 11 12 13 x x x x
|
||||
// 20 21 22 23 x x x x
|
||||
// 30 31 32 33 x x x x
|
||||
// Interleaving the low 32-bit pairs drops the 'x' lanes:
// shuf01 = 00 01 10 11 02 03 12 13
const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
|
||||
// shuf23 = 20 21 30 31 22 23 32 33
const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
|
||||
__m128i v01, v32;
|
||||
|
||||
// First pass
|
||||
FTransformPass1(&shuf01, &shuf23, &v01, &v32);
|
||||
|
||||
// Second pass: writes the result to 'out'.
|
||||
FTransformPass2(&v01, &v32, out);
|
||||
}
|
||||
|
||||
// Forward transform of two 4x4 blocks of differences (src - ref) at once.
// One 8-byte load per row covers both blocks: columns 0..3 belong to the first
// block and columns 4..7 (marked with ' below) to the second one.
// The low halves of the difference rows go through FTransformPass1() and
// FTransformPass2() for the first block, the high halves for the second.
//   out: first block is written at out + 0, second block at out + 16.
// NOTE(review): BPS is defined outside this view; presumably the row stride
// of src and ref -- confirm.
static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
// Load src (8 bytes per row, i.e. both blocks) and convert to 16b.
|
||||
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
|
||||
const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
|
||||
const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
|
||||
const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
|
||||
const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
|
||||
const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
|
||||
const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
|
||||
const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
|
||||
// Load ref (8 bytes per row, i.e. both blocks) and convert to 16b.
|
||||
const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
|
||||
const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
|
||||
const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
|
||||
const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
|
||||
const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
|
||||
const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
|
||||
const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
|
||||
const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
|
||||
// Compute difference. -> 00 01 02 03 00' 01' 02' 03'
|
||||
const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
|
||||
const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
|
||||
const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
|
||||
const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
|
||||
|
||||
// Unpack and shuffle
|
||||
// 00 01 02 03 00' 01' 02' 03'
|
||||
// 10 11 12 13 10' 11' 12' 13'
|
||||
// 20 21 22 23 20' 21' 22' 23'
|
||||
// 30 31 32 33 30' 31' 32' 33'
|
||||
// Low halves (first block):
// shuf01l = 00 01 10 11 02 03 12 13
const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1);
|
||||
// shuf23l = 20 21 30 31 22 23 32 33
const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3);
|
||||
// High halves (second block):
// shuf01h = 00' 01' 10' 11' 02' 03' 12' 13'
const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1);
|
||||
// shuf23h = 20' 21' 30' 31' 22' 23' 32' 33'
const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3);
|
||||
__m128i v01l, v32l;
|
||||
__m128i v01h, v32h;
|
||||
|
||||
// First pass, once per block
|
||||
FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
|
||||
FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
|
||||
|
||||
// Second pass: first block goes to out[0..], second block to out[16..].
|
||||
FTransformPass2(&v01l, &v32l, out + 0);
|
||||
FTransformPass2(&v01h, &v32h, out + 16);
|
||||
}
|
||||
|
||||
static void FTransformWHT(const int16_t* in, int16_t* out) {
|
||||
@ -1392,6 +1449,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
|
||||
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
|
||||
VP8ITransform = ITransform;
|
||||
VP8FTransform = FTransform;
|
||||
VP8FTransform2 = FTransform2;
|
||||
VP8FTransformWHT = FTransformWHT;
|
||||
VP8SSE16x16 = SSE16x16;
|
||||
VP8SSE16x8 = SSE16x8;
|
||||
|
@ -723,8 +723,8 @@ static int ReconstructIntra16(VP8EncIterator* const it,
|
||||
int n;
|
||||
int16_t tmp[16][16], dc_tmp[16];
|
||||
|
||||
for (n = 0; n < 16; ++n) {
|
||||
VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
|
||||
for (n = 0; n < 16; n += 2) {
|
||||
VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
|
||||
}
|
||||
VP8FTransformWHT(tmp[0], dc_tmp);
|
||||
nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
|
||||
@ -797,8 +797,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
|
||||
int n;
|
||||
int16_t tmp[8][16];
|
||||
|
||||
for (n = 0; n < 8; ++n) {
|
||||
VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
|
||||
for (n = 0; n < 8; n += 2) {
|
||||
VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
|
||||
}
|
||||
if (DO_TRELLIS_UV && it->do_trellis_) {
|
||||
int ch, x, y;
|
||||
|
Loading…
x
Reference in New Issue
Block a user