mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-27 06:08:21 +01:00
Merge "Add MSA optimized distortion functions"
This commit is contained in:
commit
bbb6ecd9b0
@ -15,6 +15,7 @@
|
|||||||
|
|
||||||
#if defined(WEBP_USE_MSA)
|
#if defined(WEBP_USE_MSA)
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
#include "./msa_macro.h"
|
#include "./msa_macro.h"
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
@ -165,6 +166,57 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
|
|||||||
ST_SH2(out0, out1, out, 8);
|
ST_SH2(out0, out1, out, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int TTransform(const uint8_t* in, const uint16_t* w) {
|
||||||
|
int sum;
|
||||||
|
uint32_t in0_m, in1_m, in2_m, in3_m;
|
||||||
|
v16i8 src0;
|
||||||
|
v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
|
||||||
|
v4i32 dst0, dst1;
|
||||||
|
const v16i8 zero = { 0 };
|
||||||
|
const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
|
||||||
|
const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
|
||||||
|
const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
|
||||||
|
const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
|
||||||
|
|
||||||
|
LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
|
||||||
|
INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
|
||||||
|
ILVRL_B2_SH(zero, src0, tmp0, tmp1);
|
||||||
|
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
|
||||||
|
ADDSUB2(in0, in1, tmp0, tmp1);
|
||||||
|
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
|
||||||
|
ADDSUB2(tmp2, tmp3, tmp0, tmp1);
|
||||||
|
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
|
||||||
|
ADDSUB2(in0, in1, tmp0, tmp1);
|
||||||
|
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
|
||||||
|
ADDSUB2(tmp2, tmp3, tmp0, tmp1);
|
||||||
|
tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
|
||||||
|
tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
|
||||||
|
LD_SH2(w, 8, tmp2, tmp3);
|
||||||
|
DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
|
||||||
|
dst0 = dst0 + dst1;
|
||||||
|
sum = HADD_SW_S32(dst0);
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns |TTransform(b, w) - TTransform(a, w)| shifted right by 5, i.e. the
// absolute difference of the two weighted transform sums divided by 32.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int weighted_a = TTransform(a, w);
  const int weighted_b = TTransform(b, w);
  const int delta = weighted_b - weighted_a;
  return abs(delta) >> 5;
}
|
||||||
|
|
||||||
|
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
||||||
|
const uint16_t* const w) {
|
||||||
|
int D = 0;
|
||||||
|
int x, y;
|
||||||
|
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
|
||||||
|
for (x = 0; x < 16; x += 4) {
|
||||||
|
D += Disto4x4(a + x + y, b + x + y, w);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return D;
|
||||||
|
}
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Entry point
|
// Entry point
|
||||||
|
|
||||||
@ -174,6 +226,9 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
|
|||||||
VP8ITransform = ITransform;
|
VP8ITransform = ITransform;
|
||||||
VP8FTransform = FTransform;
|
VP8FTransform = FTransform;
|
||||||
VP8FTransformWHT = FTransformWHT;
|
VP8FTransformWHT = FTransformWHT;
|
||||||
|
|
||||||
|
VP8TDisto4x4 = Disto4x4;
|
||||||
|
VP8TDisto16x16 = Disto16x16;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // !WEBP_USE_MSA
|
#else // !WEBP_USE_MSA
|
||||||
|
@ -393,6 +393,22 @@
|
|||||||
} while (0)
|
} while (0)
|
||||||
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
|
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Description : Dot product of halfword vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication results of adjacent odd-even elements
 *               are added together and written to the 'out0' vector.
 *               'mult1' and 'cnst1' are processed the same way into 'out1'.
 *               Input arguments are parenthesized so that the casts apply to
 *               the whole argument expression.
 */
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
  out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0));       \
  out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1));       \
} while (0)
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
|
||||||
|
|
||||||
/* Description : Dot product & addition of halfword vector elements
|
/* Description : Dot product & addition of halfword vector elements
|
||||||
* Arguments : Inputs - mult0, mult1, cnst0, cnst1
|
* Arguments : Inputs - mult0, mult1, cnst0, cnst1
|
||||||
* Outputs - out0, out1
|
* Outputs - out0, out1
|
||||||
@ -448,6 +464,22 @@
|
|||||||
CLIP_SW_0_255(in3); \
|
CLIP_SW_0_255(in3); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
/* Description : Horizontal addition of 4 signed word elements of input vector
|
||||||
|
* Arguments : Input - in (signed word vector)
|
||||||
|
* Output - sum_m (i32 sum)
|
||||||
|
* Return Type - signed word (GP)
|
||||||
|
* Details : 4 signed word elements of 'in' vector are added together and
|
||||||
|
* the resulting integer sum is returned
|
||||||
|
*/
|
||||||
|
static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) {
|
||||||
|
const v2i64 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);
|
||||||
|
const v2i64 res1_m = __msa_splati_d(res0_m, 1);
|
||||||
|
const v2i64 out = res0_m + res1_m;
|
||||||
|
int32_t sum_m = __msa_copy_s_w((v4i32)out, 0);
|
||||||
|
return sum_m;
|
||||||
|
}
|
||||||
|
#define HADD_SW_S32(in) func_hadd_sw_s32(in)
|
||||||
|
|
||||||
/* Description : Horizontal addition of 8 unsigned halfword elements
|
/* Description : Horizontal addition of 8 unsigned halfword elements
|
||||||
* Arguments : Input - in (unsigned halfword vector)
|
* Arguments : Input - in (unsigned halfword vector)
|
||||||
* Output - sum_m (u32 sum)
|
* Output - sum_m (u32 sum)
|
||||||
|
Loading…
Reference in New Issue
Block a user