mirror of
https://github.com/webmproject/libwebp.git
synced 2024-11-20 12:28:26 +01:00
Added MSA optimized intra prediction 16x16 functions
1. DC16 2. TM16 3. VE16 4. HE16 5. DC16NoTop 6. DC16NoLeft 7. DC16NoTopLeft Change-Id: I53c57c27cee40973b7ee40a7b7a7fbf0df812d1a
This commit is contained in:
parent
0afa0ce2ff
commit
293d786f31
@ -772,6 +772,99 @@ static void LD4(uint8_t* dst) { // Down-Left
|
|||||||
SW4(val0, val1, val2, val3, dst, BPS);
|
SW4(val0, val1, val2, val3, dst, BPS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 16x16
|
||||||
|
|
||||||
|
static void DC16(uint8_t* dst) { // DC
|
||||||
|
uint32_t dc = 16;
|
||||||
|
int i;
|
||||||
|
const v16u8 rtop = LD_UB(dst - BPS);
|
||||||
|
const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
|
||||||
|
v16u8 out;
|
||||||
|
|
||||||
|
for (i = 0; i < 16; ++i) {
|
||||||
|
dc += dst[-1 + i * BPS];
|
||||||
|
}
|
||||||
|
dc += HADD_UH_U32(dctop);
|
||||||
|
out = (v16u8)__msa_fill_b(dc >> 5);
|
||||||
|
ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
|
||||||
|
ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void TM16(uint8_t* dst) {
|
||||||
|
int j;
|
||||||
|
v8i16 d1, d2;
|
||||||
|
const v16i8 zero = { 0 };
|
||||||
|
const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
|
||||||
|
const v16i8 T = LD_SB(dst - BPS);
|
||||||
|
|
||||||
|
ILVRL_B2_SH(zero, T, d1, d2);
|
||||||
|
SUB2(d1, TL, d2, TL, d1, d2);
|
||||||
|
for (j = 0; j < 16; j += 4) {
|
||||||
|
v16i8 t0, t1, t2, t3;
|
||||||
|
v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
|
||||||
|
const v8i16 L0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
|
||||||
|
const v8i16 L1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
|
||||||
|
const v8i16 L2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
|
||||||
|
const v8i16 L3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
|
||||||
|
ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
|
||||||
|
ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
|
||||||
|
CLIP_SH4_0_255(r0, r1, r2, r3);
|
||||||
|
CLIP_SH4_0_255(r4, r5, r6, r7);
|
||||||
|
PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
|
||||||
|
ST_SB4(t0, t1, t2, t3, dst, BPS);
|
||||||
|
dst += 4 * BPS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void VE16(uint8_t* dst) { // vertical
|
||||||
|
const v16u8 rtop = LD_UB(dst - BPS);
|
||||||
|
ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst, BPS);
|
||||||
|
ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst + 8 * BPS, BPS);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void HE16(uint8_t* dst) { // horizontal
|
||||||
|
int j;
|
||||||
|
for (j = 16; j > 0; j -= 4) {
|
||||||
|
const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
|
||||||
|
const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
|
||||||
|
const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
|
||||||
|
const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
|
||||||
|
ST_UB4(L0, L1, L2, L3, dst, BPS);
|
||||||
|
dst += 4 * BPS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
|
||||||
|
int j;
|
||||||
|
uint32_t dc = 8;
|
||||||
|
v16u8 out;
|
||||||
|
|
||||||
|
for (j = 0; j < 16; ++j) {
|
||||||
|
dc += dst[-1 + j * BPS];
|
||||||
|
}
|
||||||
|
out = (v16u8)__msa_fill_b(dc >> 4);
|
||||||
|
ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
|
||||||
|
ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
|
||||||
|
uint32_t dc = 8;
|
||||||
|
const v16u8 rtop = LD_UB(dst - BPS);
|
||||||
|
const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
|
||||||
|
v16u8 out;
|
||||||
|
|
||||||
|
dc += HADD_UH_U32(dctop);
|
||||||
|
out = (v16u8)__msa_fill_b(dc >> 4);
|
||||||
|
ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
|
||||||
|
ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void DC16NoTopLeft(uint8_t* dst) { // DC with nothing
|
||||||
|
const v16u8 out = (v16u8)__msa_fill_b(0x80);
|
||||||
|
ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
|
||||||
|
ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
|
||||||
|
}
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Entry point
|
// Entry point
|
||||||
|
|
||||||
@ -801,6 +894,13 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
|
|||||||
VP8PredLuma4[2] = VE4;
|
VP8PredLuma4[2] = VE4;
|
||||||
VP8PredLuma4[4] = RD4;
|
VP8PredLuma4[4] = RD4;
|
||||||
VP8PredLuma4[6] = LD4;
|
VP8PredLuma4[6] = LD4;
|
||||||
|
VP8PredLuma16[0] = DC16;
|
||||||
|
VP8PredLuma16[1] = TM16;
|
||||||
|
VP8PredLuma16[2] = VE16;
|
||||||
|
VP8PredLuma16[3] = HE16;
|
||||||
|
VP8PredLuma16[4] = DC16NoTop;
|
||||||
|
VP8PredLuma16[5] = DC16NoLeft;
|
||||||
|
VP8PredLuma16[6] = DC16NoTopLeft;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // !WEBP_USE_MSA
|
#else // !WEBP_USE_MSA
|
||||||
|
@ -243,6 +243,13 @@
|
|||||||
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
|
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
|
||||||
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
|
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Description : Store 8 vectors to destination memory as two groups of 4
 * Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
 * Details     : 'in0'..'in3' are stored through ST_B4 starting at 'pdst';
 *               'in4'..'in7' are stored through ST_B4 starting at
 *               'pdst' + 4 * 'stride'.
 *               'pdst' and 'stride' are parenthesized where they take part
 *               in address arithmetic so that expression arguments (e.g. a
 *               stride of 'a + b') are handled correctly. Both are evaluated
 *               more than once, so they must be free of side effects.
 */
#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
              pdst, stride) {                                       \
  ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                   \
  ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), stride);  \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
|
||||||
|
|
||||||
/* Description : Store 2x4 byte block to destination memory from input vector
|
/* Description : Store 2x4 byte block to destination memory from input vector
|
||||||
* Arguments : Inputs - in, stidx, pdst, stride
|
* Arguments : Inputs - in, stidx, pdst, stride
|
||||||
* Details : Index 'stidx' halfword element from 'in' vector is copied to
|
* Details : Index 'stidx' halfword element from 'in' vector is copied to
|
||||||
@ -359,6 +366,25 @@
|
|||||||
CLIP_SW_0_255(in3); \
|
CLIP_SW_0_255(in3); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Description : Horizontal addition of 8 unsigned halfword elements
|
||||||
|
* Arguments : Input - in (unsigned halfword vector)
|
||||||
|
* Output - sum_m (u32 sum)
|
||||||
|
* Return Type - unsigned word
|
||||||
|
* Details : 8 unsigned halfword elements of input vector are added
|
||||||
|
* together and the resulting integer sum is returned
|
||||||
|
*/
|
||||||
|
static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
|
||||||
|
uint32_t sum_m;
|
||||||
|
v2u64 res0_m, res1_m;
|
||||||
|
const v4u32 res_m = __msa_hadd_u_w(in, in);
|
||||||
|
res0_m = __msa_hadd_u_d(res_m, res_m);
|
||||||
|
res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);
|
||||||
|
res0_m = res0_m + res1_m;
|
||||||
|
sum_m = __msa_copy_s_w((v4i32)res0_m, 0);
|
||||||
|
return sum_m;
|
||||||
|
}
|
||||||
|
#define HADD_UH_U32(in) func_hadd_uh_u32(in)
|
||||||
|
|
||||||
/* Description : Set element n input vector to GPR value
|
/* Description : Set element n input vector to GPR value
|
||||||
* Arguments : Inputs - in0, in1, in2, in3
|
* Arguments : Inputs - in0, in1, in2, in3
|
||||||
* Output - out
|
* Output - out
|
||||||
@ -660,6 +686,16 @@
|
|||||||
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
|
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
|
||||||
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
|
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Description : Apply PCKEV_B2 to four pairs of input vectors
 * Arguments   : Inputs  - a0, b0, a1, b1, a2, b2, a3, b3
 *               Outputs - res0, res1, res2, res3
 *               Return Type - as per RTYPE
 * Details     : Pairs (a0, b0) and (a1, b1) are forwarded to one PCKEV_B2
 *               invocation producing 'res0' and 'res1'; pairs (a2, b2) and
 *               (a3, b3) are forwarded to a second invocation producing
 *               'res2' and 'res3'.
 */
#define PCKEV_B4(RTYPE, a0, b0, a1, b1, a2, b2, a3, b3,  \
                 res0, res1, res2, res3) {               \
  PCKEV_B2(RTYPE, a0, b0, a1, b1, res0, res1);           \
  PCKEV_B2(RTYPE, a2, b2, a3, b3, res2, res3);           \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
|
||||||
|
|
||||||
/* Description : Arithmetic immediate shift right all elements of word vector
|
/* Description : Arithmetic immediate shift right all elements of word vector
|
||||||
* Arguments : Inputs - in0, in1, shift
|
* Arguments : Inputs - in0, in1, shift
|
||||||
* Outputs - in place operation
|
* Outputs - in place operation
|
||||||
@ -745,6 +781,17 @@
|
|||||||
ADD2(in4, in5, in6, in7, out2, out3); \
|
ADD2(in4, in5, in6, in7, out2, out3); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Description : Subtraction of 2 pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Each element in 'in1' is subtracted from 'in0' and result is
 *               written to 'out0'. Likewise, each element in 'in3' is
 *               subtracted from 'in2' and result is written to 'out1'.
 *               Inputs are parenthesized so that expression arguments
 *               (e.g. 'a + b') are subtracted as a whole.
 */
#define SUB2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) - (in1);                         \
  out1 = (in2) - (in3);                         \
}
|
||||||
|
|
||||||
/* Description : Sign extend halfword elements from input vector and return
|
/* Description : Sign extend halfword elements from input vector and return
|
||||||
* the result in pair of vectors
|
* the result in pair of vectors
|
||||||
* Arguments : Input - in (halfword vector)
|
* Arguments : Input - in (halfword vector)
|
||||||
|
Loading…
Reference in New Issue
Block a user