Merge "Added MSA optimized edge filtering functions"

commit 9ad2352d0d
@@ -153,6 +153,288 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}

//------------------------------------------------------------------------------
// Edge filtering functions

#define FLIP_SIGN2(in0, in1, out0, out1) {  \
  out0 = (v16i8)__msa_xori_b(in0, 0x80);    \
  out1 = (v16i8)__msa_xori_b(in1, 0x80);    \
}

#define FLIP_SIGN4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  FLIP_SIGN2(in0, in1, out0, out1);                               \
  FLIP_SIGN2(in2, in3, out2, out3);                               \
}
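The filter arithmetic below runs on saturating signed bytes, so FLIP_SIGN2/FLIP_SIGN4 first re-bias the unsigned pixels: XOR with 0x80 is an order-preserving map from [0, 255] to [-128, 127]. A one-line scalar model (an illustrative sketch, not part of the patch):

#include <stdint.h>

// Scalar model of FLIP_SIGN2: 0 -> -128, 128 -> 0, 255 -> 127.
static int8_t FlipSign(uint8_t px) { return (int8_t)(px ^ 0x80); }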
#define FILT_VAL(q0_m, p0_m, mask, filt) do {  \
  v16i8 q0_sub_p0;                             \
  q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);      \
  filt = __msa_adds_s_b(filt, q0_sub_p0);      \
  filt = __msa_adds_s_b(filt, q0_sub_p0);      \
  filt = __msa_adds_s_b(filt, q0_sub_p0);      \
  filt = filt & mask;                          \
} while (0)
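FILT_VAL computes the common VP8 filter value filt = clamp(filt + 3 * (q0 - p0)) and gates it with the edge mask; the three separate saturating adds keep every intermediate inside int8 range. A scalar sketch (Sat8/AddsS8/SubsS8 are illustrative helpers mirroring the MSA saturating ops, reused by the sketches further down):

#include <stdint.h>

static int8_t Sat8(int v) {  // saturate to int8, like __msa_sat_s_h(x, 7)
  return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}
static int8_t AddsS8(int8_t a, int8_t b) { return Sat8(a + b); }  // adds_s_b
static int8_t SubsS8(int8_t a, int8_t b) { return Sat8(a - b); }  // subs_s_b

// 'filt' arrives holding clamp(p1 - q1), possibly gated by hev;
// 'mask' is 0x00 or 0xff per pixel.
static int8_t FiltVal(int8_t q0, int8_t p0, int8_t filt, int8_t mask) {
  const int8_t q0_sub_p0 = SubsS8(q0, p0);
  filt = AddsS8(filt, q0_sub_p0);
  filt = AddsS8(filt, q0_sub_p0);
  filt = AddsS8(filt, q0_sub_p0);
  return (int8_t)(filt & mask);
}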
#define FILT2(q_m, p_m, q, p) do {            \
  u_r = SRAI_H(temp1, 7);                     \
  u_r = __msa_sat_s_h(u_r, 7);                \
  u_l = SRAI_H(temp3, 7);                     \
  u_l = __msa_sat_s_h(u_l, 7);                \
  u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);  \
  q_m = __msa_subs_s_b(q_m, u);               \
  p_m = __msa_adds_s_b(p_m, u);               \
  q = __msa_xori_b((v16u8)q_m, 0x80);         \
  p = __msa_xori_b((v16u8)p_m, 0x80);         \
} while (0)
#define LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) do {  \
  v16i8 p1_m, p0_m, q0_m, q1_m;                         \
  v16i8 filt, t1, t2;                                   \
  const v16i8 cnst4b = __msa_ldi_b(4);                  \
  const v16i8 cnst3b = __msa_ldi_b(3);                  \
                                                        \
  FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);   \
  filt = __msa_subs_s_b(p1_m, q1_m);                    \
  filt = filt & hev;                                    \
  FILT_VAL(q0_m, p0_m, mask, filt);                     \
  t1 = __msa_adds_s_b(filt, cnst4b);                    \
  t1 = SRAI_B(t1, 3);                                   \
  t2 = __msa_adds_s_b(filt, cnst3b);                    \
  t2 = SRAI_B(t2, 3);                                   \
  q0_m = __msa_subs_s_b(q0_m, t1);                      \
  q0 = __msa_xori_b((v16u8)q0_m, 0x80);                 \
  p0_m = __msa_adds_s_b(p0_m, t2);                      \
  p0 = __msa_xori_b((v16u8)p0_m, 0x80);                 \
  filt = __msa_srari_b(t1, 1);                          \
  hev = __msa_xori_b(hev, 0xff);                        \
  filt = filt & hev;                                    \
  q1_m = __msa_subs_s_b(q1_m, filt);                    \
  q1 = __msa_xori_b((v16u8)q1_m, 0x80);                 \
  p1_m = __msa_adds_s_b(p1_m, filt);                    \
  p1 = __msa_xori_b((v16u8)p1_m, 0x80);                 \
} while (0)
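For reference, the whole 4-tap "simple" filter in scalar form, reusing the Sat8/AddsS8/SubsS8/FiltVal helpers from the sketch above (a model of one pixel position; values are already sign-flipped):

static void Filter4(int8_t* p1, int8_t* p0, int8_t* q0, int8_t* q1,
                    int8_t mask, int8_t hev) {
  int8_t filt = (int8_t)(SubsS8(*p1, *q1) & hev);
  filt = FiltVal(*q0, *p0, filt, mask);
  const int8_t t1 = (int8_t)(AddsS8(filt, 4) >> 3);  // applied to q0
  const int8_t t2 = (int8_t)(AddsS8(filt, 3) >> 3);  // applied to p0
  *q0 = SubsS8(*q0, t1);
  *p0 = AddsS8(*p0, t2);
  filt = (int8_t)(((t1 + 1) >> 1) & ~hev);  // rounded half, only where !hev
  *q1 = SubsS8(*q1, filt);
  *p1 = AddsS8(*p1, filt);
}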
#define LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) do {  \
  v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                   \
  v16i8 u, filt, t1, t2, filt_sign;                           \
  v8i16 filt_r, filt_l, u_r, u_l;                             \
  v8i16 temp0, temp1, temp2, temp3;                           \
  const v16i8 cnst4b = __msa_ldi_b(4);                        \
  const v16i8 cnst3b = __msa_ldi_b(3);                        \
  const v8i16 cnst9h = __msa_ldi_h(9);                        \
                                                              \
  FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);         \
  filt = __msa_subs_s_b(p1_m, q1_m);                          \
  FILT_VAL(q0_m, p0_m, mask, filt);                           \
  FLIP_SIGN2(p2, q2, p2_m, q2_m);                             \
  t2 = filt & hev;                                            \
  /* filt_val &= ~hev */                                      \
  hev = __msa_xori_b(hev, 0xff);                              \
  filt = filt & hev;                                          \
  t1 = __msa_adds_s_b(t2, cnst4b);                            \
  t1 = SRAI_B(t1, 3);                                         \
  t2 = __msa_adds_s_b(t2, cnst3b);                            \
  t2 = SRAI_B(t2, 3);                                         \
  q0_m = __msa_subs_s_b(q0_m, t1);                            \
  p0_m = __msa_adds_s_b(p0_m, t2);                            \
  filt_sign = __msa_clti_s_b(filt, 0);                        \
  ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);               \
  /* update q2/p2 */                                          \
  temp0 = filt_r * cnst9h;                                    \
  temp1 = ADDVI_H(temp0, 63);                                 \
  temp2 = filt_l * cnst9h;                                    \
  temp3 = ADDVI_H(temp2, 63);                                 \
  FILT2(q2_m, p2_m, q2, p2);                                  \
  /* update q1/p1 */                                          \
  temp1 = temp1 + temp0;                                      \
  temp3 = temp3 + temp2;                                      \
  FILT2(q1_m, p1_m, q1, p1);                                  \
  /* update q0/p0 */                                          \
  temp1 = temp1 + temp0;                                      \
  temp3 = temp3 + temp2;                                      \
  FILT2(q0_m, p0_m, q0, p0);                                  \
} while (0)
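The strong (macroblock) filter spreads a gated filter value w over three pixel pairs with weights 27/128, 18/128 and 9/128; the macro builds the three numerators by accumulating 9 * w (hence cnst9h) and leaves the >> 7, saturate and pack to FILT2. A scalar model, helpers as above (an illustrative sketch):

static void MbFilter(int8_t* p2, int8_t* p1, int8_t* p0,
                     int8_t* q0, int8_t* q1, int8_t* q2, int w) {
  const int8_t a3 = Sat8((9 * w + 63) >> 7);   // outer pair q2/p2
  const int8_t a2 = Sat8((18 * w + 63) >> 7);  // middle pair q1/p1
  const int8_t a1 = Sat8((27 * w + 63) >> 7);  // inner pair q0/p0
  *q2 = SubsS8(*q2, a3);  *p2 = AddsS8(*p2, a3);
  *q1 = SubsS8(*q1, a2);  *p1 = AddsS8(*p1, a2);
  *q0 = SubsS8(*q0, a1);  *p0 = AddsS8(*p0, a1);
}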
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                 \
                     q0_in, q1_in, q2_in, q3_in,                 \
                     limit_in, b_limit_in, thresh_in,            \
                     hev_out, mask_out) do {                     \
  v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
  v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
  v16u8 flat_out;                                                \
                                                                 \
  /* absolute subtraction of pixel values */                     \
  p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
  p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
  p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
  q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
  q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
  q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
  p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
  p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
  /* calculation of hev */                                       \
  flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
  hev_out = (thresh_in < flat_out);                              \
  /* calculation of mask */                                      \
  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
  p1_asub_q1_m = SRAI_B(p1_asub_q1_m, 1);                        \
  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
  mask_out = (b_limit_in < p0_asub_q0_m);                        \
  mask_out = __msa_max_u_b(flat_out, mask_out);                  \
  p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
  mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
  q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
  mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
  mask_out = (limit_in < mask_out);                              \
  mask_out = __msa_xori_b(mask_out, 0xff);                       \
} while (0)
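LPF_MASK_HEV vectorizes the standard VP8 loop-filter edge tests. A scalar model for a single pixel column (a sketch; the macro's saturating 2 * |p0 - q0| add needs no special handling here since plain ints cannot overflow these sums):

#include <stdint.h>
#include <stdlib.h>

// Returns nonzero where the edge should be filtered; *hev flags
// high-edge-variance positions.
static int LoopFilterMask(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                          uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                          int limit, int b_limit, int thresh, int* hev) {
  int m = abs(p3 - p2);
  if (abs(p2 - p1) > m) m = abs(p2 - p1);
  if (abs(p1 - p0) > m) m = abs(p1 - p0);
  if (abs(q1 - q0) > m) m = abs(q1 - q0);
  if (abs(q2 - q1) > m) m = abs(q2 - q1);
  if (abs(q3 - q2) > m) m = abs(q3 - q2);
  *hev = (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh);
  return (m <= limit) &&
         (2 * abs(p0 - q0) + (abs(p1 - q1) >> 1) <= b_limit);
}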
#define ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) {    \
  const uint16_t tmp0_h = __msa_copy_s_h((v8i16)in1, in1_idx);  \
  const uint32_t tmp0_w = __msa_copy_s_w((v4i32)in0, in0_idx);  \
  SW(tmp0_w, pdst);                                             \
  SH(tmp0_h, pdst + stride);                                    \
}
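ST6x1_UB exists because HFilter16 rewrites six pixels (p2..q2) per row across a vertical edge, which no single MSA store covers. A scalar model of the two-part store (the callers below pass stride = 4, so the halfword lands right after the word):

#include <stdint.h>
#include <string.h>

// Six consecutive bytes written as a 4-byte word plus a 2-byte halfword.
static void Store6(uint8_t* pdst, uint32_t four_px, uint16_t two_px) {
  memcpy(pdst, &four_px, sizeof(four_px));    // SW(tmp0_w, pdst)
  memcpy(pdst + 4, &two_px, sizeof(two_px));  // SH(tmp0_h, pdst + stride)
}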
static void VFilter16(uint8_t* src, int stride,
                      int b_limit_in, int limit_in, int thresh_in) {
  uint8_t* ptemp = src - 4 * stride;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev;
  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);

  LD_UB8(ptemp, stride, p3, p2, p1, p0, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask);
  LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
  ptemp = src - 3 * stride;
  ST_UB4(p2, p1, p0, q0, ptemp, stride);
  ptemp += (4 * stride);
  ST_UB2(q1, q2, ptemp, stride);
}
static void HFilter16(uint8_t* src, int stride,
                      int b_limit_in, int limit_in, int thresh_in) {
  uint8_t* ptmp = src - 4;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);

  LD_UB8(ptmp, stride, row0, row1, row2, row3, row4, row5, row6, row7);
  ptmp += (8 * stride);
  LD_UB8(ptmp, stride, row8, row9, row10, row11, row12, row13, row14, row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                      row8, row9, row10, row11, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask);
  LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
  ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
  ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
  ILVRL_B2_SH(q2, q1, tmp2, tmp5);
  ptmp = src - 3;
  ST6x1_UB(tmp3, 0, tmp2, 0, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp3, 1, tmp2, 1, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp3, 2, tmp2, 2, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp3, 3, tmp2, 3, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp4, 0, tmp2, 4, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp4, 1, tmp2, 5, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp4, 2, tmp2, 6, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp4, 3, tmp2, 7, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp6, 0, tmp5, 0, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp6, 1, tmp5, 1, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp6, 2, tmp5, 2, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp6, 3, tmp5, 3, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp7, 0, tmp5, 4, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp7, 1, tmp5, 5, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp7, 2, tmp5, 6, ptmp, 4);
  ptmp += stride;
  ST6x1_UB(tmp7, 3, tmp5, 7, ptmp, 4);
}
// on three inner edges
static void VFilterHorEdge16i(uint8_t* src, int stride,
                              int b_limit, int limit, int thresh) {
  v16u8 mask, hev;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
  const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
  const v16u8 limit0 = (v16u8)__msa_fill_b(limit);

  LD_UB8((src - 4 * stride), stride, p3, p2, p1, p0, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask);
  LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
  ST_UB4(p1, p0, q0, q1, (src - 2 * stride), stride);
}

static void VFilter16i(uint8_t* src_y, int stride,
                       int b_limit, int limit, int thresh) {
  VFilterHorEdge16i(src_y + 4 * stride, stride, b_limit, limit, thresh);
  VFilterHorEdge16i(src_y + 8 * stride, stride, b_limit, limit, thresh);
  VFilterHorEdge16i(src_y + 12 * stride, stride, b_limit, limit, thresh);
}
static void HFilterVertEdge16i(uint8_t* src, int stride,
                               int b_limit, int limit, int thresh) {
  v16u8 mask, hev;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
  const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
  const v16u8 limit0 = (v16u8)__msa_fill_b(limit);

  LD_UB8(src - 4, stride, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src - 4 + (8 * stride), stride,
         row8, row9, row10, row11, row12, row13, row14, row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                      row8, row9, row10, row11, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask);
  LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
  src -= 2;
  ST4x8_UB(tmp2, tmp3, src, stride);
  src += (8 * stride);
  ST4x8_UB(tmp4, tmp5, src, stride);
}

static void HFilter16i(uint8_t* src_y, int stride,
                       int b_limit, int limit, int thresh) {
  HFilterVertEdge16i(src_y + 4, stride, b_limit, limit, thresh);
  HFilterVertEdge16i(src_y + 8, stride, b_limit, limit, thresh);
  HFilterVertEdge16i(src_y + 12, stride, b_limit, limit, thresh);
}
//------------------------------------------------------------------------------
// Entry point

@@ -163,6 +445,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
  VP8Transform = TransformTwo;
  VP8TransformDC = TransformDC;
  VP8TransformAC3 = TransformAC3;

  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16i = HFilter16i;
}

#else  // !WEBP_USE_MSA
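VP8DspInitMSA() wires the new functions into libwebp's runtime dispatch: VP8VFilter16 and friends are global function pointers that default to the portable C code and are overridden when an MSA-capable CPU is detected. A minimal sketch of that pattern, with hypothetical names (FilterFunc, MyVFilter16, VFilter16_C and DspInit are illustrative, not libwebp's actual declarations):

#include <stdint.h>

typedef void (*FilterFunc)(uint8_t* p, int stride,
                           int b_limit, int limit, int thresh);
FilterFunc MyVFilter16;  // dispatch slot, like VP8VFilter16

static void DspInit(int cpu_has_msa) {
  MyVFilter16 = VFilter16_C;                  // portable fallback (assumed)
  if (cpu_has_msa) MyVFilter16 = VFilter16;   // MSA override from this patch
}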
@@ -23,10 +23,12 @@

#ifdef CLANG_BUILD
#define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b)
#define SRAI_B(a, b) __msa_srai_b((v16i8)a, b)
#define SRAI_H(a, b) __msa_srai_h((v8i16)a, b)
#define SRAI_W(a, b) __msa_srai_w((v4i32)a, b)
#else
#define ADDVI_H(a, b) (a + b)
#define SRAI_B(a, b) (a >> b)
#define SRAI_H(a, b) (a >> b)
#define SRAI_W(a, b) (a >> b)
#endif
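The CLANG_BUILD split exists because GCC's vector extensions allow infix arithmetic directly on MSA vector types, while clang builds needed the explicit immediate-form intrinsics. A small usage sketch, assuming an MSA target (the function name is illustrative):

static v8i16 SarBy7(v8i16 v) {
  // Expands to (v >> 7) under GCC and __msa_srai_h((v8i16)v, 7) under
  // clang; both produce the same 8-lane arithmetic shift right.
  return SRAI_H(v, 7);
}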
@@ -183,6 +185,14 @@
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride,                                  \
              out0, out1, out2, out3, out4, out5, out6, out7) {     \
  LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3);               \
  LD_B4(RTYPE, psrc + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
/* Description : Load vectors with 8 halfword elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
@@ -196,6 +206,25 @@
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

/* Description : Store vectors of 16 byte elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 16 byte elements from 'in0' to (pdst)
 *               Store 16 byte elements from 'in1' to (pdst + stride)
 */
#define ST_B2(RTYPE, in0, in1, pdst, stride) {  \
  ST_B(RTYPE, in0, pdst);                       \
  ST_B(RTYPE, in1, pdst + stride);              \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
  ST_B2(RTYPE, in0, in1, pdst, stride);                   \
  ST_B2(RTYPE, in2, in3, pdst + 2 * stride, stride);      \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

/* Description : Store 4x4 byte block to destination memory from input vector
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : 'Idx0' word element from input vector 'in0' is copied to the
@@ -216,6 +245,12 @@
  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);  \
}

#define ST4x8_UB(in0, in1, pdst, stride) {                        \
  uint8_t* const pblk_4x8 = (uint8_t*)pdst;                       \
  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
}
/* Description : Immediate number of elements to slide
 * Arguments   : Inputs  - in0, in1, slide_val
 *               Outputs - out
@@ -299,6 +334,121 @@
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

/* Description : Interleave even byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);     \
  out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);     \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

/* Description : Interleave odd byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvod_b((v16i8)in1, (v16i8)in0);     \
  out1 = (RTYPE)__msa_ilvod_b((v16i8)in3, (v16i8)in2);     \
}
#define ILVOD_B2_UB(...) ILVOD_B2(v16u8, __VA_ARGS__)
#define ILVOD_B2_SB(...) ILVOD_B2(v16i8, __VA_ARGS__)
#define ILVOD_B2_UH(...) ILVOD_B2(v8u16, __VA_ARGS__)
#define ILVOD_B2_SH(...) ILVOD_B2(v8i16, __VA_ARGS__)
#define ILVOD_B2_SD(...) ILVOD_B2(v2i64, __VA_ARGS__)
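As a reference for what these interleaves compute, here is a scalar model of ILVEV_B2's effect on one output (an illustrative sketch; the ILVOD variant is identical with 2 * i + 1 source indices):

#include <stdint.h>

// out[2i] takes the even bytes of in0, out[2i+1] the even bytes of in1,
// matching the operand order used by the ILVEV_B2 macro above.
static void IlvevB(const uint8_t in0[16], const uint8_t in1[16],
                   uint8_t out[16]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[2 * i + 0] = in0[2 * i];
    out[2 * i + 1] = in1[2 * i];
  }
}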
/* Description : Interleave even halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);     \
  out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);     \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_UH(...) ILVEV_H2(v8u16, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave odd halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvod_h((v8i16)in1, (v8i16)in0);     \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2);     \
}
#define ILVOD_H2_UB(...) ILVOD_H2(v16u8, __VA_ARGS__)
#define ILVOD_H2_UH(...) ILVOD_H2(v8u16, __VA_ARGS__)
#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even-odd word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Odd word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 */
#define ILVEVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);       \
  out1 = (RTYPE)__msa_ilvod_w((v4i32)in3, (v4i32)in2);       \
}
#define ILVEVOD_W2_UB(...) ILVEVOD_W2(v16u8, __VA_ARGS__)
#define ILVEVOD_W2_UH(...) ILVEVOD_W2(v8u16, __VA_ARGS__)
#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even double word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 */
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);     \
  out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);     \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)

/* Description : Interleave left half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 */
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);     \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3);     \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* Description : Interleave right half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
@@ -366,6 +516,23 @@
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* Description : Interleave both left and right half of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements from 'in0' and 'in1' are
 *               interleaved and written to 'out0'
 */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) {        \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1) {        \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
@@ -514,6 +681,36 @@
  out3 = in0 - in3;  \
}

/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Return Type - unsigned byte
 */
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5,            \
                            out6, out7) {                                  \
  v8i16 tmp0_m, tmp1_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m;                    \
  v4i32 tmp2_m, tmp3_m;                                                    \
  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
  ILVEV_B2_SH(out7, out6, out5, out4, tmp0_m, tmp1_m);                     \
  ILVOD_B2_SH(out7, out6, out5, out4, tmp4_m, tmp5_m);                     \
  ILVEV_B2_UB(out3, out2, out1, out0, out5, out7);                         \
  ILVOD_B2_SH(out3, out2, out1, out0, tmp6_m, tmp7_m);                     \
  ILVEV_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out0, out4);               \
  ILVOD_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out2, out6);               \
  ILVEV_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out1, out5);               \
  ILVOD_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out3, out7);               \
}
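A scalar model of what TRANSPOSE16x8_UB_UB computes: the low 8 bytes of the 16 input rows become 8 output rows of 16 bytes each (a sketch of the semantics, not of the lane shuffling above):

#include <stdint.h>

// out[j][i] = in[i][j]: column j of the 16 input rows becomes output row j.
static void Transpose16x8Ref(const uint8_t in[16][16], uint8_t out[8][16]) {
  int i, j;
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 8; ++j) {
      out[j][i] = in[i][j];
    }
  }
}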
/* Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3