Merge "Added MSA optimized edge filtering functions"

This commit is contained in:
Parag Salasakar 2016-06-22 10:45:14 +00:00 committed by Gerrit Code Review
commit 9ad2352d0d
2 changed files with 484 additions and 0 deletions

View File

@ -153,6 +153,288 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}
//------------------------------------------------------------------------------
// Edge filtering functions
// XORs every byte of the two input vectors with 0x80 (toggling the top bit of
// each lane) and stores the results, typed as v16i8, in the two outputs.
#define FLIP_SIGN2(src0, src1, dst0, dst1) {          \
  dst0 = (v16i8)__msa_xori_b(src0, 0x80);             \
  dst1 = (v16i8)__msa_xori_b(src1, 0x80);             \
}
// Four-vector form of FLIP_SIGN2: processes (src0, src1) and then
// (src2, src3), writing dst0..dst3.
#define FLIP_SIGN4(src0, src1, src2, src3,            \
                   dst0, dst1, dst2, dst3) {          \
  FLIP_SIGN2(src0, src1, dst0, dst1);                 \
  FLIP_SIGN2(src2, src3, dst2, dst3);                 \
}
// Adds the saturated signed difference (q0s - p0s) to 'acc' three times, each
// time with a saturating signed byte add, then ANDs the result with 'sel'.
// 'acc' is read and written in place.
#define FILT_VAL(q0s, p0s, sel, acc) do {                   \
  const v16i8 delta_m = __msa_subs_s_b(q0s, p0s);           \
  acc = __msa_adds_s_b(acc, delta_m);                       \
  acc = __msa_adds_s_b(acc, delta_m);                       \
  acc = __msa_adds_s_b(acc, delta_m);                       \
  acc = acc & sel;                                          \
} while (0)
// Helper for LPF_MBFILTER. It is not self-contained: it reads 'temp1'/'temp3'
// and writes 'u_r', 'u_l' and 'u', all of which must exist in the scope where
// the macro is expanded.
// Each halfword of temp1 (right half) / temp3 (left half) is shifted right
// arithmetically by 7 and saturated to the signed 8-bit range; the even bytes
// are packed into 'u'. 'u' is then subtracted from qs and added to ps (both
// saturating, both updated in place) and the 0x80-XORed results are written
// to q_out / p_out.
#define FILT2(qs, ps, q_out, p_out) do {                    \
  u_l = __msa_sat_s_h(SRAI_H(temp3, 7), 7);                 \
  u_r = __msa_sat_s_h(SRAI_H(temp1, 7), 7);                 \
  u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                \
  ps = __msa_adds_s_b(ps, u);                               \
  qs = __msa_subs_s_b(qs, u);                               \
  p_out = __msa_xori_b((v16u8)ps, 0x80);                    \
  q_out = __msa_xori_b((v16u8)qs, 0x80);                    \
} while (0)
// Four-pixel loop filter on p1, p0, q0, q1 (all updated in place).
//   f  = ((p1 - q1) & hev) + 3 * (q0 - p0), masked by 'mask'
//   a1 = (f + 4) >> 3,  a2 = (f + 3) >> 3
//   q0 -= a1, p0 += a2
//   q1 -= a3, p1 += a3  with a3 = rounded (a1 >> 1), only where hev is clear
// All arithmetic is saturating signed byte arithmetic on the 0x80-XORed
// pixels. Side effect: 'hev' is left bitwise-inverted in the caller.
#define LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) do {            \
  const v16i8 four_m = __msa_ldi_b(4);                            \
  const v16i8 three_m = __msa_ldi_b(3);                           \
  v16i8 sp1_m, sp0_m, sq0_m, sq1_m;                               \
  v16i8 flt_m, a1_m, a2_m;                                        \
                                                                  \
  FLIP_SIGN4(p1, p0, q0, q1, sp1_m, sp0_m, sq0_m, sq1_m);         \
  /* (p1 - q1) only contributes in lanes where hev is set */      \
  flt_m = __msa_subs_s_b(sp1_m, sq1_m);                           \
  flt_m = flt_m & hev;                                            \
  FILT_VAL(sq0_m, sp0_m, mask, flt_m);                            \
  a1_m = SRAI_B(__msa_adds_s_b(flt_m, four_m), 3);                \
  a2_m = SRAI_B(__msa_adds_s_b(flt_m, three_m), 3);               \
  /* inner taps */                                                \
  sq0_m = __msa_subs_s_b(sq0_m, a1_m);                            \
  sp0_m = __msa_adds_s_b(sp0_m, a2_m);                            \
  q0 = __msa_xori_b((v16u8)sq0_m, 0x80);                          \
  p0 = __msa_xori_b((v16u8)sp0_m, 0x80);                          \
  /* outer taps: rounded half of a1, kept where hev is clear */   \
  hev = __msa_xori_b(hev, 0xff);                                  \
  flt_m = __msa_srari_b(a1_m, 1);                                 \
  flt_m = flt_m & hev;                                            \
  sq1_m = __msa_subs_s_b(sq1_m, flt_m);                           \
  sp1_m = __msa_adds_s_b(sp1_m, flt_m);                           \
  q1 = __msa_xori_b((v16u8)sq1_m, 0x80);                          \
  p1 = __msa_xori_b((v16u8)sp1_m, 0x80);                          \
} while (0)
// Macroblock-edge loop filter on p2, p1, p0, q0, q1, q2 (all updated in
// place).
//   filt = ((p1 - q1) + 3 * (q0 - p0)) & mask      (saturating signed bytes)
// Lanes where 'hev' is set use the two-pixel update:
//   q0 -= (filt + 4) >> 3,  p0 += (filt + 3) >> 3
// Lanes where 'hev' is clear use the six-pixel update, with w = filt:
//   q2/p2 -/+= (9 * w + 63) >> 7
//   q1/p1 -/+= (18 * w + 63) >> 7
//   q0/p0 -/+= (27 * w + 63) >> 7
// Side effect: 'hev' is left bitwise-inverted in the caller.
// The locals u, u_r, u_l, temp1 and temp3 are referenced by name from FILT2
// and must keep these names.
#define LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) do {  \
  v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                   \
  v16i8 u, filt, t1, t2, filt_sign;                           \
  v8i16 filt_r, filt_l, u_r, u_l;                             \
  v8i16 temp0, temp1, temp2, temp3;                           \
  const v16i8 cnst4b = __msa_ldi_b(4);                        \
  const v16i8 cnst3b = __msa_ldi_b(3);                        \
  const v8i16 cnst9h = __msa_ldi_h(9);                        \
  /* 63 does not fit the 5-bit immediate of ADDVI.H, so the */\
  /* rounding term is added as a splatted vector instead.   */\
  const v8i16 cnst63h = __msa_ldi_h(63);                      \
                                                              \
  FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);         \
  filt = __msa_subs_s_b(p1_m, q1_m);                          \
  FILT_VAL(q0_m, p0_m, mask, filt);                           \
  FLIP_SIGN2(p2, q2, p2_m, q2_m);                             \
  /* t2: filter value restricted to the hev lanes */          \
  t2 = filt & hev;                                            \
  /* filt_val &= ~hev */                                      \
  hev = __msa_xori_b(hev, 0xff);                              \
  filt = filt & hev;                                          \
  t1 = __msa_adds_s_b(t2, cnst4b);                            \
  t1 = SRAI_B(t1, 3);                                         \
  t2 = __msa_adds_s_b(t2, cnst3b);                            \
  t2 = SRAI_B(t2, 3);                                         \
  q0_m = __msa_subs_s_b(q0_m, t1);                            \
  p0_m = __msa_adds_s_b(p0_m, t2);                            \
  /* sign-extend the non-hev filter value to 16 bits */       \
  filt_sign = __msa_clti_s_b(filt, 0);                        \
  ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);               \
  /* update q2/p2 */                                          \
  temp0 = filt_r * cnst9h;                                    \
  temp1 = temp0 + cnst63h;                                    \
  temp2 = filt_l * cnst9h;                                    \
  temp3 = temp2 + cnst63h;                                    \
  FILT2(q2_m, p2_m, q2, p2);                                  \
  /* update q1/p1 */                                          \
  temp1 = temp1 + temp0;                                      \
  temp3 = temp3 + temp2;                                      \
  FILT2(q1_m, p1_m, q1, p1);                                  \
  /* update q0/p0 */                                          \
  temp1 = temp1 + temp0;                                      \
  temp3 = temp3 + temp2;                                      \
  FILT2(q0_m, p0_m, q0, p0);                                  \
} while (0)
// Computes the two per-lane selectors used by the loop filters.
//   hev_out : all-ones where max(|p1-p0|, |q1-q0|) > thresh_in
//   mask_out: all-ones where the edge is to be filtered, i.e. where
//               2 * |p0-q0| + |p1-q1| / 2 <= b_limit_in   and
//               max(|p3-p2|, |p2-p1|, |p1-p0|,
//                   |q1-q0|, |q2-q1|, |q3-q2|) <= limit_in
// All inputs are v16u8 vectors; the sums are saturating unsigned bytes.
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                 \
                     q0_in, q1_in, q2_in, q3_in,                 \
                     limit_in, b_limit_in, thresh_in,            \
                     hev_out, mask_out) do {                     \
  v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
  v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
  v16u8 flat_out;                                                \
                                                                 \
  /* absolute subtraction of pixel values */                     \
  p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
  p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
  p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
  q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
  q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
  q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
  p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
  p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
  /* calculation of hev */                                       \
  flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
  hev_out = (thresh_in < flat_out);                              \
  /* calculation of mask */                                      \
  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
  /* |p1-q1| is unsigned: halve it with a logical shift so */    \
  /* that differences >= 128 are not sign-extended.        */    \
  p1_asub_q1_m = (v16u8)__msa_srli_b((v16i8)p1_asub_q1_m, 1);    \
  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
  /* 0xff in lanes that exceed b_limit, so they fail below */    \
  mask_out = (b_limit_in < p0_asub_q0_m);                        \
  mask_out = __msa_max_u_b(flat_out, mask_out);                  \
  p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
  mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
  q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
  mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
  mask_out = (limit_in < mask_out);                              \
  /* invert: set lanes are the ones within all limits */         \
  mask_out = __msa_xori_b(mask_out, 0xff);                       \
} while (0)
// Extracts 32-bit element 'w_idx' of 'w_vec' and 16-bit element 'h_idx' of
// 'h_vec', then stores the word at 'dst' (via SW) and the halfword at
// 'dst + offset' (via SH).
#define ST6x1_UB(w_vec, w_idx, h_vec, h_idx, dst, offset) {           \
  const uint32_t word_m = __msa_copy_s_w((v4i32)w_vec, w_idx);        \
  const uint16_t half_m = __msa_copy_s_h((v8i16)h_vec, h_idx);        \
  SW(word_m, dst);                                                    \
  SH(half_m, dst + offset);                                           \
}
static void VFilter16(uint8_t *src, int stride,
int b_limit_in, int limit_in, int thresh_in) {
uint8_t *ptemp = src - 4 * stride;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 mask, hev;
const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
LD_UB8(ptemp, stride, p3, p2, p1, p0, q0, q1, q2, q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
hev, mask);
LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
ptemp = src - 3 * stride;
ST_UB4(p2, p1, p0, q0, ptemp, stride);
ptemp += (4 * stride);
ST_UB2(q1, q2, ptemp, stride);
}
static void HFilter16(uint8_t *src, int stride,
int b_limit_in, int limit_in, int thresh_in) {
uint8_t *ptmp = src - 4;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 mask, hev;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
v16u8 row9, row10, row11, row12, row13, row14, row15;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
LD_UB8(ptmp, stride, row0, row1, row2, row3, row4, row5, row6, row7);
ptmp += (8 * stride);
LD_UB8(ptmp, stride, row8, row9, row10, row11, row12, row13, row14, row15);
TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
row8, row9, row10, row11, row12, row13, row14, row15,
p3, p2, p1, p0, q0, q1, q2, q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
hev, mask);
LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
ILVRL_B2_SH(q2, q1, tmp2, tmp5);
ptmp = src - 3;
ST6x1_UB(tmp3, 0, tmp2, 0, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp3, 1, tmp2, 1, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp3, 2, tmp2, 2, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp3, 3, tmp2, 3, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp4, 0, tmp2, 4, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp4, 1, tmp2, 5, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp4, 2, tmp2, 6, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp4, 3, tmp2, 7, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp6, 0, tmp5, 0, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp6, 1, tmp5, 1, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp6, 2, tmp5, 2, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp6, 3, tmp5, 3, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp7, 0, tmp5, 4, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp7, 1, tmp5, 5, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp7, 2, tmp5, 6, ptmp, 4);
ptmp += stride;
ST6x1_UB(tmp7, 3, tmp5, 7, ptmp, 4);
}
// on three inner edges
static void VFilterHorEdge16i(uint8_t *src, int stride,
int b_limit, int limit, int thresh) {
v16u8 mask, hev;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
LD_UB8((src - 4 * stride), stride, p3, p2, p1, p0, q0, q1, q2, q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
hev, mask);
LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
ST_UB4(p1, p0, q0, q1, (src - 2 * stride), stride);
}
// Applies VFilterHorEdge16i at rows 4, 8 and 12 below 'src_y'.
static void VFilter16i(uint8_t *src_y, int stride,
                       int b_limit, int limit, int thresh) {
  int row;
  for (row = 4; row <= 12; row += 4) {
    VFilterHorEdge16i(src_y + row * stride, stride, b_limit, limit, thresh);
  }
}
static void HFilterVertEdge16i(uint8_t *src, int stride,
int b_limit, int limit, int thresh) {
v16u8 mask, hev;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
LD_UB8(src - 4, stride, row0, row1, row2, row3, row4, row5, row6, row7);
LD_UB8(src - 4 + (8 * stride), stride,
row8, row9, row10, row11, row12, row13, row14, row15);
TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
row8, row9, row10, row11, row12, row13, row14, row15,
p3, p2, p1, p0, q0, q1, q2, q3);
LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
hev, mask);
LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
src -= 2;
ST4x8_UB(tmp2, tmp3, src, stride);
src += (8 * stride);
ST4x8_UB(tmp4, tmp5, src, stride);
}
// Applies HFilterVertEdge16i at columns 4, 8 and 12 right of 'src_y'.
static void HFilter16i(uint8_t *src_y, int stride,
                       int b_limit, int limit, int thresh) {
  int col;
  for (col = 4; col <= 12; col += 4) {
    HFilterVertEdge16i(src_y + col, stride, b_limit, limit, thresh);
  }
}
//------------------------------------------------------------------------------
// Entry point
@ -163,6 +445,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
VP8Transform = TransformTwo;
VP8TransformDC = TransformDC;
VP8TransformAC3 = TransformAC3;
VP8VFilter16 = VFilter16;
VP8HFilter16 = HFilter16;
VP8VFilter16i = VFilter16i;
VP8HFilter16i = HFilter16i;
}
#else // !WEBP_USE_MSA

View File

@ -23,10 +23,12 @@
/* Element-wise add / arithmetic-shift-right helpers.
 * With CLANG_BUILD defined they map to the MSA immediate intrinsics (so 'b'
 * must be a constant accepted by the corresponding instruction); otherwise
 * they expand to the compiler's built-in '+' and '>>' operators.
 * Arguments are parenthesized so that compound expressions such as
 * ADDVI_H(x + y, 1) or SRAI_H(x | y, 2) bind as the caller intends.
 */
#ifdef CLANG_BUILD
  #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)(a), b)
  #define SRAI_B(a, b)  __msa_srai_b((v16i8)(a), b)
  #define SRAI_H(a, b)  __msa_srai_h((v8i16)(a), b)
  #define SRAI_W(a, b)  __msa_srai_w((v4i32)(a), b)
#else
  #define ADDVI_H(a, b)  ((a) + (b))
  #define SRAI_B(a, b)  ((a) >> (b))
  #define SRAI_H(a, b)  ((a) >> (b))
  #define SRAI_W(a, b)  ((a) >> (b))
#endif
@ -183,6 +185,14 @@
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
/* Description : Load eight vectors of byte elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Return Type - as per RTYPE
 * Details     : Two LD_B4 loads: the first starts at (psrc) and fills
 *               out0..out3, the second starts at (psrc + 4 * stride) and
 *               fills out4..out7.
 */
#define LD_B8(RTYPE, psrc, stride,                                        \
              out0, out1, out2, out3, out4, out5, out6, out7) {           \
  LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                 \
  LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
/* Description : Load vectors with 8 halfword elements with stride
* Arguments : Inputs - psrc, stride
* Outputs - out0, out1
@ -196,6 +206,25 @@
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
/* Description : Store vectors of 16 byte elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 16 byte elements from 'in0' to (pdst)
 *               Store 16 byte elements from 'in1' to (pdst + stride)
 *               'pdst' and 'stride' may be arbitrary expressions.
 */
#define ST_B2(RTYPE, in0, in1, pdst, stride) {  \
  ST_B(RTYPE, in0, (pdst));                     \
  ST_B(RTYPE, in1, (pdst) + (stride));          \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
/* Description : Store four vectors of 16 byte elements with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : 'in0' and 'in1' are stored with ST_B2 starting at (pdst),
 *               'in2' and 'in3' with ST_B2 starting at (pdst + 2 * stride).
 */
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) {          \
  ST_B2(RTYPE, in0, in1, (pdst), (stride));                       \
  ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride));        \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
/* Description : Store 4x4 byte block to destination memory from input vector
* Arguments : Inputs - in0, in1, pdst, stride
* Details : 'Idx0' word element from input vector 'in0' is copied to the
@ -216,6 +245,12 @@
SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
}
/* Description : Store a 4x8 byte block to destination memory
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Word elements 0..3 of 'in0' are stored with ST4x4_UB
 *               starting at (pdst); word elements 0..3 of 'in1' are stored
 *               with ST4x4_UB starting at (pdst + 4 * stride).
 */
#define ST4x8_UB(in0, in1, pdst, stride) {                                 \
  uint8_t* const pblk_4x8 = (uint8_t*)(pdst);                              \
  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride));                      \
  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride));       \
}
/* Description : Immediate number of elements to slide
* Arguments : Inputs - in0, in1, slide_val
* Outputs - out
@ -299,6 +334,121 @@
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
/* Description : Interleave even byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Even byte elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 */
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {        \
  out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0));       \
  out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2));       \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
/* Description : Interleave odd byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Odd byte elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 */
#define ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) {        \
  out0 = (RTYPE)__msa_ilvod_b((v16i8)(in1), (v16i8)(in0));       \
  out1 = (RTYPE)__msa_ilvod_b((v16i8)(in3), (v16i8)(in2));       \
}
#define ILVOD_B2_UB(...) ILVOD_B2(v16u8, __VA_ARGS__)
#define ILVOD_B2_SB(...) ILVOD_B2(v16i8, __VA_ARGS__)
#define ILVOD_B2_UH(...) ILVOD_B2(v8u16, __VA_ARGS__)
#define ILVOD_B2_SH(...) ILVOD_B2(v8i16, __VA_ARGS__)
#define ILVOD_B2_SD(...) ILVOD_B2(v2i64, __VA_ARGS__)
/* Description : Interleave even halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Even halfword elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 */
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {        \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0));       \
  out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2));       \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_UH(...) ILVEV_H2(v8u16, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
/* Description : Interleave odd halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Odd halfword elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 */
#define ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) {        \
  out0 = (RTYPE)__msa_ilvod_h((v8i16)(in1), (v8i16)(in0));       \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)(in3), (v8i16)(in2));       \
}
#define ILVOD_H2_UB(...) ILVOD_H2(v16u8, __VA_ARGS__)
#define ILVOD_H2_UH(...) ILVOD_H2(v8u16, __VA_ARGS__)
#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)
/* Description : Interleave even-odd word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Odd word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 */
#define ILVEVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) {      \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0));       \
  out1 = (RTYPE)__msa_ilvod_w((v4i32)(in3), (v4i32)(in2));       \
}
#define ILVEVOD_W2_UB(...) ILVEVOD_W2(v16u8, __VA_ARGS__)
#define ILVEVOD_W2_UH(...) ILVEVOD_W2(v8u16, __VA_ARGS__)
#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)
/* Description : Interleave even double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even double word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Even double word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 */
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {        \
  out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0));       \
  out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2));       \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Left half of byte elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 */
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {         \
  out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));        \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3));        \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
/* Description : Interleave right half of byte elements from vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1
@ -366,6 +516,23 @@
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements from 'in0' and 'in1' are
 *               interleaved and written to 'out0'.
 *               Left half of byte elements from 'in0' and 'in1' are
 *               interleaved and written to 'out1'.
 */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) {                  \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));        \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));        \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \
out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
@ -514,6 +681,36 @@
out3 = in0 - in3; \
}
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
* in8, in9, in10, in11, in12, in13, in14, in15
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
* Return Type - unsigned byte
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15, \
out0, out1, out2, out3, out4, out5, \
out6, out7) { \
v8i16 tmp0_m, tmp1_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
v4i32 tmp2_m, tmp3_m; \
ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
ILVEV_B2_SH(out7, out6, out5, out4, tmp0_m, tmp1_m); \
ILVOD_B2_SH(out7, out6, out5, out4, tmp4_m, tmp5_m); \
ILVEV_B2_UB(out3, out2, out1, out0, out5, out7); \
ILVOD_B2_SH(out3, out2, out1, out0, tmp6_m, tmp7_m); \
ILVEV_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out0, out4); \
ILVOD_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out2, out6); \
ILVEV_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out1, out5); \
ILVOD_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out3, out7); \
}
/* Description : Transpose 4x4 block with word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3