MIPS: dspr2: added optimization for Disto4x4

enc/dec common macros moved to mips_macro.h

Change-Id: I38d491e772554ac663dd5eb4d15485c0343f23b1
This commit is contained in:
Djordje Pesut
2014-11-05 12:06:15 +01:00
parent 4171b6724e
commit 95ca44a718
3 changed files with 170 additions and 93 deletions

View File

@ -23,53 +23,11 @@ static const int kC2 = 35468;
#define MUL(a, b) (((a) * (b)) >> 16)
// temp0[31..16 | 15..0] = temp0[31..16 | 15..0] + temp8[31..16 | 15..0]
// temp0[31..16 | 15..0] = temp0[31..16 <<(s) 7 | 15..0 <<(s) 7]
// temp1..temp7 same as temp0
// precrqu_s.qb.ph temp0, temp1, temp0:
// temp0 = temp1[31..24] | temp1[15..8] | temp0[31..24] | temp0[15..8]
// store temp0 to dst
// IO - input/output
// I - input (macro doesn't change it)
#define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7, \
I0, I1, I2, I3, I4, I5, I6, I7) \
"addq.ph %["#IO0"], %["#IO0"], %["#I0"] \n\t" \
"addq.ph %["#IO1"], %["#IO1"], %["#I1"] \n\t" \
"addq.ph %["#IO2"], %["#IO2"], %["#I2"] \n\t" \
"addq.ph %["#IO3"], %["#IO3"], %["#I3"] \n\t" \
"addq.ph %["#IO4"], %["#IO4"], %["#I4"] \n\t" \
"addq.ph %["#IO5"], %["#IO5"], %["#I5"] \n\t" \
"addq.ph %["#IO6"], %["#IO6"], %["#I6"] \n\t" \
"addq.ph %["#IO7"], %["#IO7"], %["#I7"] \n\t" \
"shll_s.ph %["#IO0"], %["#IO0"], 7 \n\t" \
"shll_s.ph %["#IO1"], %["#IO1"], 7 \n\t" \
"shll_s.ph %["#IO2"], %["#IO2"], 7 \n\t" \
"shll_s.ph %["#IO3"], %["#IO3"], 7 \n\t" \
"shll_s.ph %["#IO4"], %["#IO4"], 7 \n\t" \
"shll_s.ph %["#IO5"], %["#IO5"], 7 \n\t" \
"shll_s.ph %["#IO6"], %["#IO6"], 7 \n\t" \
"shll_s.ph %["#IO7"], %["#IO7"], 7 \n\t" \
"precrqu_s.qb.ph %["#IO0"], %["#IO1"], %["#IO0"] \n\t" \
"precrqu_s.qb.ph %["#IO2"], %["#IO3"], %["#IO2"] \n\t" \
"precrqu_s.qb.ph %["#IO4"], %["#IO5"], %["#IO4"] \n\t" \
"precrqu_s.qb.ph %["#IO6"], %["#IO7"], %["#IO6"] \n\t" \
"usw %["#IO0"], 0(%[dst]) \n\t" \
"usw %["#IO2"], 32(%[dst]) \n\t" \
"usw %["#IO4"], 64(%[dst]) \n\t" \
"usw %["#IO6"], 96(%[dst]) \n\t"
// O - output
#define LOAD_DST(O0, O1, O2, O3) \
"ulw %["#O0"], 0(%[dst]) \n\t" \
"ulw %["#O1"], 32(%[dst]) \n\t" \
"ulw %["#O2"], 64(%[dst]) \n\t" \
"ulw %["#O3"], 96(%[dst]) \n\t"
static void TransformDC(const int16_t* in, uint8_t* dst) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
__asm__ volatile (
LOAD_DST(temp1, temp2, temp3, temp4)
LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst, 0, 32, 64, 96)
"lh %[temp5], 0(%[in]) \n\t"
"addiu %[temp5], %[temp5], 4 \n\t"
"ins %[temp5], %[temp5], 16, 16 \n\t"
@ -77,7 +35,8 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
temp3, temp1, temp2, temp3, temp4)
STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5)
temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
dst, 0, 32, 64, 96)
OUTPUT_EARLY_CLOBBER_REGS_10()
: [in]"r"(in), [dst]"r"(dst)
@ -102,14 +61,14 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
"replv.ph %[temp5], %[c1] \n\t"
SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
LOAD_DST(temp3, temp5, temp11, temp12)
LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst, 0, 32, 64, 96)
CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
temp11, temp17, temp3, temp5, temp11, temp12)
PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
temp4, temp7, temp6, temp10, temp9)
STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
temp17, temp12, temp18, temp1, temp8, temp2, temp4,
temp7, temp6)
temp7, temp6, dst, 0, 32, 64, 96)
OUTPUT_EARLY_CLOBBER_REGS_18(),
[c4]"+&r"(c4)
@ -169,11 +128,12 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
temp6)
PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
temp16, temp11, temp10, temp15, temp14)
LOAD_DST(temp10, temp11, temp14, temp15)
LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst, 0, 32, 64, 96)
CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
temp11, temp10, temp11, temp14, temp15)
STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4)
temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
dst, 0, 32, 64, 96)
OUTPUT_EARLY_CLOBBER_REGS_18()
: [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
@ -510,8 +470,6 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#undef LOAD_DST
#undef STORE_SAT_SUM_X2
#undef MUL
#endif // WEBP_USE_MIPS_DSP_R2