mirror of
https://github.com/webmproject/libwebp.git
synced 2025-07-13 06:24:27 +02:00
Simplify forward-WHT + SSE2 version
No precision loss was observed. Speed is not really faster (0.5% at most), as forward-WHT isn't called often. Also: replaced an "int << 3" (undefined by the C spec when the int is negative) with an "int * 8" (supersedes https://gerrit.chromium.org/gerrit/#/c/48739/). Change-Id: I2d980ec2f20f4ff6be5636105ff4f1c70ffde401
This commit is contained in:
@ -363,19 +363,12 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
|
||||
"vld1.16 d2[3], [%[in]], %[kStep] \n"
|
||||
"vld1.16 d3[3], [%[in]], %[kStep] \n"
|
||||
|
||||
"vaddl.s16 q2, d0, d2 \n"
|
||||
"vshl.s32 q2, q2, #2 \n" // a0=(in[0*16]+in[2*16])<<2
|
||||
"vaddl.s16 q3, d1, d3 \n"
|
||||
"vshl.s32 q3, q3, #2 \n" // a1=(in[1*16]+in[3*16])<<2
|
||||
"vsubl.s16 q4, d1, d3 \n"
|
||||
"vshl.s32 q4, q4, #2 \n" // a2=(in[1*16]-in[3*16])<<2
|
||||
"vsubl.s16 q5, d0, d2 \n"
|
||||
"vshl.s32 q5, q5, #2 \n" // a3=(in[0*16]-in[2*16])<<2
|
||||
"vaddl.s16 q2, d0, d2 \n" // a0=(in[0*16]+in[2*16])
|
||||
"vaddl.s16 q3, d1, d3 \n" // a1=(in[1*16]+in[3*16])
|
||||
"vsubl.s16 q4, d1, d3 \n" // a2=(in[1*16]-in[3*16])
|
||||
"vsubl.s16 q5, d0, d2 \n" // a3=(in[0*16]-in[2*16])
|
||||
|
||||
"vceq.s32 q10, q2, #0 \n"
|
||||
"vmvn.s32 q10, q10 \n" // (a0 != 0)
|
||||
"vqadd.s32 q6, q2, q3 \n" // (a0 + a1)
|
||||
"vqsub.s32 q6, q6, q10 \n" // (a0 + a1) + (a0 != 0)
|
||||
"vqadd.s32 q6, q2, q3 \n" // a0 + a1
|
||||
"vqadd.s32 q7, q5, q4 \n" // a3 + a2
|
||||
"vqsub.s32 q8, q5, q4 \n" // a3 - a2
|
||||
"vqsub.s32 q9, q2, q3 \n" // a0 - a1
|
||||
@ -398,27 +391,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
|
||||
"vqsub.s32 q6, q3, q2 \n" // b2 = a3 - a2
|
||||
"vqsub.s32 q7, q0, q1 \n" // b3 = a0 - a1
|
||||
|
||||
"vmov.s32 q0, #3 \n" // q0 = 3
|
||||
|
||||
"vcgt.s32 q1, q4, #0 \n" // (b0>0)
|
||||
"vqsub.s32 q2, q4, q1 \n" // (b0+(b0>0))
|
||||
"vqadd.s32 q3, q2, q0 \n" // (b0+(b0>0)+3)
|
||||
"vshrn.s32 d18, q3, #3 \n" // (b0+(b0>0)+3) >> 3
|
||||
|
||||
"vcgt.s32 q1, q5, #0 \n" // (b1>0)
|
||||
"vqsub.s32 q2, q5, q1 \n" // (b1+(b1>0))
|
||||
"vqadd.s32 q3, q2, q0 \n" // (b1+(b1>0)+3)
|
||||
"vshrn.s32 d19, q3, #3 \n" // (b1+(b1>0)+3) >> 3
|
||||
|
||||
"vcgt.s32 q1, q6, #0 \n" // (b2>0)
|
||||
"vqsub.s32 q2, q6, q1 \n" // (b2+(b2>0))
|
||||
"vqadd.s32 q3, q2, q0 \n" // (b2+(b2>0)+3)
|
||||
"vshrn.s32 d20, q3, #3 \n" // (b2+(b2>0)+3) >> 3
|
||||
|
||||
"vcgt.s32 q1, q7, #0 \n" // (b3>0)
|
||||
"vqsub.s32 q2, q7, q1 \n" // (b3+(b3>0))
|
||||
"vqadd.s32 q3, q2, q0 \n" // (b3+(b3>0)+3)
|
||||
"vshrn.s32 d21, q3, #3 \n" // (b3+(b3>0)+3) >> 3
|
||||
"vshrn.s32 d18, q4, #1 \n" // b0 >> 1
|
||||
"vshrn.s32 d19, q5, #1 \n" // b1 >> 1
|
||||
"vshrn.s32 d20, q6, #1 \n" // b2 >> 1
|
||||
"vshrn.s32 d21, q7, #1 \n" // b3 >> 1
|
||||
|
||||
"vst1.16 {q9, q10}, [%[out]] \n"
|
||||
|
||||
|
Reference in New Issue
Block a user