mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-26 05:38:22 +01:00
add NEON asm version for WHT inverse transform
Contributed by Wayne Chen (datoudatou at gmail dot com) + some header cleanup + remove the NEON suffix in static functions Change-Id: I75bf5e9b54cf5e1acc53764c6f081d61690f8e3d
This commit is contained in:
parent
a61a824b3a
commit
e8b41ad136
@ -155,6 +155,9 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Inverse transforms (Paragraph 14.4)
|
||||
|
||||
static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
|
||||
const int kBPS = BPS;
|
||||
const int16_t constants[] = {20091, 17734, 0, 0};
|
||||
@ -311,6 +314,73 @@ static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
}
|
||||
}
|
||||
|
||||
// Inverse Walsh-Hadamard transform, NEON version.
// Reads 16 int16_t coefficients from 'in' (one 32-byte load) and writes 16
// int16_t results through 'out', one every kStep bytes (i.e. every 16th
// int16_t: out[0], out[16], ..., out[240]).
// Pass 1 applies the butterflies across the four input rows in 32-bit
// precision; after a 4x4 transpose, pass 2 applies them again, adds the
// rounding constant 3 and narrows the result with a right shift by 3.
static void TransformWHT(const int16_t* in, int16_t* out) {
|
||||
const int kStep = 32; // Byte stride between stores (16 int16_t). With a register
|
||||
// post-index, vst1 advances [out] by kStep, not by the stored size.
|
||||
__asm__ volatile (
|
||||
// part 1
|
||||
// load data into q0, q1
|
||||
"vld1.16 {q0, q1}, [%[in]] \n"
|
||||
|
||||
"vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]
|
||||
"vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]
|
||||
"vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8]
|
||||
"vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12]
|
||||
|
||||
"vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1
|
||||
"vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1
|
||||
"vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2
|
||||
"vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2
|
||||
|
||||
// Transpose
|
||||
// Result: q0 = tmp[0, 4, 8, 12], q1 = tmp[1, 5, 9, 13]
|
||||
// q2 = tmp[2, 6, 10, 14], q3 = tmp[3, 7, 11, 15]
|
||||
"vswp d1, d4 \n" // vtrn.64 q0, q2
|
||||
"vswp d3, d6 \n" // vtrn.64 q1, q3
|
||||
"vtrn.32 q0, q1 \n"
|
||||
"vtrn.32 q2, q3 \n"
|
||||
|
||||
// part 2: second butterfly pass on the transposed data, with rounding
|
||||
"vmov.s32 q4, #3 \n" // dc = 3
|
||||
"vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3
|
||||
"vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3]
|
||||
"vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2]
|
||||
"vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]
|
||||
"vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]
|
||||
|
||||
// narrow each 32-bit result back to 16 bits while shifting right by 3
|
||||
"vadd.s32 q0, q6, q7 \n"
|
||||
"vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3
|
||||
"vadd.s32 q1, q9, q8 \n"
|
||||
"vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3
|
||||
"vsub.s32 q2, q6, q7 \n"
|
||||
"vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3
|
||||
"vsub.s32 q3, q9, q8 \n"
|
||||
"vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3
|
||||
|
||||
// set the results to output
|
||||
// Lane k of d0..d3 is written in the order d0, d1, d2, d3 before moving
|
||||
// on to lane k + 1; every store advances [out] by kStep bytes.
|
||||
"vst1.16 d0[0], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d1[0], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d2[0], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d3[0], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d0[1], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d1[1], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d2[1], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d3[1], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d0[2], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d1[2], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d2[2], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d3[2], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d0[3], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d1[3], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d2[3], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d3[3], [%[out]], %[kStep] \n"
|
||||
|
||||
: [out] "+r"(out) // modified registers
|
||||
: [in] "r"(in), [kStep] "r"(kStep) // constants
|
||||
: "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q5", "q6", "q7", "q8", "q9" // clobbered
|
||||
);
|
||||
}
|
||||
|
||||
#endif // WEBP_USE_NEON
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@ -321,6 +391,7 @@ extern void VP8DspInitNEON(void);
|
||||
void VP8DspInitNEON(void) {
|
||||
#if defined(WEBP_USE_NEON)
|
||||
VP8Transform = TransformTwoNEON;
|
||||
VP8TransformWHT = TransformWHT;
|
||||
|
||||
VP8SimpleVFilter16 = SimpleVFilter16NEON;
|
||||
VP8SimpleHFilter16 = SimpleHFilter16NEON;
|
||||
|
@ -103,7 +103,7 @@ extern VP8DecIdct2 VP8Transform;
|
||||
extern VP8DecIdct VP8TransformUV;
|
||||
extern VP8DecIdct VP8TransformDC;
|
||||
extern VP8DecIdct VP8TransformDCUV;
|
||||
extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
|
||||
extern VP8WHT VP8TransformWHT;
|
||||
|
||||
// *dst is the destination block, with stride BPS. Boundary samples are
|
||||
// assumed accessible when needed.
|
||||
|
@ -25,7 +25,7 @@ extern "C" {
|
||||
// Inverse transform.
|
||||
// This code is pretty much the same as TransformOneNEON in the decoder, except
|
||||
// for subtraction to *ref. See the comments there for algorithmic explanations.
|
||||
static void ITransformOneNEON(const uint8_t* ref,
|
||||
static void ITransformOne(const uint8_t* ref,
|
||||
const int16_t* in, uint8_t* dst) {
|
||||
const int kBPS = BPS;
|
||||
const int16_t kC1C2[] = { 20091, 17734, 0, 0 }; // kC1 / (kC2 >> 1) / 0 / 0
|
||||
@ -139,14 +139,82 @@ static void ITransformOneNEON(const uint8_t* ref,
|
||||
);
|
||||
}
|
||||
|
||||
static void ITransformNEON(const uint8_t* ref,
|
||||
static void ITransform(const uint8_t* ref,
|
||||
const int16_t* in, uint8_t* dst, int do_two) {
|
||||
ITransformOneNEON(ref, in, dst);
|
||||
ITransformOne(ref, in, dst);
|
||||
if (do_two) {
|
||||
ITransformOneNEON(ref + 4, in + 16, dst + 4);
|
||||
ITransformOne(ref + 4, in + 16, dst + 4);
|
||||
}
|
||||
}
|
||||
|
||||
// Same code as dec_neon.c
|
||||
// Inverse Walsh-Hadamard transform, NEON version.
// Reads 16 int16_t coefficients from 'in' (one 32-byte load) and writes 16
// int16_t results through 'out', one every kStep bytes (i.e. every 16th
// int16_t: out[0], out[16], ..., out[240]).
// Pass 1 applies the butterflies across the four input rows in 32-bit
// precision; after a 4x4 transpose, pass 2 applies them again, adds the
// rounding constant 3 and narrows the result with a right shift by 3.
static void ITransformWHT(const int16_t* in, int16_t* out) {
|
||||
const int kStep = 32; // Byte stride between stores (16 int16_t). With a register
|
||||
// post-index, vst1 advances [out] by kStep, not by the stored size.
|
||||
__asm__ volatile (
|
||||
// part 1
|
||||
// load data into q0, q1
|
||||
"vld1.16 {q0, q1}, [%[in]] \n"
|
||||
|
||||
"vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]
|
||||
"vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]
|
||||
"vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8]
|
||||
"vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12]
|
||||
|
||||
"vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1
|
||||
"vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1
|
||||
"vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2
|
||||
"vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2
|
||||
|
||||
// Transpose
|
||||
// Result: q0 = tmp[0, 4, 8, 12], q1 = tmp[1, 5, 9, 13]
|
||||
// q2 = tmp[2, 6, 10, 14], q3 = tmp[3, 7, 11, 15]
|
||||
"vswp d1, d4 \n" // vtrn.64 q0, q2
|
||||
"vswp d3, d6 \n" // vtrn.64 q1, q3
|
||||
"vtrn.32 q0, q1 \n"
|
||||
"vtrn.32 q2, q3 \n"
|
||||
|
||||
// part 2: second butterfly pass on the transposed data, with rounding
|
||||
"vmov.s32 q4, #3 \n" // dc = 3
|
||||
"vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3
|
||||
"vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3]
|
||||
"vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2]
|
||||
"vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]
|
||||
"vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]
|
||||
|
||||
// narrow each 32-bit result back to 16 bits while shifting right by 3
|
||||
"vadd.s32 q0, q6, q7 \n"
|
||||
"vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3
|
||||
"vadd.s32 q1, q9, q8 \n"
|
||||
"vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3
|
||||
"vsub.s32 q2, q6, q7 \n"
|
||||
"vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3
|
||||
"vsub.s32 q3, q9, q8 \n"
|
||||
"vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3
|
||||
|
||||
// set the results to output
|
||||
// Lane k of d0..d3 is written in the order d0, d1, d2, d3 before moving
|
||||
// on to lane k + 1; every store advances [out] by kStep bytes.
|
||||
"vst1.16 d0[0], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d1[0], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d2[0], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d3[0], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d0[1], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d1[1], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d2[1], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d3[1], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d0[2], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d1[2], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d2[2], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d3[2], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d0[3], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d1[3], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d2[3], [%[out]], %[kStep] \n"
|
||||
"vst1.16 d3[3], [%[out]], %[kStep] \n"
|
||||
|
||||
: [out] "+r"(out) // modified registers
|
||||
: [in] "r"(in), [kStep] "r"(kStep) // constants
|
||||
: "memory", "q0", "q1", "q2", "q3", "q4",
|
||||
"q5", "q6", "q7", "q8", "q9" // clobbered
|
||||
);
|
||||
}
|
||||
|
||||
// Forward transform.
|
||||
|
||||
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
|
||||
@ -160,7 +228,7 @@ static const int32_t kCoeff32[] = {
|
||||
51000, 51000, 51000, 51000
|
||||
};
|
||||
|
||||
static void FTransformNEON(const uint8_t* src, const uint8_t* ref,
|
||||
static void FTransform(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
const int kBPS = BPS;
|
||||
const uint8_t* src_ptr = src;
|
||||
@ -282,8 +350,10 @@ extern void VP8EncDspInitNEON(void);
|
||||
|
||||
void VP8EncDspInitNEON(void) {
|
||||
#if defined(WEBP_USE_NEON)
|
||||
VP8ITransform = ITransformNEON;
|
||||
VP8FTransform = FTransformNEON;
|
||||
VP8ITransform = ITransform;
|
||||
VP8FTransform = FTransform;
|
||||
|
||||
VP8ITransformWHT = ITransformWHT;
|
||||
#endif // WEBP_USE_NEON
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user