diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c index 09cafeef..5d7cff15 100644 --- a/src/dsp/dec_neon.c +++ b/src/dsp/dec_neon.c @@ -155,6 +155,9 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { } } +//----------------------------------------------------------------------------- +// Inverse transforms (Paragraph 14.4) + static void TransformOneNEON(const int16_t *in, uint8_t *dst) { const int kBPS = BPS; const int16_t constants[] = {20091, 17734, 0, 0}; @@ -311,6 +314,73 @@ static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { } } +static void TransformWHT(const int16_t* in, int16_t* out) { + const int kStep = 32; // Bytes added to 'out' by each post-indexed store + // (16 int16_t). The element size is not added on top. + __asm__ volatile ( + // part 1 + // load data into q0, q1 + "vld1.16 {q0, q1}, [%[in]] \n" + + "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12] + "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8] + "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8] + "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12] + + "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1 + "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1 + "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2 + "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2 + + // Transpose. Afterwards: + // q0 = tmp[0, 4, 8, 12], q1 = tmp[1, 5, 9, 13] + // q2 = tmp[2, 6, 10, 14], q3 = tmp[3, 7, 11, 15] + "vswp d1, d4 \n" // vtrn.64 q0, q2 + "vswp d3, d6 \n" // vtrn.64 q1, q3 + "vtrn.32 q0, q1 \n" + "vtrn.32 q2, q3 \n" + + "vmov.s32 q4, #3 \n" // dc = 3 + "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3 + "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3] + "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2] + "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2] + "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3] + + "vadd.s32 q0, q6, q7 \n" + "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3 + "vadd.s32 q1, q9, q8 \n" + "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3 + "vsub.s32 q2, q6, q7 \n" + "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3 + "vsub.s32 q3, 
q9, q8 \n" + "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3 + + // set the results to output + "vst1.16 d0[0], [%[out]], %[kStep] \n" + "vst1.16 d1[0], [%[out]], %[kStep] \n" + "vst1.16 d2[0], [%[out]], %[kStep] \n" + "vst1.16 d3[0], [%[out]], %[kStep] \n" + "vst1.16 d0[1], [%[out]], %[kStep] \n" + "vst1.16 d1[1], [%[out]], %[kStep] \n" + "vst1.16 d2[1], [%[out]], %[kStep] \n" + "vst1.16 d3[1], [%[out]], %[kStep] \n" + "vst1.16 d0[2], [%[out]], %[kStep] \n" + "vst1.16 d1[2], [%[out]], %[kStep] \n" + "vst1.16 d2[2], [%[out]], %[kStep] \n" + "vst1.16 d3[2], [%[out]], %[kStep] \n" + "vst1.16 d0[3], [%[out]], %[kStep] \n" + "vst1.16 d1[3], [%[out]], %[kStep] \n" + "vst1.16 d2[3], [%[out]], %[kStep] \n" + "vst1.16 d3[3], [%[out]], %[kStep] \n" + + : [out] "+r"(out) // modified registers + : [in] "r"(in), [kStep] "r"(kStep) // constants + : "memory", "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9" // clobbered + ); +} + #endif // WEBP_USE_NEON //------------------------------------------------------------------------------ @@ -321,6 +391,7 @@ extern void VP8DspInitNEON(void); void VP8DspInitNEON(void) { #if defined(WEBP_USE_NEON) VP8Transform = TransformTwoNEON; + VP8TransformWHT = TransformWHT; VP8SimpleVFilter16 = SimpleVFilter16NEON; VP8SimpleHFilter16 = SimpleHFilter16NEON; diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index a797193e..ad48dffc 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -103,7 +103,7 @@ extern VP8DecIdct2 VP8Transform; extern VP8DecIdct VP8TransformUV; extern VP8DecIdct VP8TransformDC; extern VP8DecIdct VP8TransformDCUV; -extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out); +extern VP8WHT VP8TransformWHT; // *dst is the destination block, with stride BPS. Boundary samples are // assumed accessible when needed. diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 31bc3049..2e72a3fd 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -25,8 +25,8 @@ extern "C" { // Inverse transform. 
// This code is pretty much the same as TransformOneNEON in the decoder, except // for subtraction to *ref. See the comments there for algorithmic explanations. -static void ITransformOneNEON(const uint8_t* ref, - const int16_t* in, uint8_t* dst) { +static void ITransformOne(const uint8_t* ref, + const int16_t* in, uint8_t* dst) { const int kBPS = BPS; const int16_t kC1C2[] = { 20091, 17734, 0, 0 }; // kC1 / (kC2 >> 1) / 0 / 0 @@ -139,14 +139,82 @@ static void ITransformOneNEON(const uint8_t* ref, ); } -static void ITransformNEON(const uint8_t* ref, - const int16_t* in, uint8_t* dst, int do_two) { - ITransformOneNEON(ref, in, dst); +static void ITransform(const uint8_t* ref, + const int16_t* in, uint8_t* dst, int do_two) { + ITransformOne(ref, in, dst); if (do_two) { - ITransformOneNEON(ref + 4, in + 16, dst + 4); + ITransformOne(ref + 4, in + 16, dst + 4); } } +// Same code as TransformWHT() in dec_neon.c. +static void ITransformWHT(const int16_t* in, int16_t* out) { + const int kStep = 32; // Bytes added to 'out' by each post-indexed store + // (16 int16_t). The element size is not added on top. 
+ __asm__ volatile ( + // part 1 + // load data into q0, q1 + "vld1.16 {q0, q1}, [%[in]] \n" + + "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12] + "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8] + "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8] + "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12] + + "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1 + "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1 + "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2 + "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2 + + // Transpose. Afterwards: + // q0 = tmp[0, 4, 8, 12], q1 = tmp[1, 5, 9, 13] + // q2 = tmp[2, 6, 10, 14], q3 = tmp[3, 7, 11, 15] + "vswp d1, d4 \n" // vtrn.64 q0, q2 + "vswp d3, d6 \n" // vtrn.64 q1, q3 + "vtrn.32 q0, q1 \n" + "vtrn.32 q2, q3 \n" + + "vmov.s32 q4, #3 \n" // dc = 3 + "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3 + "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3] + "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2] + "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2] + "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3] + + "vadd.s32 q0, q6, q7 \n" + "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3 + "vadd.s32 q1, q9, q8 \n" + "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3 + "vsub.s32 q2, q6, q7 \n" + "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3 + "vsub.s32 q3, q9, q8 \n" + "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3 + + // set the results to output + "vst1.16 d0[0], [%[out]], %[kStep] \n" + "vst1.16 d1[0], [%[out]], %[kStep] \n" + "vst1.16 d2[0], [%[out]], %[kStep] \n" + "vst1.16 d3[0], [%[out]], %[kStep] \n" + "vst1.16 d0[1], [%[out]], %[kStep] \n" + "vst1.16 d1[1], [%[out]], %[kStep] \n" + "vst1.16 d2[1], [%[out]], %[kStep] \n" + "vst1.16 d3[1], [%[out]], %[kStep] \n" + "vst1.16 d0[2], [%[out]], %[kStep] \n" + "vst1.16 d1[2], [%[out]], %[kStep] \n" + "vst1.16 d2[2], [%[out]], %[kStep] \n" + "vst1.16 d3[2], [%[out]], %[kStep] \n" + "vst1.16 d0[3], [%[out]], %[kStep] \n" + "vst1.16 d1[3], [%[out]], %[kStep] \n" + "vst1.16 d2[3], [%[out]], %[kStep] \n" + "vst1.16 d3[3], [%[out]], %[kStep] \n" + + : [out] 
"+r"(out) // modified registers + : [in] "r"(in), [kStep] "r"(kStep) // constants + : "memory", "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9" // clobbered + ); +} + // Forward transform. // adapted from vp8/encoder/arm/neon/shortfdct_neon.asm @@ -160,8 +228,8 @@ static const int32_t kCoeff32[] = { 51000, 51000, 51000, 51000 }; -static void FTransformNEON(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform(const uint8_t* src, const uint8_t* ref, + int16_t* out) { const int kBPS = BPS; const uint8_t* src_ptr = src; const uint8_t* ref_ptr = ref; @@ -282,8 +350,10 @@ extern void VP8EncDspInitNEON(void); void VP8EncDspInitNEON(void) { #if defined(WEBP_USE_NEON) - VP8ITransform = ITransformNEON; - VP8FTransform = FTransformNEON; + VP8ITransform = ITransform; + VP8FTransform = FTransform; + + VP8ITransformWHT = ITransformWHT; #endif // WEBP_USE_NEON }