mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 10:25:46 +01:00 
			
		
		
		
	add NEON asm version for WHT inverse transform
Contributed by Wayne Chen (datoudatou at gmail dot com) + some header cleanup + remove the NEON suffix in static functions Change-Id: I75bf5e9b54cf5e1acc53764c6f081d61690f8e3d
This commit is contained in:
		| @@ -155,6 +155,9 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { | ||||
|   } | ||||
| } | ||||
|  | ||||
| //----------------------------------------------------------------------------- | ||||
| // Inverse transforms (Paragraph 14.4) | ||||
|  | ||||
| static void TransformOneNEON(const int16_t *in, uint8_t *dst) { | ||||
|   const int kBPS = BPS; | ||||
|   const int16_t constants[] = {20091, 17734, 0, 0}; | ||||
| @@ -311,6 +314,73 @@ static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { | ||||
|   } | ||||
| } | ||||
|  | ||||
| static void TransformWHT(const int16_t* in, int16_t* out) { | ||||
|   const int kStep = 32; // Byte stride between stores (16 int16_t): each | ||||
|                         // post-indexed vst1 advances the out pointer by kStep. | ||||
|   __asm__ volatile ( | ||||
|     // part 1: lane i (0..3) combines in[i], in[4 + i], in[8 + i], in[12 + i] | ||||
|     // load in[0..15] into q0, q1 (d0..d3 each hold four int16_t) | ||||
|     "vld1.16         {q0, q1}, [%[in]]           \n" | ||||
|  | ||||
|     "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0 + i] + in[12 + i] (widened to 32 bits) | ||||
|     "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4 + i] + in[8 + i] | ||||
|     "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4 + i] - in[8 + i] | ||||
|     "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0 + i] - in[12 + i] | ||||
|  | ||||
|     "vadd.s32        q0, q2, q3                  \n" // tmp[0 + i] = a0 + a1 | ||||
|     "vsub.s32        q2, q2, q3                  \n" // tmp[8 + i] = a0 - a1 | ||||
|     "vadd.s32        q1, q5, q4                  \n" // tmp[4 + i] = a3 + a2 | ||||
|     "vsub.s32        q3, q5, q4                  \n" // tmp[12 + i] = a3 - a2 | ||||
|  | ||||
|     // Transpose the 4x4 block of 32-bit tmp[] values. Afterwards: | ||||
|     // q0 = tmp[0, 4, 8, 12], q1 = tmp[1, 5, 9, 13] | ||||
|     // q2 = tmp[2, 6, 10, 14], q3 = tmp[3, 7, 11, 15] | ||||
|     "vswp            d1, d4                      \n" // acts as vtrn.64 q0, q2 | ||||
|     "vswp            d3, d6                      \n" // acts as vtrn.64 q1, q3 | ||||
|     "vtrn.32         q0, q1                      \n" | ||||
|     "vtrn.32         q2, q3                      \n" | ||||
|  | ||||
|     "vmov.s32        q4, #3                      \n" // part 2: rounding constant 3 in every lane | ||||
|     "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0 + 4 * i] + 3 | ||||
|     "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3 + 4 * i] | ||||
|     "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1 + 4 * i] + tmp[2 + 4 * i] | ||||
|     "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1 + 4 * i] - tmp[2 + 4 * i] | ||||
|     "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3 + 4 * i] | ||||
|  | ||||
|     "vadd.s32        q0, q6, q7                  \n" | ||||
|     "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3, narrowed to 16 bits | ||||
|     "vadd.s32        q1, q9, q8                  \n" | ||||
|     "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3, narrowed to 16 bits | ||||
|     "vsub.s32        q2, q6, q7                  \n" | ||||
|     "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3, narrowed to 16 bits | ||||
|     "vsub.s32        q3, q9, q8                  \n" | ||||
|     "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3, narrowed to 16 bits | ||||
|  | ||||
|     // scatter the 16 results: one int16_t every kStep bytes (out[0], out[16], out[32], ...) | ||||
|     "vst1.16         d0[0], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d1[0], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d2[0], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d3[0], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d0[1], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d1[1], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d2[1], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d3[1], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d0[2], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d1[2], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d2[2], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d3[2], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d0[3], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d1[3], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d2[3], [%[out]], %[kStep]   \n" | ||||
|     "vst1.16         d3[3], [%[out]], %[kStep]   \n" | ||||
|  | ||||
|     : [out] "+r"(out)  // read-write operand: post-incremented by every store | ||||
|     : [in] "r"(in), [kStep] "r"(kStep)  // read-only inputs | ||||
|     : "memory", "q0", "q1", "q2", "q3", "q4", | ||||
|       "q5", "q6", "q7", "q8", "q9"  // clobbered | ||||
|   ); | ||||
| } | ||||
|  | ||||
| #endif   // WEBP_USE_NEON | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
| @@ -321,6 +391,7 @@ extern void VP8DspInitNEON(void); | ||||
| void VP8DspInitNEON(void) { | ||||
| #if defined(WEBP_USE_NEON) | ||||
|   VP8Transform = TransformTwoNEON; | ||||
|   VP8TransformWHT = TransformWHT; | ||||
|  | ||||
|   VP8SimpleVFilter16 = SimpleVFilter16NEON; | ||||
|   VP8SimpleHFilter16 = SimpleHFilter16NEON; | ||||
|   | ||||
| @@ -103,7 +103,7 @@ extern VP8DecIdct2 VP8Transform; | ||||
| extern VP8DecIdct VP8TransformUV; | ||||
| extern VP8DecIdct VP8TransformDC; | ||||
| extern VP8DecIdct VP8TransformDCUV; | ||||
| extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out); | ||||
| extern VP8WHT VP8TransformWHT; | ||||
|  | ||||
| // *dst is the destination block, with stride BPS. Boundary samples are | ||||
| // assumed accessible when needed. | ||||
|   | ||||
| @@ -25,8 +25,8 @@ extern "C" { | ||||
| // Inverse transform. | ||||
| // This code is pretty much the same as TransformOneNEON in the decoder, except | ||||
| // for subtraction to *ref. See the comments there for algorithmic explanations. | ||||
| static void ITransformOneNEON(const uint8_t* ref, | ||||
|                               const int16_t* in, uint8_t* dst) { | ||||
| static void ITransformOne(const uint8_t* ref, | ||||
|                           const int16_t* in, uint8_t* dst) { | ||||
|   const int kBPS = BPS; | ||||
|   const int16_t kC1C2[] = { 20091, 17734, 0, 0 };  // kC1 / (kC2 >> 1) / 0 / 0 | ||||
|  | ||||
| @@ -139,14 +139,82 @@ static void ITransformOneNEON(const uint8_t* ref, | ||||
|   ); | ||||
| } | ||||
|  | ||||
| static void ITransformNEON(const uint8_t* ref, | ||||
|                            const int16_t* in, uint8_t* dst, int do_two) { | ||||
|   ITransformOneNEON(ref, in, dst); | ||||
| static void ITransform(const uint8_t* ref, | ||||
|                        const int16_t* in, uint8_t* dst, int do_two) { | ||||
|   ITransformOne(ref, in, dst); | ||||
|   if (do_two) { | ||||
|     ITransformOneNEON(ref + 4, in + 16, dst + 4); | ||||
|     ITransformOne(ref + 4, in + 16, dst + 4); | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Inverse WHT: reads in[0..15], writes 16 results spaced 16 int16_t apart in out[]. Same code as dec_neon.c | ||||
| static void ITransformWHT(const int16_t* in, int16_t* out) { | ||||
|   const int kStep = 32; // Byte stride between stores (16 int16_t): each | ||||
|                         // post-indexed vst1 advances the out pointer by kStep. | ||||
|   __asm__ volatile ( | ||||
|     // part 1: lane i (0..3) combines in[i], in[4 + i], in[8 + i], in[12 + i] | ||||
|     // load in[0..15] into q0, q1 (d0..d3 each hold four int16_t) | ||||
|     "vld1.16         {q0, q1}, [%[in]]           \n" | ||||
|  | ||||
|     "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0 + i] + in[12 + i] (widened to 32 bits) | ||||
|     "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4 + i] + in[8 + i] | ||||
|     "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4 + i] - in[8 + i] | ||||
|     "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0 + i] - in[12 + i] | ||||
|  | ||||
|     "vadd.s32        q0, q2, q3                  \n" // tmp[0 + i] = a0 + a1 | ||||
|     "vsub.s32        q2, q2, q3                  \n" // tmp[8 + i] = a0 - a1 | ||||
|     "vadd.s32        q1, q5, q4                  \n" // tmp[4 + i] = a3 + a2 | ||||
|     "vsub.s32        q3, q5, q4                  \n" // tmp[12 + i] = a3 - a2 | ||||
|  | ||||
|     // Transpose the 4x4 block of 32-bit tmp[] values. Afterwards: | ||||
|     // q0 = tmp[0, 4, 8, 12], q1 = tmp[1, 5, 9, 13] | ||||
|     // q2 = tmp[2, 6, 10, 14], q3 = tmp[3, 7, 11, 15] | ||||
|     "vswp            d1, d4                      \n" // acts as vtrn.64 q0, q2 | ||||
|     "vswp            d3, d6                      \n" // acts as vtrn.64 q1, q3 | ||||
|     "vtrn.32         q0, q1                      \n" | ||||
|     "vtrn.32         q2, q3                      \n" | ||||
|  | ||||
|     "vmov.s32        q4, #3                      \n" // part 2: rounding constant 3 in every lane | ||||
|     "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0 + 4 * i] + 3 | ||||
|     "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3 + 4 * i] | ||||
|     "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1 + 4 * i] + tmp[2 + 4 * i] | ||||
|     "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1 + 4 * i] - tmp[2 + 4 * i] | ||||
|     "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3 + 4 * i] | ||||
|  | ||||
|     "vadd.s32        q0, q6, q7                  \n" | ||||
|     "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3, narrowed to 16 bits | ||||
|     "vadd.s32        q1, q9, q8                  \n" | ||||
|     "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3, narrowed to 16 bits | ||||
|     "vsub.s32        q2, q6, q7                  \n" | ||||
|     "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3, narrowed to 16 bits | ||||
|     "vsub.s32        q3, q9, q8                  \n" | ||||
|     "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3, narrowed to 16 bits | ||||
|  | ||||
|     // scatter the 16 results: one int16_t every kStep bytes (out[0], out[16], out[32], ...) | ||||
|     "vst1.16         d0[0], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d1[0], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d2[0], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d3[0], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d0[1], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d1[1], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d2[1], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d3[1], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d0[2], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d1[2], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d2[2], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d3[2], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d0[3], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d1[3], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d2[3], [%[out]], %[kStep]      \n" | ||||
|     "vst1.16         d3[3], [%[out]], %[kStep]      \n" | ||||
|  | ||||
|     : [out] "+r"(out)  // read-write operand: post-incremented by every store | ||||
|     : [in] "r"(in), [kStep] "r"(kStep)  // read-only inputs | ||||
|     : "memory", "q0", "q1", "q2", "q3", "q4", | ||||
|       "q5", "q6", "q7", "q8", "q9" // clobbered | ||||
|   ); | ||||
| } | ||||
|  | ||||
| // Forward transform. | ||||
|  | ||||
| // adapted from vp8/encoder/arm/neon/shortfdct_neon.asm | ||||
| @@ -160,8 +228,8 @@ static const int32_t kCoeff32[] = { | ||||
|   51000, 51000, 51000, 51000 | ||||
| }; | ||||
|  | ||||
| static void FTransformNEON(const uint8_t* src, const uint8_t* ref, | ||||
|                            int16_t* out) { | ||||
| static void FTransform(const uint8_t* src, const uint8_t* ref, | ||||
|                        int16_t* out) { | ||||
|  const int kBPS = BPS; | ||||
|   const uint8_t* src_ptr = src; | ||||
|   const uint8_t* ref_ptr = ref; | ||||
| @@ -282,8 +350,10 @@ extern void VP8EncDspInitNEON(void); | ||||
|  | ||||
| void VP8EncDspInitNEON(void) { | ||||
| #if defined(WEBP_USE_NEON) | ||||
|   VP8ITransform = ITransformNEON; | ||||
|   VP8FTransform = FTransformNEON; | ||||
|   VP8ITransform = ITransform; | ||||
|   VP8FTransform = FTransform; | ||||
|  | ||||
|   VP8ITransformWHT = ITransformWHT; | ||||
| #endif   // WEBP_USE_NEON | ||||
| } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user