NEON speed up

Add a TransformDC special case, and make the switch function (DoTransform) inlined. This recovers some of the CPU time lost during the addition of TransformAC3 (only on ARM). Change-Id: I21c1f0c6a9cb9d1dfc1e307b4f473a2791273bd6
2026-01-05 03:46:36 +01:00 · 2013-12-18 22:32:58 +01:00
parent d49345533f
commit 26d842eb8f
2 changed files with 39 additions and 6 deletions
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -671,7 +671,7 @@ static void Copy32b(uint8_t* dst, uint8_t* src) {
  memcpy(dst, src, 4);
 }
-static void DoTransform(uint32_t bits, const int16_t* const src,
+static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
                        uint8_t* const dst) {
  switch (bits >> 30) {
    case 3:
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -160,7 +160,7 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
 //-----------------------------------------------------------------------------
 // Inverse transforms (Paragraph 14.4)
-static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
+static void TransformOne(const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t constants[] = {20091, 17734, 0, 0};
  /* kC1, kC2. Padded because vld1.16 loads 8 bytes
@@ -309,13 +309,44 @@ static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
  );
 }
-static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOneNEON(in, dst);
+  TransformOne(in, dst);
  if (do_two) {
-    TransformOneNEON(in + 16, dst + 4);
+    TransformOne(in + 16, dst + 4);
  }
 }
 // DC-only inverse transform: adds one rounded DC value to sixteen pixels of
 // 'dst' (four rows of four bytes, rows spaced kBPS bytes apart), saturating
 // each result to the 0..255 range.
 //   in:  coefficient array; only in[0] is read.
 //   dst: pixels to update in place.
 static void TransformDC(const int16_t* in, uint8_t* dst) {
  // Rounded division of the DC coefficient by 8.
  const int DC = (in[0] + 4) >> 3;
  const int kBPS = BPS;
  __asm__ volatile (
    // Broadcast DC to all eight 16-bit lanes of q1.
    "vdup.16         q1, %[DC]        \n"
    // Load four 32-bit rows, post-incrementing dst by kBPS after each load.
    // Rows 0 and 2 land in d0, rows 1 and 3 in d1.
    "vld1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vld1.32         d1[1], [%[dst]], %[kBPS]    \n"
    // Rewind dst by 4 * kBPS so the stores below target the rows just loaded.
    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
    // add DC and convert to s16.
    "vaddw.u8        q2, q1, d0                  \n"
    "vaddw.u8        q3, q1, d1                  \n"
    // convert back to u8 with saturation
    "vqmovun.s16     d0,  q2                     \n"
    "vqmovun.s16     d1,  q3                     \n"
    // Store the rows back in the same order as they were loaded; the last
    // store does not post-increment dst.
    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"
    // NOTE(review): the [in] operand is not referenced anywhere in the asm
    // template above; it looks removable -- confirm before changing.
    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
    : [kBPS] "r"(kBPS),   /* constants */
      [DC] "r"(DC)
    : "memory", "q0", "q1", "q2", "q3"  /* clobbered */
  );
 }
 static void TransformWHT(const int16_t* in, int16_t* out) {
  const int kStep = 32;  // The store is only incrementing the pointer as if we
                         // had stored a single byte.
@@ -392,7 +423,9 @@ extern void VP8DspInitNEON(void);
 void VP8DspInitNEON(void) {
 #if defined(WEBP_USE_NEON)
-  VP8Transform = TransformTwoNEON;
+  VP8Transform = TransformTwo;
  VP8TransformAC3 = TransformOne;  // no special code here
  VP8TransformDC = TransformDC;
  VP8TransformWHT = TransformWHT;
  VP8SimpleVFilter16 = SimpleVFilter16NEON;