enc_neon: move Transpose4x4 to dsp/neon.h

+ reuse it in TransformWHT() Change-Id: Idfbd0f9b58d6253ac3d65ba55b58989c427ee989
2025-07-26 10:50:06 +02:00 · 2014-04-26 12:11:00 -07:00
parent 8e5f90b086
commit 5e1a17ef4b
5 changed files with 89 additions and 77 deletions
--- a/makefile.unix
+++ b/makefile.unix
@ -200,9 +200,14 @@ HDRS = \
    src/dec/webpi.h \
    src/dsp/dsp.h \
    src/dsp/lossless.h \
    src/dsp/neon.h \
    src/dsp/yuv.h \
    src/enc/backward_references.h \
    src/enc/cost.h \
    src/enc/histogram.h \
    src/enc/vp8enci.h \
    src/enc/vp8li.h \
    src/mux/muxi.h \
    src/utils/alpha_processing.h \
    src/utils/bit_reader.h \
    src/utils/bit_writer.h \
@ -215,6 +220,7 @@ HDRS = \
    src/utils/random.h \
    src/utils/rescaler.h \
    src/utils/thread.h \
    src/utils/utils.h \
    src/webp/format_constants.h \
    $(HDRS_INSTALLED) \
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -17,10 +17,11 @@ COMMON_SOURCES += dec_neon.c
 COMMON_SOURCES += dec_sse2.c
 COMMON_SOURCES += dsp.h
 COMMON_SOURCES += lossless.c
 COMMON_SOURCES += lossless.h
 COMMON_SOURCES += lossless_mips32.c
 COMMON_SOURCES += lossless_neon.c
 COMMON_SOURCES += lossless_sse2.c
-COMMON_SOURCES += lossless.h
+COMMON_SOURCES += neon.h
 COMMON_SOURCES += upsampling.c
 COMMON_SOURCES += upsampling_mips32.c
 COMMON_SOURCES += upsampling_neon.c
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@ -23,8 +23,7 @@
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
 #define WORK_AROUND_GCC
-#include <arm_neon.h>
+#include "./neon.h"
 #include "../dec/vp8i.h"
 #define QRegs "q0", "q1", "q2", "q3",                                          \
@ -1217,16 +1216,15 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
 //------------------------------------------------------------------------------
-#define STORE_WHT(dst, col, row01, row23) do {           \
+#define STORE_WHT(dst, col, rows) do {                  \
-  *dst = vgetq_lane_s32(row01.val[0], col); (dst) += 16; \
+  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
-  *dst = vgetq_lane_s32(row01.val[1], col); (dst) += 16; \
+  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
-  *dst = vgetq_lane_s32(row23.val[0], col); (dst) += 16; \
+  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
-  *dst = vgetq_lane_s32(row23.val[1], col); (dst) += 16; \
+  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
 } while (0)
 static void TransformWHT(const int16_t* in, int16_t* out) {
-  int32x4x2_t tmp0;  // tmp[0..7]
+  int32x4x4_t tmp;
  int32x4x2_t tmp1;  // tmp[8..15]
  {
    // Load the source.
@ -1238,47 +1236,37 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
-    tmp0.val[0] = vaddq_s32(a0, a1);
+    tmp.val[0] = vaddq_s32(a0, a1);
-    tmp0.val[1] = vaddq_s32(a3, a2);
+    tmp.val[1] = vaddq_s32(a3, a2);
-    tmp1.val[0] = vsubq_s32(a0, a1);
+    tmp.val[2] = vsubq_s32(a0, a1);
-    tmp1.val[1] = vsubq_s32(a3, a2);
+    tmp.val[3] = vsubq_s32(a3, a2);
    // Arrange the temporary results column-wise.
    tmp = Transpose4x4(tmp);
  }
  tmp0 = vzipq_s32(tmp0.val[0], tmp0.val[1]);  // 0,  4, 1,  5 |  2,  6,  3,  7
  tmp1 = vzipq_s32(tmp1.val[0], tmp1.val[1]);  // 8, 12, 9, 13 | 10, 14, 11, 15
  {
-    // Arrange the temporary results column-wise.
+    const int32x4_t kCst3 = vdupq_n_s32(3);
-    const int32x4_t tmp_0_4_8_12 =
+    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
-        vcombine_s32(vget_low_s32(tmp0.val[0]), vget_low_s32(tmp1.val[0]));
+    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
-    const int32x4_t tmp_2_6_10_14 =
+    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
-        vcombine_s32(vget_low_s32(tmp0.val[1]), vget_low_s32(tmp1.val[1]));
+    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
-    const int32x4_t tmp_1_5_9_13 =
+    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
        vcombine_s32(vget_high_s32(tmp0.val[0]), vget_high_s32(tmp1.val[0]));
    const int32x4_t tmp_3_7_11_15 =
        vcombine_s32(vget_high_s32(tmp0.val[1]), vget_high_s32(tmp1.val[1]));
    const int32x4_t three = vdupq_n_s32(3);
    const int32x4_t dc = vaddq_s32(tmp_0_4_8_12, three);  // add rounder
    const int32x4_t a0 = vaddq_s32(dc, tmp_3_7_11_15);
    const int32x4_t a1 = vaddq_s32(tmp_1_5_9_13, tmp_2_6_10_14);
    const int32x4_t a2 = vsubq_s32(tmp_1_5_9_13, tmp_2_6_10_14);
    const int32x4_t a3 = vsubq_s32(dc, tmp_3_7_11_15);
-    tmp0.val[0] = vaddq_s32(a0, a1);
+    tmp.val[0] = vaddq_s32(a0, a1);
-    tmp0.val[1] = vaddq_s32(a3, a2);
+    tmp.val[1] = vaddq_s32(a3, a2);
-    tmp1.val[0] = vsubq_s32(a0, a1);
+    tmp.val[2] = vsubq_s32(a0, a1);
-    tmp1.val[1] = vsubq_s32(a3, a2);
+    tmp.val[3] = vsubq_s32(a3, a2);
    // right shift the results by 3.
-    tmp0.val[0] = vshrq_n_s32(tmp0.val[0], 3);
+    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
-    tmp0.val[1] = vshrq_n_s32(tmp0.val[1], 3);
+    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
-    tmp1.val[0] = vshrq_n_s32(tmp1.val[0], 3);
+    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
-    tmp1.val[1] = vshrq_n_s32(tmp1.val[1], 3);
+    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
-    STORE_WHT(out, 0, tmp0, tmp1);
+    STORE_WHT(out, 0, tmp);
-    STORE_WHT(out, 1, tmp0, tmp1);
+    STORE_WHT(out, 1, tmp);
-    STORE_WHT(out, 2, tmp0, tmp1);
+    STORE_WHT(out, 2, tmp);
-    STORE_WHT(out, 3, tmp0, tmp1);
+    STORE_WHT(out, 3, tmp);
  }
 }
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@ -18,8 +18,8 @@
 #define USE_INTRINSICS   // use intrinsics when possible
 #include <assert.h>
 #include <arm_neon.h>
 #include "./neon.h"
 #include "../enc/vp8enci.h"
 //------------------------------------------------------------------------------
@ -474,38 +474,6 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
 #endif
 static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
  uint64x2x2_t row01, row23;
  row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
  row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
  row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
  row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
  // Transpose 64-bit values (there's no vswp equivalent)
  {
    const uint64x1_t row0h = vget_high_u64(row01.val[0]);
    const uint64x1_t row2l = vget_low_u64(row23.val[0]);
    const uint64x1_t row1h = vget_high_u64(row01.val[1]);
    const uint64x1_t row3l = vget_low_u64(row23.val[1]);
    row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
    row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
    row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
    row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
  }
  {
    const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
                                        vreinterpretq_s32_u64(row01.val[1]));
    const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
                                        vreinterpretq_s32_u64(row23.val[1]));
    int32x4x4_t out;
    out.val[0] = out01.val[0];
    out.val[1] = out01.val[1];
    out.val[2] = out23.val[0];
    out.val[3] = out23.val[1];
    return out;
  }
 }
 #define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
--- a/src/dsp/neon.h
+++ b/src/dsp/neon.h
@ -0,0 +1,49 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  NEON common code.
 #ifndef WEBP_DSP_NEON_H_
 #define WEBP_DSP_NEON_H_
 #include <arm_neon.h>
 static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
  uint64x2x2_t row01, row23;
  row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
  row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
  row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
  row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
  // Transpose 64-bit values (there's no vswp equivalent)
  {
    const uint64x1_t row0h = vget_high_u64(row01.val[0]);
    const uint64x1_t row2l = vget_low_u64(row23.val[0]);
    const uint64x1_t row1h = vget_high_u64(row01.val[1]);
    const uint64x1_t row3l = vget_low_u64(row23.val[1]);
    row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
    row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
    row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
    row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
  }
  {
    const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
                                        vreinterpretq_s32_u64(row01.val[1]));
    const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
                                        vreinterpretq_s32_u64(row23.val[1]));
    int32x4x4_t out;
    out.val[0] = out01.val[0];
    out.val[1] = out01.val[1];
    out.val[2] = out23.val[0];
    out.val[3] = out23.val[1];
    return out;
  }
 }
 #endif  // WEBP_DSP_NEON_H_