From 5e1a17ef4badf8b9ea93e45396c6d659a9de7d2f Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 26 Apr 2014 12:11:00 -0700 Subject: [PATCH] enc_neon: move Transpose4x4 to dsp/neon.h + reuse it in TransformWHT() Change-Id: Idfbd0f9b58d6253ac3d65ba55b58989c427ee989 --- makefile.unix | 6 ++++ src/dsp/Makefile.am | 3 +- src/dsp/dec_neon.c | 74 +++++++++++++++++++-------------------------- src/dsp/enc_neon.c | 34 +-------------------- src/dsp/neon.h | 49 ++++++++++++++++++++++++++++++ 5 files changed, 89 insertions(+), 77 deletions(-) create mode 100644 src/dsp/neon.h diff --git a/makefile.unix b/makefile.unix index be3ca08b..6aea2365 100644 --- a/makefile.unix +++ b/makefile.unix @@ -200,9 +200,14 @@ HDRS = \ src/dec/webpi.h \ src/dsp/dsp.h \ src/dsp/lossless.h \ + src/dsp/neon.h \ src/dsp/yuv.h \ + src/enc/backward_references.h \ src/enc/cost.h \ + src/enc/histogram.h \ src/enc/vp8enci.h \ + src/enc/vp8li.h \ + src/mux/muxi.h \ src/utils/alpha_processing.h \ src/utils/bit_reader.h \ src/utils/bit_writer.h \ @@ -215,6 +220,7 @@ HDRS = \ src/utils/random.h \ src/utils/rescaler.h \ src/utils/thread.h \ + src/utils/utils.h \ src/webp/format_constants.h \ $(HDRS_INSTALLED) \ diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am index 284e8f4d..24fb75d7 100644 --- a/src/dsp/Makefile.am +++ b/src/dsp/Makefile.am @@ -17,10 +17,11 @@ COMMON_SOURCES += dec_neon.c COMMON_SOURCES += dec_sse2.c COMMON_SOURCES += dsp.h COMMON_SOURCES += lossless.c +COMMON_SOURCES += lossless.h COMMON_SOURCES += lossless_mips32.c COMMON_SOURCES += lossless_neon.c COMMON_SOURCES += lossless_sse2.c -COMMON_SOURCES += lossless.h +COMMON_SOURCES += neon.h COMMON_SOURCES += upsampling.c COMMON_SOURCES += upsampling_mips32.c COMMON_SOURCES += upsampling_neon.c diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c index 2850b465..4d8ea724 100644 --- a/src/dsp/dec_neon.c +++ b/src/dsp/dec_neon.c @@ -23,8 +23,7 @@ // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) #define WORK_AROUND_GCC -#include - +#include "./neon.h" #include "../dec/vp8i.h" #define QRegs "q0", "q1", "q2", "q3", \ @@ -1217,16 +1216,15 @@ static void TransformDC(const int16_t* in, uint8_t* dst) { //------------------------------------------------------------------------------ -#define STORE_WHT(dst, col, row01, row23) do { \ - *dst = vgetq_lane_s32(row01.val[0], col); (dst) += 16; \ - *dst = vgetq_lane_s32(row01.val[1], col); (dst) += 16; \ - *dst = vgetq_lane_s32(row23.val[0], col); (dst) += 16; \ - *dst = vgetq_lane_s32(row23.val[1], col); (dst) += 16; \ +#define STORE_WHT(dst, col, rows) do { \ + *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \ + *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \ + *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \ + *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \ } while (0) static void TransformWHT(const int16_t* in, int16_t* out) { - int32x4x2_t tmp0; // tmp[0..7] - int32x4x2_t tmp1; // tmp[8..15] + int32x4x4_t tmp; { // Load the source. @@ -1238,47 +1236,37 @@ static void TransformWHT(const int16_t* in, int16_t* out) { const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11] const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11] const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15] - tmp0.val[0] = vaddq_s32(a0, a1); - tmp0.val[1] = vaddq_s32(a3, a2); - tmp1.val[0] = vsubq_s32(a0, a1); - tmp1.val[1] = vsubq_s32(a3, a2); + tmp.val[0] = vaddq_s32(a0, a1); + tmp.val[1] = vaddq_s32(a3, a2); + tmp.val[2] = vsubq_s32(a0, a1); + tmp.val[3] = vsubq_s32(a3, a2); + // Arrange the temporary results column-wise. + tmp = Transpose4x4(tmp); } - tmp0 = vzipq_s32(tmp0.val[0], tmp0.val[1]); // 0, 4, 1, 5 | 2, 6, 3, 7 - tmp1 = vzipq_s32(tmp1.val[0], tmp1.val[1]); // 8, 12, 9, 13 | 10, 14, 11, 15 - { - // Arrange the temporary results column-wise. - const int32x4_t tmp_0_4_8_12 = - vcombine_s32(vget_low_s32(tmp0.val[0]), vget_low_s32(tmp1.val[0])); - const int32x4_t tmp_2_6_10_14 = - vcombine_s32(vget_low_s32(tmp0.val[1]), vget_low_s32(tmp1.val[1])); - const int32x4_t tmp_1_5_9_13 = - vcombine_s32(vget_high_s32(tmp0.val[0]), vget_high_s32(tmp1.val[0])); - const int32x4_t tmp_3_7_11_15 = - vcombine_s32(vget_high_s32(tmp0.val[1]), vget_high_s32(tmp1.val[1])); - const int32x4_t three = vdupq_n_s32(3); - const int32x4_t dc = vaddq_s32(tmp_0_4_8_12, three); // add rounder - const int32x4_t a0 = vaddq_s32(dc, tmp_3_7_11_15); - const int32x4_t a1 = vaddq_s32(tmp_1_5_9_13, tmp_2_6_10_14); - const int32x4_t a2 = vsubq_s32(tmp_1_5_9_13, tmp_2_6_10_14); - const int32x4_t a3 = vsubq_s32(dc, tmp_3_7_11_15); + const int32x4_t kCst3 = vdupq_n_s32(3); + const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder + const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]); + const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]); + const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]); + const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]); - tmp0.val[0] = vaddq_s32(a0, a1); - tmp0.val[1] = vaddq_s32(a3, a2); - tmp1.val[0] = vsubq_s32(a0, a1); - tmp1.val[1] = vsubq_s32(a3, a2); + tmp.val[0] = vaddq_s32(a0, a1); + tmp.val[1] = vaddq_s32(a3, a2); + tmp.val[2] = vsubq_s32(a0, a1); + tmp.val[3] = vsubq_s32(a3, a2); // right shift the results by 3. - tmp0.val[0] = vshrq_n_s32(tmp0.val[0], 3); - tmp0.val[1] = vshrq_n_s32(tmp0.val[1], 3); - tmp1.val[0] = vshrq_n_s32(tmp1.val[0], 3); - tmp1.val[1] = vshrq_n_s32(tmp1.val[1], 3); + tmp.val[0] = vshrq_n_s32(tmp.val[0], 3); + tmp.val[1] = vshrq_n_s32(tmp.val[1], 3); + tmp.val[2] = vshrq_n_s32(tmp.val[2], 3); + tmp.val[3] = vshrq_n_s32(tmp.val[3], 3); - STORE_WHT(out, 0, tmp0, tmp1); - STORE_WHT(out, 1, tmp0, tmp1); - STORE_WHT(out, 2, tmp0, tmp1); - STORE_WHT(out, 3, tmp0, tmp1); + STORE_WHT(out, 0, tmp); + STORE_WHT(out, 1, tmp); + STORE_WHT(out, 2, tmp); + STORE_WHT(out, 3, tmp); } } diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 170c70b3..1e478825 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -18,8 +18,8 @@ #define USE_INTRINSICS // use intrinsics when possible #include -#include +#include "./neon.h" #include "../enc/vp8enci.h" //------------------------------------------------------------------------------ @@ -474,38 +474,6 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, #endif -static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) { - uint64x2x2_t row01, row23; - - row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); - row01.val[1] = vreinterpretq_u64_s32(rows.val[1]); - row23.val[0] = vreinterpretq_u64_s32(rows.val[2]); - row23.val[1] = vreinterpretq_u64_s32(rows.val[3]); - // Transpose 64-bit values (there's no vswp equivalent) - { - const uint64x1_t row0h = vget_high_u64(row01.val[0]); - const uint64x1_t row2l = vget_low_u64(row23.val[0]); - const uint64x1_t row1h = vget_high_u64(row01.val[1]); - const uint64x1_t row3l = vget_low_u64(row23.val[1]); - row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l); - row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0])); - row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l); - row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1])); - } - { - const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]), - vreinterpretq_s32_u64(row01.val[1])); - const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]), - vreinterpretq_s32_u64(row23.val[1])); - int32x4x4_t out; - out.val[0] = out01.val[0]; - out.val[1] = out01.val[1]; - out.val[2] = out23.val[0]; - out.val[3] = out23.val[1]; - return out; - } -} - #define LOAD_LANE_16b(VALUE, LANE) do { \ (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \ src += stride; \ diff --git a/src/dsp/neon.h b/src/dsp/neon.h new file mode 100644 index 00000000..96c089ec --- /dev/null +++ b/src/dsp/neon.h @@ -0,0 +1,49 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// NEON common code. + +#ifndef WEBP_DSP_NEON_H_ +#define WEBP_DSP_NEON_H_ + +#include + +static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) { + uint64x2x2_t row01, row23; + + row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); + row01.val[1] = vreinterpretq_u64_s32(rows.val[1]); + row23.val[0] = vreinterpretq_u64_s32(rows.val[2]); + row23.val[1] = vreinterpretq_u64_s32(rows.val[3]); + // Transpose 64-bit values (there's no vswp equivalent) + { + const uint64x1_t row0h = vget_high_u64(row01.val[0]); + const uint64x1_t row2l = vget_low_u64(row23.val[0]); + const uint64x1_t row1h = vget_high_u64(row01.val[1]); + const uint64x1_t row3l = vget_low_u64(row23.val[1]); + row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l); + row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0])); + row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l); + row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1])); + } + { + const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]), + vreinterpretq_s32_u64(row01.val[1])); + const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]), + vreinterpretq_s32_u64(row23.val[1])); + int32x4x4_t out; + out.val[0] = out01.val[0]; + out.val[1] = out01.val[1]; + out.val[2] = out23.val[0]; + out.val[3] = out23.val[1]; + return out; + } +} + +#endif // WEBP_DSP_NEON_H_