enc_neon: move Transpose4x4 to dsp/neon.h

+ reuse it in TransformWHT()

Change-Id: Idfbd0f9b58d6253ac3d65ba55b58989c427ee989
This commit is contained in:
James Zern 2014-04-26 12:11:00 -07:00
parent 8e5f90b086
commit 5e1a17ef4b
5 changed files with 89 additions and 77 deletions

View File

@ -200,9 +200,14 @@ HDRS = \
src/dec/webpi.h \ src/dec/webpi.h \
src/dsp/dsp.h \ src/dsp/dsp.h \
src/dsp/lossless.h \ src/dsp/lossless.h \
src/dsp/neon.h \
src/dsp/yuv.h \ src/dsp/yuv.h \
src/enc/backward_references.h \
src/enc/cost.h \ src/enc/cost.h \
src/enc/histogram.h \
src/enc/vp8enci.h \ src/enc/vp8enci.h \
src/enc/vp8li.h \
src/mux/muxi.h \
src/utils/alpha_processing.h \ src/utils/alpha_processing.h \
src/utils/bit_reader.h \ src/utils/bit_reader.h \
src/utils/bit_writer.h \ src/utils/bit_writer.h \
@ -215,6 +220,7 @@ HDRS = \
src/utils/random.h \ src/utils/random.h \
src/utils/rescaler.h \ src/utils/rescaler.h \
src/utils/thread.h \ src/utils/thread.h \
src/utils/utils.h \
src/webp/format_constants.h \ src/webp/format_constants.h \
$(HDRS_INSTALLED) \ $(HDRS_INSTALLED) \

View File

@ -17,10 +17,11 @@ COMMON_SOURCES += dec_neon.c
COMMON_SOURCES += dec_sse2.c COMMON_SOURCES += dec_sse2.c
COMMON_SOURCES += dsp.h COMMON_SOURCES += dsp.h
COMMON_SOURCES += lossless.c COMMON_SOURCES += lossless.c
COMMON_SOURCES += lossless.h
COMMON_SOURCES += lossless_mips32.c COMMON_SOURCES += lossless_mips32.c
COMMON_SOURCES += lossless_neon.c COMMON_SOURCES += lossless_neon.c
COMMON_SOURCES += lossless_sse2.c COMMON_SOURCES += lossless_sse2.c
COMMON_SOURCES += lossless.h COMMON_SOURCES += neon.h
COMMON_SOURCES += upsampling.c COMMON_SOURCES += upsampling.c
COMMON_SOURCES += upsampling_mips32.c COMMON_SOURCES += upsampling_mips32.c
COMMON_SOURCES += upsampling_neon.c COMMON_SOURCES += upsampling_neon.c

View File

@ -23,8 +23,7 @@
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
#define WORK_AROUND_GCC #define WORK_AROUND_GCC
#include <arm_neon.h> #include "./neon.h"
#include "../dec/vp8i.h" #include "../dec/vp8i.h"
#define QRegs "q0", "q1", "q2", "q3", \ #define QRegs "q0", "q1", "q2", "q3", \
@ -1217,16 +1216,15 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
#define STORE_WHT(dst, col, row01, row23) do { \ #define STORE_WHT(dst, col, rows) do { \
*dst = vgetq_lane_s32(row01.val[0], col); (dst) += 16; \ *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
*dst = vgetq_lane_s32(row01.val[1], col); (dst) += 16; \ *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
*dst = vgetq_lane_s32(row23.val[0], col); (dst) += 16; \ *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
*dst = vgetq_lane_s32(row23.val[1], col); (dst) += 16; \ *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
} while (0) } while (0)
static void TransformWHT(const int16_t* in, int16_t* out) { static void TransformWHT(const int16_t* in, int16_t* out) {
int32x4x2_t tmp0; // tmp[0..7] int32x4x4_t tmp;
int32x4x2_t tmp1; // tmp[8..15]
{ {
// Load the source. // Load the source.
@ -1238,47 +1236,37 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11] const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11] const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15] const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
tmp0.val[0] = vaddq_s32(a0, a1); tmp.val[0] = vaddq_s32(a0, a1);
tmp0.val[1] = vaddq_s32(a3, a2); tmp.val[1] = vaddq_s32(a3, a2);
tmp1.val[0] = vsubq_s32(a0, a1); tmp.val[2] = vsubq_s32(a0, a1);
tmp1.val[1] = vsubq_s32(a3, a2); tmp.val[3] = vsubq_s32(a3, a2);
// Arrange the temporary results column-wise.
tmp = Transpose4x4(tmp);
} }
tmp0 = vzipq_s32(tmp0.val[0], tmp0.val[1]); // 0, 4, 1, 5 | 2, 6, 3, 7
tmp1 = vzipq_s32(tmp1.val[0], tmp1.val[1]); // 8, 12, 9, 13 | 10, 14, 11, 15
{ {
// Arrange the temporary results column-wise. const int32x4_t kCst3 = vdupq_n_s32(3);
const int32x4_t tmp_0_4_8_12 = const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder
vcombine_s32(vget_low_s32(tmp0.val[0]), vget_low_s32(tmp1.val[0])); const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
const int32x4_t tmp_2_6_10_14 = const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
vcombine_s32(vget_low_s32(tmp0.val[1]), vget_low_s32(tmp1.val[1])); const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
const int32x4_t tmp_1_5_9_13 = const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
vcombine_s32(vget_high_s32(tmp0.val[0]), vget_high_s32(tmp1.val[0]));
const int32x4_t tmp_3_7_11_15 =
vcombine_s32(vget_high_s32(tmp0.val[1]), vget_high_s32(tmp1.val[1]));
const int32x4_t three = vdupq_n_s32(3);
const int32x4_t dc = vaddq_s32(tmp_0_4_8_12, three); // add rounder
const int32x4_t a0 = vaddq_s32(dc, tmp_3_7_11_15);
const int32x4_t a1 = vaddq_s32(tmp_1_5_9_13, tmp_2_6_10_14);
const int32x4_t a2 = vsubq_s32(tmp_1_5_9_13, tmp_2_6_10_14);
const int32x4_t a3 = vsubq_s32(dc, tmp_3_7_11_15);
tmp0.val[0] = vaddq_s32(a0, a1); tmp.val[0] = vaddq_s32(a0, a1);
tmp0.val[1] = vaddq_s32(a3, a2); tmp.val[1] = vaddq_s32(a3, a2);
tmp1.val[0] = vsubq_s32(a0, a1); tmp.val[2] = vsubq_s32(a0, a1);
tmp1.val[1] = vsubq_s32(a3, a2); tmp.val[3] = vsubq_s32(a3, a2);
// right shift the results by 3. // right shift the results by 3.
tmp0.val[0] = vshrq_n_s32(tmp0.val[0], 3); tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
tmp0.val[1] = vshrq_n_s32(tmp0.val[1], 3); tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
tmp1.val[0] = vshrq_n_s32(tmp1.val[0], 3); tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
tmp1.val[1] = vshrq_n_s32(tmp1.val[1], 3); tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
STORE_WHT(out, 0, tmp0, tmp1); STORE_WHT(out, 0, tmp);
STORE_WHT(out, 1, tmp0, tmp1); STORE_WHT(out, 1, tmp);
STORE_WHT(out, 2, tmp0, tmp1); STORE_WHT(out, 2, tmp);
STORE_WHT(out, 3, tmp0, tmp1); STORE_WHT(out, 3, tmp);
} }
} }

View File

@ -18,8 +18,8 @@
#define USE_INTRINSICS // use intrinsics when possible #define USE_INTRINSICS // use intrinsics when possible
#include <assert.h> #include <assert.h>
#include <arm_neon.h>
#include "./neon.h"
#include "../enc/vp8enci.h" #include "../enc/vp8enci.h"
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
@ -474,38 +474,6 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
#endif #endif
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
uint64x2x2_t row01, row23;
row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
// Transpose 64-bit values (there's no vswp equivalent)
{
const uint64x1_t row0h = vget_high_u64(row01.val[0]);
const uint64x1_t row2l = vget_low_u64(row23.val[0]);
const uint64x1_t row1h = vget_high_u64(row01.val[1]);
const uint64x1_t row3l = vget_low_u64(row23.val[1]);
row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
}
{
const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
vreinterpretq_s32_u64(row01.val[1]));
const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
vreinterpretq_s32_u64(row23.val[1]));
int32x4x4_t out;
out.val[0] = out01.val[0];
out.val[1] = out01.val[1];
out.val[2] = out23.val[0];
out.val[3] = out23.val[1];
return out;
}
}
#define LOAD_LANE_16b(VALUE, LANE) do { \ #define LOAD_LANE_16b(VALUE, LANE) do { \
(VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \ (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
src += stride; \ src += stride; \

49
src/dsp/neon.h Normal file
View File

@ -0,0 +1,49 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON common code.
#ifndef WEBP_DSP_NEON_H_
#define WEBP_DSP_NEON_H_
#include <arm_neon.h>
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
uint64x2x2_t row01, row23;
row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
// Transpose 64-bit values (there's no vswp equivalent)
{
const uint64x1_t row0h = vget_high_u64(row01.val[0]);
const uint64x1_t row2l = vget_low_u64(row23.val[0]);
const uint64x1_t row1h = vget_high_u64(row01.val[1]);
const uint64x1_t row3l = vget_low_u64(row23.val[1]);
row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
}
{
const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
vreinterpretq_s32_u64(row01.val[1]));
const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
vreinterpretq_s32_u64(row23.val[1]));
int32x4x4_t out;
out.val[0] = out01.val[0];
out.val[1] = out01.val[1];
out.val[2] = out23.val[0];
out.val[3] = out23.val[1];
return out;
}
}
#endif // WEBP_DSP_NEON_H_