mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-26 13:48:21 +01:00
enc_neon: move Transpose4x4 to dsp/neon.h
+ reuse it in TransformWHT() Change-Id: Idfbd0f9b58d6253ac3d65ba55b58989c427ee989
This commit is contained in:
parent
8e5f90b086
commit
5e1a17ef4b
@ -200,9 +200,14 @@ HDRS = \
|
||||
src/dec/webpi.h \
|
||||
src/dsp/dsp.h \
|
||||
src/dsp/lossless.h \
|
||||
src/dsp/neon.h \
|
||||
src/dsp/yuv.h \
|
||||
src/enc/backward_references.h \
|
||||
src/enc/cost.h \
|
||||
src/enc/histogram.h \
|
||||
src/enc/vp8enci.h \
|
||||
src/enc/vp8li.h \
|
||||
src/mux/muxi.h \
|
||||
src/utils/alpha_processing.h \
|
||||
src/utils/bit_reader.h \
|
||||
src/utils/bit_writer.h \
|
||||
@ -215,6 +220,7 @@ HDRS = \
|
||||
src/utils/random.h \
|
||||
src/utils/rescaler.h \
|
||||
src/utils/thread.h \
|
||||
src/utils/utils.h \
|
||||
src/webp/format_constants.h \
|
||||
$(HDRS_INSTALLED) \
|
||||
|
||||
|
@ -17,10 +17,11 @@ COMMON_SOURCES += dec_neon.c
|
||||
COMMON_SOURCES += dec_sse2.c
|
||||
COMMON_SOURCES += dsp.h
|
||||
COMMON_SOURCES += lossless.c
|
||||
COMMON_SOURCES += lossless.h
|
||||
COMMON_SOURCES += lossless_mips32.c
|
||||
COMMON_SOURCES += lossless_neon.c
|
||||
COMMON_SOURCES += lossless_sse2.c
|
||||
COMMON_SOURCES += lossless.h
|
||||
COMMON_SOURCES += neon.h
|
||||
COMMON_SOURCES += upsampling.c
|
||||
COMMON_SOURCES += upsampling_mips32.c
|
||||
COMMON_SOURCES += upsampling_neon.c
|
||||
|
@ -23,8 +23,7 @@
|
||||
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
|
||||
#define WORK_AROUND_GCC
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./neon.h"
|
||||
#include "../dec/vp8i.h"
|
||||
|
||||
#define QRegs "q0", "q1", "q2", "q3", \
|
||||
@ -1217,16 +1216,15 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#define STORE_WHT(dst, col, row01, row23) do { \
|
||||
*dst = vgetq_lane_s32(row01.val[0], col); (dst) += 16; \
|
||||
*dst = vgetq_lane_s32(row01.val[1], col); (dst) += 16; \
|
||||
*dst = vgetq_lane_s32(row23.val[0], col); (dst) += 16; \
|
||||
*dst = vgetq_lane_s32(row23.val[1], col); (dst) += 16; \
|
||||
#define STORE_WHT(dst, col, rows) do { \
|
||||
*dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
|
||||
*dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
|
||||
*dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
|
||||
*dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
|
||||
} while (0)
|
||||
|
||||
static void TransformWHT(const int16_t* in, int16_t* out) {
|
||||
int32x4x2_t tmp0; // tmp[0..7]
|
||||
int32x4x2_t tmp1; // tmp[8..15]
|
||||
int32x4x4_t tmp;
|
||||
|
||||
{
|
||||
// Load the source.
|
||||
@ -1238,47 +1236,37 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
|
||||
const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
|
||||
const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
|
||||
const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
|
||||
tmp0.val[0] = vaddq_s32(a0, a1);
|
||||
tmp0.val[1] = vaddq_s32(a3, a2);
|
||||
tmp1.val[0] = vsubq_s32(a0, a1);
|
||||
tmp1.val[1] = vsubq_s32(a3, a2);
|
||||
tmp.val[0] = vaddq_s32(a0, a1);
|
||||
tmp.val[1] = vaddq_s32(a3, a2);
|
||||
tmp.val[2] = vsubq_s32(a0, a1);
|
||||
tmp.val[3] = vsubq_s32(a3, a2);
|
||||
// Arrange the temporary results column-wise.
|
||||
tmp = Transpose4x4(tmp);
|
||||
}
|
||||
|
||||
tmp0 = vzipq_s32(tmp0.val[0], tmp0.val[1]); // 0, 4, 1, 5 | 2, 6, 3, 7
|
||||
tmp1 = vzipq_s32(tmp1.val[0], tmp1.val[1]); // 8, 12, 9, 13 | 10, 14, 11, 15
|
||||
|
||||
{
|
||||
// Arrange the temporary results column-wise.
|
||||
const int32x4_t tmp_0_4_8_12 =
|
||||
vcombine_s32(vget_low_s32(tmp0.val[0]), vget_low_s32(tmp1.val[0]));
|
||||
const int32x4_t tmp_2_6_10_14 =
|
||||
vcombine_s32(vget_low_s32(tmp0.val[1]), vget_low_s32(tmp1.val[1]));
|
||||
const int32x4_t tmp_1_5_9_13 =
|
||||
vcombine_s32(vget_high_s32(tmp0.val[0]), vget_high_s32(tmp1.val[0]));
|
||||
const int32x4_t tmp_3_7_11_15 =
|
||||
vcombine_s32(vget_high_s32(tmp0.val[1]), vget_high_s32(tmp1.val[1]));
|
||||
const int32x4_t three = vdupq_n_s32(3);
|
||||
const int32x4_t dc = vaddq_s32(tmp_0_4_8_12, three); // add rounder
|
||||
const int32x4_t a0 = vaddq_s32(dc, tmp_3_7_11_15);
|
||||
const int32x4_t a1 = vaddq_s32(tmp_1_5_9_13, tmp_2_6_10_14);
|
||||
const int32x4_t a2 = vsubq_s32(tmp_1_5_9_13, tmp_2_6_10_14);
|
||||
const int32x4_t a3 = vsubq_s32(dc, tmp_3_7_11_15);
|
||||
const int32x4_t kCst3 = vdupq_n_s32(3);
|
||||
const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder
|
||||
const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
|
||||
const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
|
||||
const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
|
||||
const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
|
||||
|
||||
tmp0.val[0] = vaddq_s32(a0, a1);
|
||||
tmp0.val[1] = vaddq_s32(a3, a2);
|
||||
tmp1.val[0] = vsubq_s32(a0, a1);
|
||||
tmp1.val[1] = vsubq_s32(a3, a2);
|
||||
tmp.val[0] = vaddq_s32(a0, a1);
|
||||
tmp.val[1] = vaddq_s32(a3, a2);
|
||||
tmp.val[2] = vsubq_s32(a0, a1);
|
||||
tmp.val[3] = vsubq_s32(a3, a2);
|
||||
|
||||
// right shift the results by 3.
|
||||
tmp0.val[0] = vshrq_n_s32(tmp0.val[0], 3);
|
||||
tmp0.val[1] = vshrq_n_s32(tmp0.val[1], 3);
|
||||
tmp1.val[0] = vshrq_n_s32(tmp1.val[0], 3);
|
||||
tmp1.val[1] = vshrq_n_s32(tmp1.val[1], 3);
|
||||
tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
|
||||
tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
|
||||
tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
|
||||
tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
|
||||
|
||||
STORE_WHT(out, 0, tmp0, tmp1);
|
||||
STORE_WHT(out, 1, tmp0, tmp1);
|
||||
STORE_WHT(out, 2, tmp0, tmp1);
|
||||
STORE_WHT(out, 3, tmp0, tmp1);
|
||||
STORE_WHT(out, 0, tmp);
|
||||
STORE_WHT(out, 1, tmp);
|
||||
STORE_WHT(out, 2, tmp);
|
||||
STORE_WHT(out, 3, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -18,8 +18,8 @@
|
||||
#define USE_INTRINSICS // use intrinsics when possible
|
||||
|
||||
#include <assert.h>
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./neon.h"
|
||||
#include "../enc/vp8enci.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@ -474,38 +474,6 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
|
||||
|
||||
#endif
|
||||
|
||||
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
|
||||
uint64x2x2_t row01, row23;
|
||||
|
||||
row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
|
||||
row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
|
||||
row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
|
||||
row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
|
||||
// Transpose 64-bit values (there's no vswp equivalent)
|
||||
{
|
||||
const uint64x1_t row0h = vget_high_u64(row01.val[0]);
|
||||
const uint64x1_t row2l = vget_low_u64(row23.val[0]);
|
||||
const uint64x1_t row1h = vget_high_u64(row01.val[1]);
|
||||
const uint64x1_t row3l = vget_low_u64(row23.val[1]);
|
||||
row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
|
||||
row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
|
||||
row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
|
||||
row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
|
||||
}
|
||||
{
|
||||
const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
|
||||
vreinterpretq_s32_u64(row01.val[1]));
|
||||
const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
|
||||
vreinterpretq_s32_u64(row23.val[1]));
|
||||
int32x4x4_t out;
|
||||
out.val[0] = out01.val[0];
|
||||
out.val[1] = out01.val[1];
|
||||
out.val[2] = out23.val[0];
|
||||
out.val[3] = out23.val[1];
|
||||
return out;
|
||||
}
|
||||
}
|
||||
|
||||
#define LOAD_LANE_16b(VALUE, LANE) do { \
|
||||
(VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
|
||||
src += stride; \
|
||||
|
49
src/dsp/neon.h
Normal file
49
src/dsp/neon.h
Normal file
@ -0,0 +1,49 @@
|
||||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// NEON common code.
|
||||
|
||||
#ifndef WEBP_DSP_NEON_H_
|
||||
#define WEBP_DSP_NEON_H_
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
|
||||
uint64x2x2_t row01, row23;
|
||||
|
||||
row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
|
||||
row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
|
||||
row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
|
||||
row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
|
||||
// Transpose 64-bit values (there's no vswp equivalent)
|
||||
{
|
||||
const uint64x1_t row0h = vget_high_u64(row01.val[0]);
|
||||
const uint64x1_t row2l = vget_low_u64(row23.val[0]);
|
||||
const uint64x1_t row1h = vget_high_u64(row01.val[1]);
|
||||
const uint64x1_t row3l = vget_low_u64(row23.val[1]);
|
||||
row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
|
||||
row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
|
||||
row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
|
||||
row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
|
||||
}
|
||||
{
|
||||
const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
|
||||
vreinterpretq_s32_u64(row01.val[1]));
|
||||
const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
|
||||
vreinterpretq_s32_u64(row23.val[1]));
|
||||
int32x4x4_t out;
|
||||
out.val[0] = out01.val[0];
|
||||
out.val[1] = out01.val[1];
|
||||
out.val[2] = out23.val[0];
|
||||
out.val[3] = out23.val[1];
|
||||
return out;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // WEBP_DSP_NEON_H_
|
Loading…
Reference in New Issue
Block a user