enc_neon: move Transpose4x4 to dsp/neon.h

+ reuse it in TransformWHT()

Change-Id: Idfbd0f9b58d6253ac3d65ba55b58989c427ee989
This commit is contained in:
James Zern
2014-04-26 12:11:00 -07:00
parent 8e5f90b086
commit 5e1a17ef4b
5 changed files with 89 additions and 77 deletions

View File

@ -18,8 +18,8 @@
#define USE_INTRINSICS // use intrinsics when possible
#include <assert.h>
#include <arm_neon.h>
#include "./neon.h"
#include "../enc/vp8enci.h"
//------------------------------------------------------------------------------
@ -474,38 +474,6 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
#endif
// Returns the transpose of the 4x4 matrix of 32-bit lanes held in 'rows'
// (rows.val[i] is row i on input and column i of the input on output).
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
  // View every row as two 64-bit halves so that pairs of lanes can be moved
  // together.
  const uint64x2_t r0 = vreinterpretq_u64_s32(rows.val[0]);
  const uint64x2_t r1 = vreinterpretq_u64_s32(rows.val[1]);
  const uint64x2_t r2 = vreinterpretq_u64_s32(rows.val[2]);
  const uint64x2_t r3 = vreinterpretq_u64_s32(rows.val[3]);

  // 64-bit transpose, done by hand with get/combine since there is no
  // intrinsic equivalent of vswp: the high half of rows 0/1 trades places
  // with the low half of rows 2/3.
  const uint64x2_t low02 = vcombine_u64(vget_low_u64(r0), vget_low_u64(r2));
  const uint64x2_t high02 = vcombine_u64(vget_high_u64(r0), vget_high_u64(r2));
  const uint64x2_t low13 = vcombine_u64(vget_low_u64(r1), vget_low_u64(r3));
  const uint64x2_t high13 = vcombine_u64(vget_high_u64(r1), vget_high_u64(r3));

  // 32-bit transpose inside each pair finishes the job: the 'low' pair yields
  // output rows 0 and 1, the 'high' pair yields output rows 2 and 3.
  const int32x4x2_t cols01 = vtrnq_s32(vreinterpretq_s32_u64(low02),
                                       vreinterpretq_s32_u64(low13));
  const int32x4x2_t cols23 = vtrnq_s32(vreinterpretq_s32_u64(high02),
                                       vreinterpretq_s32_u64(high13));

  int32x4x4_t transposed;
  transposed.val[0] = cols01.val[0];
  transposed.val[1] = cols01.val[1];
  transposed.val[2] = cols23.val[0];
  transposed.val[3] = cols23.val[1];
  return transposed;
}
#define LOAD_LANE_16b(VALUE, LANE) do { \
(VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
src += stride; \