Add MSA optimized encoder transform functions

We add the following MSA optimized encoder transform functions:
- ITransform
- FTransform
- FTransformWHT

Change-Id: Ia6b17556aba5aff2d7a88208905fb45293d080a8
This commit is contained in:
Parag Salasakar
2016-07-05 20:00:43 +05:30
committed by Pascal Massimino
parent dce64bfa1b
commit 435308e029
8 changed files with 318 additions and 4 deletions

183
src/dsp/enc_msa.c Normal file
View File

@@ -0,0 +1,183 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA version of encoder dsp functions.
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MSA)
#include "./msa_macro.h"
//------------------------------------------------------------------------------
// Transforms
#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do { \
v4i32 a1_m, b1_m, c1_m, d1_m; \
const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \
const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \
v4i32 c_tmp1_m = in1 * sinpi8sqrt2; \
v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1; \
v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1; \
v4i32 d_tmp2_m = in3 * sinpi8sqrt2; \
\
ADDSUB2(in0, in2, a1_m, b1_m); \
SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16); \
c_tmp2_m = c_tmp2_m + in3; \
c1_m = c_tmp1_m - c_tmp2_m; \
SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16); \
d_tmp1_m = d_tmp1_m + in1; \
d1_m = d_tmp1_m + d_tmp2_m; \
BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
} while (0)
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
v8i16 input0, input1;
v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
v4i32 res0, res1, res2, res3;
v16i8 dest0, dest1, dest2, dest3;
const v16i8 zero = { 0 };
LD_SH2(in, 8, input0, input1);
UNPCK_SH_SW(input0, in0, in1);
UNPCK_SH_SW(input1, in2, in3);
IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
res0, res1, res2, res3);
ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
res0, res1, res2, res3);
ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
CLIP_SW4_0_255(res0, res1, res2, res3);
PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
uint64_t out0, out1, out2, out3;
uint32_t in0, in1, in2, in3;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
v8i16 t0, t1, t2, t3;
v16u8 srcl0, srcl1, src0, src1;
const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
LW4(src, BPS, in0, in1, in2, in3);
INSERT_W4_UB(in0, in1, in2, in3, src0);
LW4(ref, BPS, in0, in1, in2, in3);
INSERT_W4_UB(in0, in1, in2, in3, src1);
ILVRL_B2_UB(src0, src1, srcl0, srcl1);
HSUB_UB2_SH(srcl0, srcl1, t0, t1);
VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
ADDSUB2(t2, t3, t0, t1);
t0 = SRLI_H(t0, 3);
VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
tmp0 = __msa_hadd_s_w(t3, t3);
tmp2 = __msa_hsub_s_w(t3, t3);
FILL_W2_SW(1812, 937, tmp1, tmp3);
DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
SRAI_W2_SW(tmp1, tmp3, 9);
PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
ADDSUB2(t2, t3, t0, t1);
VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
tmp0 = __msa_hadd_s_w(t3, t3);
tmp2 = __msa_hsub_s_w(t3, t3);
ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
SRAI_W2_SW(tmp0, tmp2, 4);
FILL_W2_SW(12000, 51000, tmp1, tmp3);
DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
SRAI_W2_SW(tmp1, tmp3, 16);
UNPCK_R_SH_SW(t1, tmp4);
tmp5 = __msa_ceqi_w(tmp4, 0);
tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
tmp5 = __msa_fill_w(1);
tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
tmp1 += tmp5;
PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
out0 = __msa_copy_s_d((v2i64)t0, 0);
out1 = __msa_copy_s_d((v2i64)t0, 1);
out2 = __msa_copy_s_d((v2i64)t1, 0);
out3 = __msa_copy_s_d((v2i64)t1, 1);
SD4(out0, out1, out2, out3, out, 8);
}
static void FTransformWHT(const int16_t* in, int16_t* out) {
v8i16 in0 = { 0 };
v8i16 in1 = { 0 };
v8i16 tmp0, tmp1, tmp2, tmp3;
v8i16 out0, out1;
const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
in0 = __msa_insert_h(in0, 0, in[ 0]);
in0 = __msa_insert_h(in0, 1, in[ 64]);
in0 = __msa_insert_h(in0, 2, in[128]);
in0 = __msa_insert_h(in0, 3, in[192]);
in0 = __msa_insert_h(in0, 4, in[ 16]);
in0 = __msa_insert_h(in0, 5, in[ 80]);
in0 = __msa_insert_h(in0, 6, in[144]);
in0 = __msa_insert_h(in0, 7, in[208]);
in1 = __msa_insert_h(in1, 0, in[ 48]);
in1 = __msa_insert_h(in1, 1, in[112]);
in1 = __msa_insert_h(in1, 2, in[176]);
in1 = __msa_insert_h(in1, 3, in[240]);
in1 = __msa_insert_h(in1, 4, in[ 32]);
in1 = __msa_insert_h(in1, 5, in[ 96]);
in1 = __msa_insert_h(in1, 6, in[160]);
in1 = __msa_insert_h(in1, 7, in[224]);
ADDSUB2(in0, in1, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
ADDSUB2(tmp2, tmp3, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
ADDSUB2(in0, in1, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
ADDSUB2(tmp2, tmp3, out0, out1);
SRAI_H2_SH(out0, out1, 1);
ST_SH2(out0, out1, out, 8);
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8FTransformWHT = FTransformWHT;
}
#else // !WEBP_USE_MSA
WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
#endif // WEBP_USE_MSA