mirror of
https://github.com/webmproject/libwebp.git
synced 2024-11-19 20:08:28 +01:00
Merge "Copy C code to not have multiplication overflow" into main
This commit is contained in:
commit
7fac6c1bf2
@ -37,9 +37,6 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
|
||||
STORE(3, y, DC - (d)); \
|
||||
} while (0)
|
||||
|
||||
#define MUL1(a) ((((a) * 20091) >> 16) + (a))
|
||||
#define MUL2(a) (((a) * 35468) >> 16)
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void TransformOne_C(const int16_t* in, uint8_t* dst) {
|
||||
int C[4 * 4], *tmp;
|
||||
@ -48,8 +45,10 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
|
||||
for (i = 0; i < 4; ++i) { // vertical pass
|
||||
const int a = in[0] + in[8]; // [-4096, 4094]
|
||||
const int b = in[0] - in[8]; // [-4095, 4095]
|
||||
const int c = MUL2(in[4]) - MUL1(in[12]); // [-3783, 3783]
|
||||
const int d = MUL1(in[4]) + MUL2(in[12]); // [-3785, 3781]
|
||||
const int c = WEBP_TRANSFORM_AC3_MUL2(in[4]) -
|
||||
WEBP_TRANSFORM_AC3_MUL1(in[12]); // [-3783, 3783]
|
||||
const int d = WEBP_TRANSFORM_AC3_MUL1(in[4]) +
|
||||
WEBP_TRANSFORM_AC3_MUL2(in[12]); // [-3785, 3781]
|
||||
tmp[0] = a + d; // [-7881, 7875]
|
||||
tmp[1] = b + c; // [-7878, 7878]
|
||||
tmp[2] = b - c; // [-7878, 7878]
|
||||
@ -69,8 +68,10 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
|
||||
const int dc = tmp[0] + 4;
|
||||
const int a = dc + tmp[8];
|
||||
const int b = dc - tmp[8];
|
||||
const int c = MUL2(tmp[4]) - MUL1(tmp[12]);
|
||||
const int d = MUL1(tmp[4]) + MUL2(tmp[12]);
|
||||
const int c =
|
||||
WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]);
|
||||
const int d =
|
||||
WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]);
|
||||
STORE(0, 0, a + d);
|
||||
STORE(1, 0, b + c);
|
||||
STORE(2, 0, b - c);
|
||||
@ -83,17 +84,15 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
|
||||
// Simplified transform when only in[0], in[1] and in[4] are non-zero
|
||||
static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
|
||||
const int a = in[0] + 4;
|
||||
const int c4 = MUL2(in[4]);
|
||||
const int d4 = MUL1(in[4]);
|
||||
const int c1 = MUL2(in[1]);
|
||||
const int d1 = MUL1(in[1]);
|
||||
const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
|
||||
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
|
||||
const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
|
||||
const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
|
||||
STORE2(0, a + d4, d1, c1);
|
||||
STORE2(1, a + c4, d1, c1);
|
||||
STORE2(2, a - c4, d1, c1);
|
||||
STORE2(3, a - d4, d1, c1);
|
||||
}
|
||||
#undef MUL1
|
||||
#undef MUL2
|
||||
#undef STORE2
|
||||
|
||||
static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
|
@ -18,8 +18,8 @@
|
||||
|
||||
#include "src/dsp/mips_macro.h"
|
||||
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
static const int kC2 = 35468;
|
||||
static const int kC1 = WEBP_TRANSFORM_AC3_C1;
|
||||
static const int kC2 = WEBP_TRANSFORM_AC3_C2;
|
||||
|
||||
static WEBP_INLINE int abs_mips32(int x) {
|
||||
const int sign = x >> 31;
|
||||
@ -219,7 +219,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
int temp0, temp1, temp2, temp3, temp4;
|
||||
int temp5, temp6, temp7, temp8, temp9;
|
||||
int temp10, temp11, temp12, temp13, temp14;
|
||||
int temp15, temp16, temp17, temp18;
|
||||
int temp15, temp16, temp17, temp18, temp19;
|
||||
int16_t* p_in = (int16_t*)in;
|
||||
|
||||
// loops unrolled and merged to avoid usage of tmp buffer
|
||||
@ -233,16 +233,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
"addu %[temp16], %[temp0], %[temp8] \n\t"
|
||||
"subu %[temp0], %[temp0], %[temp8] \n\t"
|
||||
"mul %[temp8], %[temp4], %[kC2] \n\t"
|
||||
"mul %[temp17], %[temp12], %[kC1] \n\t"
|
||||
"mul %[temp4], %[temp4], %[kC1] \n\t"
|
||||
MUL_SHIFT_C1(temp17, temp12)
|
||||
MUL_SHIFT_C1_IO(temp4, temp19)
|
||||
"mul %[temp12], %[temp12], %[kC2] \n\t"
|
||||
"lh %[temp1], 2(%[in]) \n\t"
|
||||
"lh %[temp5], 10(%[in]) \n\t"
|
||||
"lh %[temp9], 18(%[in]) \n\t"
|
||||
"lh %[temp13], 26(%[in]) \n\t"
|
||||
"sra %[temp8], %[temp8], 16 \n\t"
|
||||
"sra %[temp17], %[temp17], 16 \n\t"
|
||||
"sra %[temp4], %[temp4], 16 \n\t"
|
||||
"sra %[temp12], %[temp12], 16 \n\t"
|
||||
"lh %[temp2], 4(%[in]) \n\t"
|
||||
"lh %[temp6], 12(%[in]) \n\t"
|
||||
@ -261,49 +259,43 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
"addu %[temp12], %[temp0], %[temp17] \n\t"
|
||||
"subu %[temp0], %[temp0], %[temp17] \n\t"
|
||||
"mul %[temp9], %[temp5], %[kC2] \n\t"
|
||||
"mul %[temp17], %[temp13], %[kC1] \n\t"
|
||||
"mul %[temp5], %[temp5], %[kC1] \n\t"
|
||||
MUL_SHIFT_C1(temp17, temp13)
|
||||
MUL_SHIFT_C1_IO(temp5, temp19)
|
||||
"mul %[temp13], %[temp13], %[kC2] \n\t"
|
||||
"sra %[temp9], %[temp9], 16 \n\t"
|
||||
"sra %[temp17], %[temp17], 16 \n\t"
|
||||
"subu %[temp17], %[temp9], %[temp17] \n\t"
|
||||
"sra %[temp5], %[temp5], 16 \n\t"
|
||||
"sra %[temp13], %[temp13], 16 \n\t"
|
||||
"addu %[temp5], %[temp5], %[temp13] \n\t"
|
||||
"addu %[temp13], %[temp1], %[temp17] \n\t"
|
||||
"subu %[temp1], %[temp1], %[temp17] \n\t"
|
||||
"mul %[temp17], %[temp14], %[kC1] \n\t"
|
||||
MUL_SHIFT_C1(temp17, temp14)
|
||||
"mul %[temp14], %[temp14], %[kC2] \n\t"
|
||||
"addu %[temp9], %[temp16], %[temp5] \n\t"
|
||||
"subu %[temp5], %[temp16], %[temp5] \n\t"
|
||||
"addu %[temp16], %[temp2], %[temp10] \n\t"
|
||||
"subu %[temp2], %[temp2], %[temp10] \n\t"
|
||||
"mul %[temp10], %[temp6], %[kC2] \n\t"
|
||||
"mul %[temp6], %[temp6], %[kC1] \n\t"
|
||||
"sra %[temp17], %[temp17], 16 \n\t"
|
||||
MUL_SHIFT_C1_IO(temp6, temp19)
|
||||
"sra %[temp14], %[temp14], 16 \n\t"
|
||||
"sra %[temp10], %[temp10], 16 \n\t"
|
||||
"sra %[temp6], %[temp6], 16 \n\t"
|
||||
"subu %[temp17], %[temp10], %[temp17] \n\t"
|
||||
"addu %[temp6], %[temp6], %[temp14] \n\t"
|
||||
"addu %[temp10], %[temp16], %[temp6] \n\t"
|
||||
"subu %[temp6], %[temp16], %[temp6] \n\t"
|
||||
"addu %[temp14], %[temp2], %[temp17] \n\t"
|
||||
"subu %[temp2], %[temp2], %[temp17] \n\t"
|
||||
"mul %[temp17], %[temp15], %[kC1] \n\t"
|
||||
MUL_SHIFT_C1(temp17, temp15)
|
||||
"mul %[temp15], %[temp15], %[kC2] \n\t"
|
||||
"addu %[temp16], %[temp3], %[temp11] \n\t"
|
||||
"subu %[temp3], %[temp3], %[temp11] \n\t"
|
||||
"mul %[temp11], %[temp7], %[kC2] \n\t"
|
||||
"mul %[temp7], %[temp7], %[kC1] \n\t"
|
||||
MUL_SHIFT_C1_IO(temp7, temp19)
|
||||
"addiu %[temp8], %[temp8], 4 \n\t"
|
||||
"addiu %[temp12], %[temp12], 4 \n\t"
|
||||
"addiu %[temp0], %[temp0], 4 \n\t"
|
||||
"addiu %[temp4], %[temp4], 4 \n\t"
|
||||
"sra %[temp17], %[temp17], 16 \n\t"
|
||||
"sra %[temp15], %[temp15], 16 \n\t"
|
||||
"sra %[temp11], %[temp11], 16 \n\t"
|
||||
"sra %[temp7], %[temp7], 16 \n\t"
|
||||
"subu %[temp17], %[temp11], %[temp17] \n\t"
|
||||
"addu %[temp7], %[temp7], %[temp15] \n\t"
|
||||
"addu %[temp15], %[temp3], %[temp17] \n\t"
|
||||
@ -313,48 +305,40 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
"addu %[temp16], %[temp8], %[temp10] \n\t"
|
||||
"subu %[temp8], %[temp8], %[temp10] \n\t"
|
||||
"mul %[temp10], %[temp9], %[kC2] \n\t"
|
||||
"mul %[temp17], %[temp11], %[kC1] \n\t"
|
||||
"mul %[temp9], %[temp9], %[kC1] \n\t"
|
||||
MUL_SHIFT_C1(temp17, temp11)
|
||||
MUL_SHIFT_C1_IO(temp9, temp19)
|
||||
"mul %[temp11], %[temp11], %[kC2] \n\t"
|
||||
"sra %[temp10], %[temp10], 16 \n\t"
|
||||
"sra %[temp17], %[temp17], 16 \n\t"
|
||||
"sra %[temp9], %[temp9], 16 \n\t"
|
||||
"sra %[temp11], %[temp11], 16 \n\t"
|
||||
"subu %[temp17], %[temp10], %[temp17] \n\t"
|
||||
"addu %[temp11], %[temp9], %[temp11] \n\t"
|
||||
"addu %[temp10], %[temp12], %[temp14] \n\t"
|
||||
"subu %[temp12], %[temp12], %[temp14] \n\t"
|
||||
"mul %[temp14], %[temp13], %[kC2] \n\t"
|
||||
"mul %[temp9], %[temp15], %[kC1] \n\t"
|
||||
"mul %[temp13], %[temp13], %[kC1] \n\t"
|
||||
MUL_SHIFT_C1(temp9, temp15)
|
||||
MUL_SHIFT_C1_IO(temp13, temp19)
|
||||
"mul %[temp15], %[temp15], %[kC2] \n\t"
|
||||
"sra %[temp14], %[temp14], 16 \n\t"
|
||||
"sra %[temp9], %[temp9], 16 \n\t"
|
||||
"sra %[temp13], %[temp13], 16 \n\t"
|
||||
"sra %[temp15], %[temp15], 16 \n\t"
|
||||
"subu %[temp9], %[temp14], %[temp9] \n\t"
|
||||
"addu %[temp15], %[temp13], %[temp15] \n\t"
|
||||
"addu %[temp14], %[temp0], %[temp2] \n\t"
|
||||
"subu %[temp0], %[temp0], %[temp2] \n\t"
|
||||
"mul %[temp2], %[temp1], %[kC2] \n\t"
|
||||
"mul %[temp13], %[temp3], %[kC1] \n\t"
|
||||
"mul %[temp1], %[temp1], %[kC1] \n\t"
|
||||
MUL_SHIFT_C1(temp13, temp3)
|
||||
MUL_SHIFT_C1_IO(temp1, temp19)
|
||||
"mul %[temp3], %[temp3], %[kC2] \n\t"
|
||||
"sra %[temp2], %[temp2], 16 \n\t"
|
||||
"sra %[temp13], %[temp13], 16 \n\t"
|
||||
"sra %[temp1], %[temp1], 16 \n\t"
|
||||
"sra %[temp3], %[temp3], 16 \n\t"
|
||||
"subu %[temp13], %[temp2], %[temp13] \n\t"
|
||||
"addu %[temp3], %[temp1], %[temp3] \n\t"
|
||||
"addu %[temp2], %[temp4], %[temp6] \n\t"
|
||||
"subu %[temp4], %[temp4], %[temp6] \n\t"
|
||||
"mul %[temp6], %[temp5], %[kC2] \n\t"
|
||||
"mul %[temp1], %[temp7], %[kC1] \n\t"
|
||||
"mul %[temp5], %[temp5], %[kC1] \n\t"
|
||||
MUL_SHIFT_C1(temp1, temp7)
|
||||
MUL_SHIFT_C1_IO(temp5, temp19)
|
||||
"mul %[temp7], %[temp7], %[kC2] \n\t"
|
||||
"sra %[temp6], %[temp6], 16 \n\t"
|
||||
"sra %[temp1], %[temp1], 16 \n\t"
|
||||
"sra %[temp5], %[temp5], 16 \n\t"
|
||||
"sra %[temp7], %[temp7], 16 \n\t"
|
||||
"subu %[temp1], %[temp6], %[temp1] \n\t"
|
||||
"addu %[temp7], %[temp5], %[temp7] \n\t"
|
||||
@ -542,7 +526,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
|
||||
[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
|
||||
[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
|
||||
[temp18]"=&r"(temp18)
|
||||
[temp18]"=&r"(temp18), [temp19]"=&r"(temp19)
|
||||
: [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
|
||||
: "memory", "hi", "lo"
|
||||
);
|
||||
|
@ -18,10 +18,8 @@
|
||||
|
||||
#include "src/dsp/mips_macro.h"
|
||||
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
static const int kC2 = 35468;
|
||||
|
||||
#define MUL(a, b) (((a) * (b)) >> 16)
|
||||
static const int kC1 = WEBP_TRANSFORM_AC3_C1;
|
||||
static const int kC2 = WEBP_TRANSFORM_AC3_C2;
|
||||
|
||||
static void TransformDC(const int16_t* in, uint8_t* dst) {
|
||||
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
|
||||
@ -49,10 +47,10 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
|
||||
|
||||
static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
||||
const int a = in[0] + 4;
|
||||
int c4 = MUL(in[4], kC2);
|
||||
const int d4 = MUL(in[4], kC1);
|
||||
const int c1 = MUL(in[1], kC2);
|
||||
const int d1 = MUL(in[1], kC1);
|
||||
int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
|
||||
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
|
||||
const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
|
||||
const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
|
||||
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
|
||||
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
|
||||
|
||||
@ -479,8 +477,6 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
|
||||
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
|
||||
#undef MUL
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Simple In-loop filtering (Paragraph 15.2)
|
||||
|
||||
|
@ -37,8 +37,6 @@
|
||||
d1_m = d_tmp1_m + d_tmp2_m; \
|
||||
BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
|
||||
}
|
||||
#define MULT1(a) ((((a) * 20091) >> 16) + (a))
|
||||
#define MULT2(a) (((a) * 35468) >> 16)
|
||||
|
||||
static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
v8i16 input0, input1;
|
||||
@ -124,10 +122,10 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
|
||||
|
||||
static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
||||
const int a = in[0] + 4;
|
||||
const int c4 = MULT2(in[4]);
|
||||
const int d4 = MULT1(in[4]);
|
||||
const int in2 = MULT2(in[1]);
|
||||
const int in3 = MULT1(in[1]);
|
||||
const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
|
||||
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
|
||||
const int in2 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
|
||||
const int in3 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
|
||||
v4i32 tmp0 = { 0 };
|
||||
v4i32 out0 = __msa_fill_w(a + d4);
|
||||
v4i32 out1 = __msa_fill_w(a + c4);
|
||||
|
@ -1000,8 +1000,9 @@ static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
|
||||
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
|
||||
// same issue with kC1 and vqdmulh that we work around by down shifting kC2
|
||||
|
||||
static const int16_t kC1 = 20091;
|
||||
static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
|
||||
static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
|
||||
static const int16_t kC2 =
|
||||
WEBP_TRANSFORM_AC3_C2 / 2; // half of kC2, actually. See comment above.
|
||||
|
||||
#if defined(WEBP_USE_INTRINSICS)
|
||||
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
|
||||
@ -1255,15 +1256,12 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#define MUL(a, b) (((a) * (b)) >> 16)
|
||||
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
|
||||
static const int kC1_full = 20091 + (1 << 16);
|
||||
static const int kC2_full = 35468;
|
||||
const int16x4_t A = vld1_dup_s16(in);
|
||||
const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
|
||||
const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
|
||||
const int c1 = MUL(in[1], kC2_full);
|
||||
const int d1 = MUL(in[1], kC1_full);
|
||||
const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
|
||||
const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
|
||||
const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
|
||||
const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
|
||||
const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 |
|
||||
(uint64_t)( c1 & 0xffff) << 16 |
|
||||
(uint64_t)(-c1 & 0xffff) << 32 |
|
||||
@ -1274,7 +1272,6 @@ static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
|
||||
const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
|
||||
Add4x4_NEON(m0_m1, m2_m3, dst);
|
||||
}
|
||||
#undef MUL
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// 4x4
|
||||
|
@ -196,15 +196,13 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
}
|
||||
|
||||
#if (USE_TRANSFORM_AC3 == 1)
|
||||
#define MUL(a, b) (((a) * (b)) >> 16)
|
||||
|
||||
static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
static const int kC2 = 35468;
|
||||
const __m128i A = _mm_set1_epi16(in[0] + 4);
|
||||
const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
|
||||
const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
|
||||
const int c1 = MUL(in[1], kC2);
|
||||
const int d1 = MUL(in[1], kC1);
|
||||
const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
|
||||
const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
|
||||
const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
|
||||
const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
|
||||
const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
|
||||
const __m128i B = _mm_adds_epi16(A, CD);
|
||||
const __m128i m0 = _mm_adds_epi16(B, d4);
|
||||
@ -238,7 +236,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
||||
WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
|
||||
WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
|
||||
}
|
||||
#undef MUL
|
||||
|
||||
#endif // USE_TRANSFORM_AC3
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
@ -203,6 +203,11 @@ extern VP8DecIdct VP8TransformDC;
|
||||
extern VP8DecIdct VP8TransformDCUV;
|
||||
extern VP8WHT VP8TransformWHT;
|
||||
|
||||
// Fixed-point multipliers (16 fractional bits) shared by the transform
// implementations. C1 holds only the fractional part of the first factor; the
// integer part is re-added by WEBP_TRANSFORM_AC3_MUL1 below.
#define WEBP_TRANSFORM_AC3_C1 20091
|
||||
#define WEBP_TRANSFORM_AC3_C2 35468
|
||||
// Evaluates to ((a * C1) >> 16) + a. Adding 'a' after the shift, rather than
// multiplying by (C1 + (1 << 16)), keeps the intermediate product smaller --
// presumably this is what avoids the int multiplication overflow; confirm
// against the expected range of 'a'.
// NOTE: 'a' is evaluated twice, so it must not have side effects.
#define WEBP_TRANSFORM_AC3_MUL1(a) ((((a) * WEBP_TRANSFORM_AC3_C1) >> 16) + (a))
|
||||
// Evaluates to (a * C2) >> 16.
#define WEBP_TRANSFORM_AC3_MUL2(a) (((a) * WEBP_TRANSFORM_AC3_C2) >> 16)
|
||||
|
||||
// *dst is the destination block, with stride BPS. Boundary samples are
|
||||
// assumed accessible when needed.
|
||||
typedef void (*VP8PredFunc)(uint8_t* dst);
|
||||
|
@ -109,10 +109,6 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
|
||||
#define STORE(x, y, v) \
|
||||
dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
|
||||
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
static const int kC2 = 35468;
|
||||
#define MUL(a, b) (((a) * (b)) >> 16)
|
||||
|
||||
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
||||
uint8_t* dst) {
|
||||
int C[4 * 4], *tmp;
|
||||
@ -121,8 +117,10 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
||||
for (i = 0; i < 4; ++i) { // vertical pass
|
||||
const int a = in[0] + in[8];
|
||||
const int b = in[0] - in[8];
|
||||
const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
|
||||
const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
|
||||
const int c =
|
||||
WEBP_TRANSFORM_AC3_MUL2(in[4]) - WEBP_TRANSFORM_AC3_MUL1(in[12]);
|
||||
const int d =
|
||||
WEBP_TRANSFORM_AC3_MUL1(in[4]) + WEBP_TRANSFORM_AC3_MUL2(in[12]);
|
||||
tmp[0] = a + d;
|
||||
tmp[1] = b + c;
|
||||
tmp[2] = b - c;
|
||||
@ -134,10 +132,12 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
||||
tmp = C;
|
||||
for (i = 0; i < 4; ++i) { // horizontal pass
|
||||
const int dc = tmp[0] + 4;
|
||||
const int a = dc + tmp[8];
|
||||
const int b = dc - tmp[8];
|
||||
const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
|
||||
const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
|
||||
const int a = dc + tmp[8];
|
||||
const int b = dc - tmp[8];
|
||||
const int c =
|
||||
WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]);
|
||||
const int d =
|
||||
WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]);
|
||||
STORE(0, i, a + d);
|
||||
STORE(1, i, b + c);
|
||||
STORE(2, i, b - c);
|
||||
@ -222,7 +222,6 @@ static void FTransformWHT_C(const int16_t* in, int16_t* out) {
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
#undef MUL
|
||||
#undef STORE
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
@ -21,8 +21,8 @@
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
#include "src/enc/cost_enc.h"
|
||||
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
static const int kC2 = 35468;
|
||||
static const int kC1 = WEBP_TRANSFORM_AC3_C1;
|
||||
static const int kC2 = WEBP_TRANSFORM_AC3_C2;
|
||||
|
||||
// macro for one vertical pass in ITransformOne
|
||||
// MUL macro inlined
|
||||
@ -30,7 +30,7 @@ static const int kC2 = 35468;
|
||||
// A..D - offsets in bytes to load from in buffer
|
||||
// TEMP0..TEMP3 - registers for corresponding tmp elements
|
||||
// TEMP4..TEMP5 - temporary registers
|
||||
#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
|
||||
#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
|
||||
"lh %[temp16], " #A "(%[temp20]) \n\t" \
|
||||
"lh %[temp18], " #B "(%[temp20]) \n\t" \
|
||||
"lh %[temp17], " #C "(%[temp20]) \n\t" \
|
||||
@ -38,12 +38,10 @@ static const int kC2 = 35468;
|
||||
"addu %[" #TEMP4 "], %[temp16], %[temp18] \n\t" \
|
||||
"subu %[temp16], %[temp16], %[temp18] \n\t" \
|
||||
"mul %[" #TEMP0 "], %[temp17], %[kC2] \n\t" \
|
||||
"mul %[temp18], %[temp19], %[kC1] \n\t" \
|
||||
"mul %[temp17], %[temp17], %[kC1] \n\t" \
|
||||
MUL_SHIFT_C1_IO(temp17, temp18) \
|
||||
MUL_SHIFT_C1(temp18, temp19) \
|
||||
"mul %[temp19], %[temp19], %[kC2] \n\t" \
|
||||
"sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\n" \
|
||||
"sra %[temp18], %[temp18], 16 \n\n" \
|
||||
"sra %[temp17], %[temp17], 16 \n\n" \
|
||||
"sra %[temp19], %[temp19], 16 \n\n" \
|
||||
"subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp18] \n\t" \
|
||||
"addu %[" #TEMP3 "], %[temp17], %[temp19] \n\t" \
|
||||
@ -58,17 +56,15 @@ static const int kC2 = 35468;
|
||||
// temp0..temp15 holds tmp[0]..tmp[15]
|
||||
// A - offset in bytes to load from ref and store to dst buffer
|
||||
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
|
||||
#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
|
||||
#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
|
||||
"addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
|
||||
"addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
|
||||
"subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
|
||||
"mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
|
||||
"mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
|
||||
"mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
|
||||
MUL_SHIFT_C1_IO(TEMP4, TEMP8) \
|
||||
MUL_SHIFT_C1(TEMP8, TEMP12) \
|
||||
"mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
|
||||
"sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
|
||||
"sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
|
||||
"sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
|
||||
"sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
|
||||
"subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
|
||||
"addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
|
||||
|
@ -20,8 +20,8 @@
|
||||
#include "src/enc/cost_enc.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
static const int kC2 = 35468;
|
||||
static const int kC1 = WEBP_TRANSFORM_AC3_C1;
|
||||
static const int kC2 = WEBP_TRANSFORM_AC3_C2;
|
||||
|
||||
// O - output
|
||||
// I - input (macro doesn't change it)
|
||||
|
@ -27,8 +27,9 @@
|
||||
// This code is pretty much the same as TransformOne in the dec_neon.c, except
|
||||
// for subtraction to *ref. See the comments there for algorithmic explanations.
|
||||
|
||||
static const int16_t kC1 = 20091;
|
||||
static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
|
||||
static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
|
||||
static const int16_t kC2 =
|
||||
WEBP_TRANSFORM_AC3_C2 / 2; // half of kC2, actually. See comment above.
|
||||
|
||||
// This code works but is *slower* than the inlined-asm version below
|
||||
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
|
||||
|
@ -45,28 +45,38 @@
|
||||
"ulw %[" #O2 "], " #I3 "+" XSTR(I9) "*" #I7 "(%[" #I0 "]) \n\t" \
|
||||
"ulw %[" #O3 "], " #I4 "+" XSTR(I9) "*" #I8 "(%[" #I0 "]) \n\t"
|
||||
|
||||
|
||||
// Emits the asm sequence for O = ((I * kC1) >> 16) + I.
// O - output
|
||||
// I - input (macro doesn't change it, so it must be a different operand from
//     O: I is read again by the final addu after O has been overwritten)
|
||||
#define MUL_SHIFT_C1(O, I) \
|
||||
"mul %[" #O "], %[" #I "], %[kC1] \n\t" \
|
||||
"sra %[" #O "], %[" #O "], 16 \n\t" \
|
||||
"addu %[" #O "], %[" #O "], %[" #I "] \n\t"
|
||||
// Emits the asm sequence for O = (I * kC2) >> 16.
// O - output
// I - input (only read by the mul)
#define MUL_SHIFT_C2(O, I) \
|
||||
"mul %[" #O "], %[" #I "], %[kC2] \n\t" \
|
||||
"sra %[" #O "], %[" #O "], 16 \n\t"
|
||||
|
||||
// Same computation as MUL_SHIFT_C1, but the input and the output are the same
|
||||
// operand (IO). The intermediate (IO * kC1) >> 16 is held in TMP, which is
// therefore overwritten: IO = TMP + IO.
|
||||
#define MUL_SHIFT_C1_IO(IO, TMP) \
|
||||
"mul %[" #TMP "], %[" #IO "], %[kC1] \n\t" \
|
||||
"sra %[" #TMP "], %[" #TMP "], 16 \n\t" \
|
||||
"addu %[" #IO "], %[" #TMP "], %[" #IO "] \n\t"
|
||||
|
||||
// O - output
|
||||
// IO - input/output
|
||||
// I - input (macro doesn't change it)
|
||||
#define MUL_SHIFT_SUM(O0, O1, O2, O3, O4, O5, O6, O7, \
|
||||
IO0, IO1, IO2, IO3, \
|
||||
I0, I1, I2, I3, I4, I5, I6, I7) \
|
||||
"mul %[" #O0 "], %[" #I0 "], %[kC2] \n\t" \
|
||||
"mul %[" #O1 "], %[" #I0 "], %[kC1] \n\t" \
|
||||
"mul %[" #O2 "], %[" #I1 "], %[kC2] \n\t" \
|
||||
"mul %[" #O3 "], %[" #I1 "], %[kC1] \n\t" \
|
||||
"mul %[" #O4 "], %[" #I2 "], %[kC2] \n\t" \
|
||||
"mul %[" #O5 "], %[" #I2 "], %[kC1] \n\t" \
|
||||
"mul %[" #O6 "], %[" #I3 "], %[kC2] \n\t" \
|
||||
"mul %[" #O7 "], %[" #I3 "], %[kC1] \n\t" \
|
||||
"sra %[" #O0 "], %[" #O0 "], 16 \n\t" \
|
||||
"sra %[" #O1 "], %[" #O1 "], 16 \n\t" \
|
||||
"sra %[" #O2 "], %[" #O2 "], 16 \n\t" \
|
||||
"sra %[" #O3 "], %[" #O3 "], 16 \n\t" \
|
||||
"sra %[" #O4 "], %[" #O4 "], 16 \n\t" \
|
||||
"sra %[" #O5 "], %[" #O5 "], 16 \n\t" \
|
||||
"sra %[" #O6 "], %[" #O6 "], 16 \n\t" \
|
||||
"sra %[" #O7 "], %[" #O7 "], 16 \n\t" \
|
||||
MUL_SHIFT_C2(O0, I0) \
|
||||
MUL_SHIFT_C1(O1, I0) \
|
||||
MUL_SHIFT_C2(O2, I1) \
|
||||
MUL_SHIFT_C1(O3, I1) \
|
||||
MUL_SHIFT_C2(O4, I2) \
|
||||
MUL_SHIFT_C1(O5, I2) \
|
||||
MUL_SHIFT_C2(O6, I3) \
|
||||
MUL_SHIFT_C1(O7, I3) \
|
||||
"addu %[" #IO0 "], %[" #IO0 "], %[" #I4 "] \n\t" \
|
||||
"addu %[" #IO1 "], %[" #IO1 "], %[" #I5 "] \n\t" \
|
||||
"subu %[" #IO2 "], %[" #IO2 "], %[" #I6 "] \n\t" \
|
||||
|
Loading…
Reference in New Issue
Block a user