dsp/dec*: use WEBP_RESTRICT qualifier

This gives a minor improvement for Arm targets with NDK r27/gcc-13 in
H/VFilter8 (a couple fewer moves with aarch64) and much better
vectorization of DitherCombine8x8_C on most targets.

This only affects non-vector pointers; any vector pointers are left as a
follow-up.

Change-Id: I03e73e6d6404261bb8408a9ae76a4b6ef142f8f0
This commit is contained in:
James Zern 2021-07-03 17:52:50 -07:00
parent 02eac8a741
commit 201894ef24
8 changed files with 121 additions and 82 deletions

View File

@ -38,7 +38,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
} while (0)
#if !WEBP_NEON_OMIT_C_CODE
static void TransformOne_C(const int16_t* in, uint8_t* dst) {
static void TransformOne_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
@ -82,7 +83,8 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
}
// Simplified transform when only in[0], in[1] and in[4] are non-zero
static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
static void TransformAC3_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int a = in[0] + 4;
const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@ -95,7 +97,8 @@ static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
}
#undef STORE2
static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
static void TransformTwo_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
TransformOne_C(in, dst);
if (do_two) {
TransformOne_C(in + 16, dst + 4);
@ -103,13 +106,15 @@ static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void TransformUV_C(const int16_t* in, uint8_t* dst) {
static void TransformUV_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
VP8Transform(in + 0 * 16, dst, 1);
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
}
#if !WEBP_NEON_OMIT_C_CODE
static void TransformDC_C(const int16_t* in, uint8_t* dst) {
static void TransformDC_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int DC = in[0] + 4;
int i, j;
for (j = 0; j < 4; ++j) {
@ -120,7 +125,8 @@ static void TransformDC_C(const int16_t* in, uint8_t* dst) {
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
static void TransformDCUV_C(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@ -133,7 +139,8 @@ static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
// Paragraph 14.3
#if !WEBP_NEON_OMIT_C_CODE
static void TransformWHT_C(const int16_t* in, int16_t* out) {
static void TransformWHT_C(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
int tmp[16];
int i;
for (i = 0; i < 4; ++i) {
@ -161,7 +168,7 @@ static void TransformWHT_C(const int16_t* in, int16_t* out) {
}
#endif // !WEBP_NEON_OMIT_C_CODE
void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
VP8IWHT VP8TransformWHT;
//------------------------------------------------------------------------------
// Intra predictions
@ -661,32 +668,32 @@ static void HFilter16i_C(uint8_t* p, int stride,
#if !WEBP_NEON_OMIT_C_CODE
// 8-pixels wide variant, for chroma filtering
static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
#if !WEBP_NEON_OMIT_C_CODE
static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
@ -694,8 +701,8 @@ static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
//------------------------------------------------------------------------------
static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
int dst_stride) {
static void DitherCombine8x8_C(const uint8_t* WEBP_RESTRICT dither,
uint8_t* WEBP_RESTRICT dst, int dst_stride) {
int i, j;
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) {
@ -730,8 +737,8 @@ VP8SimpleFilterFunc VP8SimpleHFilter16;
VP8SimpleFilterFunc VP8SimpleVFilter16i;
VP8SimpleFilterFunc VP8SimpleHFilter16i;
void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
int dst_stride);
void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
uint8_t* WEBP_RESTRICT dst, int dst_stride);
extern VP8CPUInfo VP8GetCPUInfo;
extern void VP8DspInitSSE2(void);

View File

@ -133,26 +133,26 @@ static void HFilter16(uint8_t* p, int stride,
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
@ -215,7 +215,8 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
}
}
static void TransformOne(const int16_t* in, uint8_t* dst) {
static void TransformOne(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14;
@ -532,7 +533,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
);
}
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
static void TransformTwo(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);

View File

@ -21,7 +21,8 @@
static const int kC1 = WEBP_TRANSFORM_AC3_C1;
static const int kC2 = WEBP_TRANSFORM_AC3_C2;
static void TransformDC(const int16_t* in, uint8_t* dst) {
static void TransformDC(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
__asm__ volatile (
@ -45,7 +46,8 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
);
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static void TransformAC3(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int a = in[0] + 4;
int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@ -81,7 +83,8 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
);
}
static void TransformOne(const int16_t* in, uint8_t* dst) {
static void TransformOne(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
@ -148,7 +151,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
);
}
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
static void TransformTwo(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);
@ -434,14 +438,14 @@ static void HFilter16(uint8_t* p, int stride,
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
@ -465,14 +469,14 @@ static void HFilter16i(uint8_t* p, int stride,
}
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}

View File

@ -38,7 +38,8 @@
BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
}
static void TransformOne(const int16_t* in, uint8_t* dst) {
static void TransformOne(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
v8i16 input0, input1;
v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
v4i32 res0, res1, res2, res3;
@ -65,14 +66,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
static void TransformTwo(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);
}
}
static void TransformWHT(const int16_t* in, int16_t* out) {
static void TransformWHT(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
v8i16 input0, input1;
const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
@ -114,13 +117,15 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
out[240] = __msa_copy_s_h(out1, 7);
}
static void TransformDC(const int16_t* in, uint8_t* dst) {
static void TransformDC(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int DC = (in[0] + 4) >> 3;
const v8i16 tmp0 = __msa_fill_h(DC);
ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static void TransformAC3(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int a = in[0] + 4;
const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@ -475,8 +480,8 @@ static void HFilter16i(uint8_t* src_y, int stride,
}
// 8-pixels wide variants, for chroma filtering
static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
int b_limit_in, int limit_in, int thresh_in) {
static void VFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
int stride, int b_limit_in, int limit_in, int thresh_in) {
uint8_t* ptmp_src_u = src_u - 4 * stride;
uint8_t* ptmp_src_v = src_v - 4 * stride;
uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
@ -520,8 +525,8 @@ static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
SD(q2_d, ptmp_src_v);
}
static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
int b_limit_in, int limit_in, int thresh_in) {
static void HFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
int stride, int b_limit_in, int limit_in, int thresh_in) {
uint8_t* ptmp_src_u = src_u - 4;
uint8_t* ptmp_src_v = src_v - 4;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@ -556,7 +561,8 @@ static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
}
static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
static void VFilter8i(uint8_t* WEBP_RESTRICT src_u,
uint8_t* WEBP_RESTRICT src_v, int stride,
int b_limit_in, int limit_in, int thresh_in) {
uint64_t p1_d, p0_d, q0_d, q1_d;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@ -587,7 +593,8 @@ static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
}
static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
static void HFilter8i(uint8_t* WEBP_RESTRICT src_u,
uint8_t* WEBP_RESTRICT src_v, int stride,
int b_limit_in, int limit_in, int thresh_in) {
v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;

View File

@ -916,8 +916,8 @@ static void HFilter16i_NEON(uint8_t* p, int stride,
#endif // !WORK_AROUND_GCC
// 8-pixels wide variant, for chroma filtering
static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
@ -932,7 +932,8 @@ static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
}
}
static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride,
int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
u += 4 * stride;
@ -949,8 +950,8 @@ static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
}
#if !defined(WORK_AROUND_GCC)
static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
@ -964,7 +965,8 @@ static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
}
}
static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride,
int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
u += 4;
@ -1041,7 +1043,8 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
Transpose8x2_NEON(E0, E1, rows);
}
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
TransformPass_NEON(&rows);
@ -1051,7 +1054,8 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
#else
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int kBPS = BPS;
// kC1, kC2. Padded because vld1.16 loads 8 bytes
const int16_t constants[4] = { kC1, kC2, 0, 0 };
@ -1184,14 +1188,16 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
#endif // WEBP_USE_INTRINSICS
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
TransformOne_NEON(in, dst);
if (do_two) {
TransformOne_NEON(in + 16, dst + 4);
}
}
static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int16x8_t DC = vdupq_n_s16(in[0]);
Add4x4_NEON(DC, DC, dst);
}
@ -1205,7 +1211,8 @@ static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
*dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
} while (0)
static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out) {
int32x4x4_t tmp;
{
@ -1256,7 +1263,8 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
//------------------------------------------------------------------------------
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const int16x4_t A = vld1_dup_s16(in);
const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));

View File

@ -30,7 +30,8 @@
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
static void Transform_SSE2(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst, int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -197,7 +198,8 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
#if (USE_TRANSFORM_AC3 == 1)
static void TransformAC3_SSE2(const int16_t* in, uint8_t* dst) {
static void TransformAC3_SSE2(const int16_t* WEBP_RESTRICT in,
uint8_t* WEBP_RESTRICT dst) {
const __m128i A = _mm_set1_epi16(in[0] + 4);
const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
@ -792,8 +794,8 @@ static void HFilter16i_SSE2(uint8_t* p, int stride,
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, p2, p1, p0, q0, q1, q2;
@ -817,8 +819,8 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
STOREUV(q2, u, v, 2 * stride);
}
static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride, int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
@ -837,7 +839,8 @@ static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
}
static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
static void VFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
@ -863,7 +866,8 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
STOREUV(q1, u, v, 1 * stride);
}
static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
static void HFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;

View File

@ -63,11 +63,15 @@ extern "C" {
typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two);
typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
// TODO(jzern): merge these two typedefs after the encoder functions are
// updated to use WEBP_RESTRICT.
typedef void (*VP8FWHT)(const int16_t* in, int16_t* out);
typedef void (*VP8IWHT)(const int16_t* WEBP_RESTRICT in,
int16_t* WEBP_RESTRICT out);
extern VP8Idct VP8ITransform;
extern VP8Fdct VP8FTransform;
extern VP8Fdct VP8FTransform2; // performs two transforms at a time
extern VP8WHT VP8FTransformWHT;
extern VP8FWHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top and *left can be NULL.
typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
@ -194,15 +198,17 @@ void VP8SSIMDspInit(void);
//------------------------------------------------------------------------------
// Decoding
typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
typedef void (*VP8DecIdct)(const int16_t* WEBP_RESTRICT coeffs,
uint8_t* WEBP_RESTRICT dst);
// when doing two transforms, coeffs is actually int16_t[2][16].
typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
typedef void (*VP8DecIdct2)(const int16_t* WEBP_RESTRICT coeffs,
uint8_t* WEBP_RESTRICT dst, int do_two);
extern VP8DecIdct2 VP8Transform;
extern VP8DecIdct VP8TransformAC3;
extern VP8DecIdct VP8TransformUV;
extern VP8DecIdct VP8TransformDC;
extern VP8DecIdct VP8TransformDCUV;
extern VP8WHT VP8TransformWHT;
extern VP8IWHT VP8TransformWHT;
#define WEBP_TRANSFORM_AC3_C1 20091
#define WEBP_TRANSFORM_AC3_C2 35468
@ -234,7 +240,8 @@ extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
// regular filter (on both macroblock edges and inner edges)
typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
int thresh, int ithresh, int hev_t);
typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
typedef void (*VP8ChromaFilterFunc)(uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int stride,
int thresh, int ithresh, int hev_t);
// on outer edge
extern VP8LumaFilterFunc VP8VFilter16;
@ -254,8 +261,8 @@ extern VP8ChromaFilterFunc VP8HFilter8i;
#define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1))
#define VP8_DITHER_AMP_BITS 7
#define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS)
extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
int dst_stride);
extern void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
uint8_t* WEBP_RESTRICT dst, int dst_stride);
// must be called before anything using the above
void VP8DspInit(void);

View File

@ -720,7 +720,7 @@ VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8Fdct VP8FTransform2;
VP8WHT VP8FTransformWHT;
VP8FWHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;