mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-25 05:08:21 +01:00
dsp/dec*: use WEBP_RESTRICT qualifier
A minor improvement for arm targets with ndk r27/gcc-13 in H/VFilter8 (a couple fewer moves w/aarch64) and much better vectorization of DitherCombine8x8_C in most targets. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I03e73e6d6404261bb8408a9ae76a4b6ef142f8f0
This commit is contained in:
parent
02eac8a741
commit
201894ef24
@ -38,7 +38,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
|
||||
} while (0)
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void TransformOne_C(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformOne_C(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
int C[4 * 4], *tmp;
|
||||
int i;
|
||||
tmp = C;
|
||||
@ -82,7 +83,8 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
|
||||
}
|
||||
|
||||
// Simplified transform when only in[0], in[1] and in[4] are non-zero
|
||||
static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformAC3_C(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const int a = in[0] + 4;
|
||||
const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
|
||||
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
|
||||
@ -95,7 +97,8 @@ static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
|
||||
}
|
||||
#undef STORE2
|
||||
|
||||
static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
static void TransformTwo_C(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst, int do_two) {
|
||||
TransformOne_C(in, dst);
|
||||
if (do_two) {
|
||||
TransformOne_C(in + 16, dst + 4);
|
||||
@ -103,13 +106,15 @@ static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void TransformUV_C(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformUV_C(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
VP8Transform(in + 0 * 16, dst, 1);
|
||||
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
|
||||
}
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void TransformDC_C(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformDC_C(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const int DC = in[0] + 4;
|
||||
int i, j;
|
||||
for (j = 0; j < 4; ++j) {
|
||||
@ -120,7 +125,8 @@ static void TransformDC_C(const int16_t* in, uint8_t* dst) {
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformDCUV_C(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
|
||||
if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
|
||||
if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
|
||||
@ -133,7 +139,8 @@ static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
|
||||
// Paragraph 14.3
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void TransformWHT_C(const int16_t* in, int16_t* out) {
|
||||
static void TransformWHT_C(const int16_t* WEBP_RESTRICT in,
|
||||
int16_t* WEBP_RESTRICT out) {
|
||||
int tmp[16];
|
||||
int i;
|
||||
for (i = 0; i < 4; ++i) {
|
||||
@ -161,7 +168,7 @@ static void TransformWHT_C(const int16_t* in, int16_t* out) {
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
|
||||
VP8IWHT VP8TransformWHT;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Intra predictions
|
||||
@ -661,32 +668,32 @@ static void HFilter16i_C(uint8_t* p, int stride,
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
// 8-pixels wide variant, for chroma filtering
|
||||
static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
@ -694,8 +701,8 @@ static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
|
||||
int dst_stride) {
|
||||
static void DitherCombine8x8_C(const uint8_t* WEBP_RESTRICT dither,
|
||||
uint8_t* WEBP_RESTRICT dst, int dst_stride) {
|
||||
int i, j;
|
||||
for (j = 0; j < 8; ++j) {
|
||||
for (i = 0; i < 8; ++i) {
|
||||
@ -730,8 +737,8 @@ VP8SimpleFilterFunc VP8SimpleHFilter16;
|
||||
VP8SimpleFilterFunc VP8SimpleVFilter16i;
|
||||
VP8SimpleFilterFunc VP8SimpleHFilter16i;
|
||||
|
||||
void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
|
||||
int dst_stride);
|
||||
void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
|
||||
uint8_t* WEBP_RESTRICT dst, int dst_stride);
|
||||
|
||||
extern VP8CPUInfo VP8GetCPUInfo;
|
||||
extern void VP8DspInitSSE2(void);
|
||||
|
@ -133,26 +133,26 @@ static void HFilter16(uint8_t* p, int stride,
|
||||
}
|
||||
|
||||
// 8-pixels wide variant, for chroma filtering
|
||||
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
|
||||
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
|
||||
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
|
||||
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
@ -215,7 +215,8 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
|
||||
}
|
||||
}
|
||||
|
||||
static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformOne(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
int temp0, temp1, temp2, temp3, temp4;
|
||||
int temp5, temp6, temp7, temp8, temp9;
|
||||
int temp10, temp11, temp12, temp13, temp14;
|
||||
@ -532,7 +533,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
);
|
||||
}
|
||||
|
||||
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
static void TransformTwo(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst, int do_two) {
|
||||
TransformOne(in, dst);
|
||||
if (do_two) {
|
||||
TransformOne(in + 16, dst + 4);
|
||||
|
@ -21,7 +21,8 @@
|
||||
static const int kC1 = WEBP_TRANSFORM_AC3_C1;
|
||||
static const int kC2 = WEBP_TRANSFORM_AC3_C2;
|
||||
|
||||
static void TransformDC(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformDC(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
|
||||
|
||||
__asm__ volatile (
|
||||
@ -45,7 +46,8 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
|
||||
);
|
||||
}
|
||||
|
||||
static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformAC3(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const int a = in[0] + 4;
|
||||
int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
|
||||
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
|
||||
@ -81,7 +83,8 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
||||
);
|
||||
}
|
||||
|
||||
static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformOne(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
|
||||
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
|
||||
|
||||
@ -148,7 +151,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
);
|
||||
}
|
||||
|
||||
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
static void TransformTwo(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst, int do_two) {
|
||||
TransformOne(in, dst);
|
||||
if (do_two) {
|
||||
TransformOne(in + 16, dst + 4);
|
||||
@ -434,14 +438,14 @@ static void HFilter16(uint8_t* p, int stride,
|
||||
}
|
||||
|
||||
// 8-pixels wide variant, for chroma filtering
|
||||
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
|
||||
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
@ -465,14 +469,14 @@ static void HFilter16i(uint8_t* p, int stride,
|
||||
}
|
||||
}
|
||||
|
||||
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
|
||||
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
|
@ -38,7 +38,8 @@
|
||||
BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
|
||||
}
|
||||
|
||||
static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformOne(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
v8i16 input0, input1;
|
||||
v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
|
||||
v4i32 res0, res1, res2, res3;
|
||||
@ -65,14 +66,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
|
||||
}
|
||||
|
||||
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
static void TransformTwo(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst, int do_two) {
|
||||
TransformOne(in, dst);
|
||||
if (do_two) {
|
||||
TransformOne(in + 16, dst + 4);
|
||||
}
|
||||
}
|
||||
|
||||
static void TransformWHT(const int16_t* in, int16_t* out) {
|
||||
static void TransformWHT(const int16_t* WEBP_RESTRICT in,
|
||||
int16_t* WEBP_RESTRICT out) {
|
||||
v8i16 input0, input1;
|
||||
const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
|
||||
const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
|
||||
@ -114,13 +117,15 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
|
||||
out[240] = __msa_copy_s_h(out1, 7);
|
||||
}
|
||||
|
||||
static void TransformDC(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformDC(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const int DC = (in[0] + 4) >> 3;
|
||||
const v8i16 tmp0 = __msa_fill_h(DC);
|
||||
ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
|
||||
}
|
||||
|
||||
static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformAC3(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const int a = in[0] + 4;
|
||||
const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
|
||||
const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
|
||||
@ -475,8 +480,8 @@ static void HFilter16i(uint8_t* src_y, int stride,
|
||||
}
|
||||
|
||||
// 8-pixels wide variants, for chroma filtering
|
||||
static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
|
||||
int b_limit_in, int limit_in, int thresh_in) {
|
||||
static void VFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
|
||||
int stride, int b_limit_in, int limit_in, int thresh_in) {
|
||||
uint8_t* ptmp_src_u = src_u - 4 * stride;
|
||||
uint8_t* ptmp_src_v = src_v - 4 * stride;
|
||||
uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
|
||||
@ -520,8 +525,8 @@ static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
|
||||
SD(q2_d, ptmp_src_v);
|
||||
}
|
||||
|
||||
static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
|
||||
int b_limit_in, int limit_in, int thresh_in) {
|
||||
static void HFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
|
||||
int stride, int b_limit_in, int limit_in, int thresh_in) {
|
||||
uint8_t* ptmp_src_u = src_u - 4;
|
||||
uint8_t* ptmp_src_v = src_v - 4;
|
||||
v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
|
||||
@ -556,7 +561,8 @@ static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
|
||||
ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
|
||||
}
|
||||
|
||||
static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
|
||||
static void VFilter8i(uint8_t* WEBP_RESTRICT src_u,
|
||||
uint8_t* WEBP_RESTRICT src_v, int stride,
|
||||
int b_limit_in, int limit_in, int thresh_in) {
|
||||
uint64_t p1_d, p0_d, q0_d, q1_d;
|
||||
v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
|
||||
@ -587,7 +593,8 @@ static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
|
||||
SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
|
||||
}
|
||||
|
||||
static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
|
||||
static void HFilter8i(uint8_t* WEBP_RESTRICT src_u,
|
||||
uint8_t* WEBP_RESTRICT src_v, int stride,
|
||||
int b_limit_in, int limit_in, int thresh_in) {
|
||||
v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
|
||||
v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
|
||||
|
@ -916,8 +916,8 @@ static void HFilter16i_NEON(uint8_t* p, int stride,
|
||||
#endif // !WORK_AROUND_GCC
|
||||
|
||||
// 8-pixels wide variant, for chroma filtering
|
||||
static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
|
||||
Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
||||
{
|
||||
@ -932,7 +932,8 @@ static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
|
||||
Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
|
||||
}
|
||||
}
|
||||
static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
|
||||
static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
|
||||
u += 4 * stride;
|
||||
@ -949,8 +950,8 @@ static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
|
||||
}
|
||||
|
||||
#if !defined(WORK_AROUND_GCC)
|
||||
static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
|
||||
Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
||||
{
|
||||
@ -964,7 +965,8 @@ static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
|
||||
}
|
||||
}
|
||||
|
||||
static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
|
||||
static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
|
||||
u += 4;
|
||||
@ -1041,7 +1043,8 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
|
||||
Transpose8x2_NEON(E0, E1, rows);
|
||||
}
|
||||
|
||||
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
int16x8x2_t rows;
|
||||
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
|
||||
TransformPass_NEON(&rows);
|
||||
@ -1051,7 +1054,8 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
|
||||
|
||||
#else
|
||||
|
||||
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const int kBPS = BPS;
|
||||
// kC1, kC2. Padded because vld1.16 loads 8 bytes
|
||||
const int16_t constants[4] = { kC1, kC2, 0, 0 };
|
||||
@ -1184,14 +1188,16 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
|
||||
|
||||
#endif // WEBP_USE_INTRINSICS
|
||||
|
||||
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst, int do_two) {
|
||||
TransformOne_NEON(in, dst);
|
||||
if (do_two) {
|
||||
TransformOne_NEON(in + 16, dst + 4);
|
||||
}
|
||||
}
|
||||
|
||||
static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const int16x8_t DC = vdupq_n_s16(in[0]);
|
||||
Add4x4_NEON(DC, DC, dst);
|
||||
}
|
||||
@ -1205,7 +1211,8 @@ static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
|
||||
*dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
|
||||
} while (0)
|
||||
|
||||
static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
|
||||
static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
|
||||
int16_t* WEBP_RESTRICT out) {
|
||||
int32x4x4_t tmp;
|
||||
|
||||
{
|
||||
@ -1256,7 +1263,8 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const int16x4_t A = vld1_dup_s16(in);
|
||||
const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
|
||||
const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
|
||||
|
@ -30,7 +30,8 @@
|
||||
//------------------------------------------------------------------------------
|
||||
// Transforms (Paragraph 14.4)
|
||||
|
||||
static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
static void Transform_SSE2(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst, int do_two) {
|
||||
// This implementation makes use of 16-bit fixed point versions of two
|
||||
// multiply constants:
|
||||
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
|
||||
@ -197,7 +198,8 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
|
||||
#if (USE_TRANSFORM_AC3 == 1)
|
||||
|
||||
static void TransformAC3_SSE2(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformAC3_SSE2(const int16_t* WEBP_RESTRICT in,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const __m128i A = _mm_set1_epi16(in[0] + 4);
|
||||
const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
|
||||
const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
|
||||
@ -792,8 +794,8 @@ static void HFilter16i_SSE2(uint8_t* p, int stride,
|
||||
}
|
||||
|
||||
// 8-pixels wide variant, for chroma filtering
|
||||
static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
__m128i mask;
|
||||
__m128i t1, p2, p1, p0, q0, q1, q2;
|
||||
|
||||
@ -817,8 +819,8 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
STOREUV(q2, u, v, 2 * stride);
|
||||
}
|
||||
|
||||
static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride, int thresh, int ithresh, int hev_thresh) {
|
||||
__m128i mask;
|
||||
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
|
||||
|
||||
@ -837,7 +839,8 @@ static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
|
||||
}
|
||||
|
||||
static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
static void VFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
__m128i mask;
|
||||
__m128i t1, t2, p1, p0, q0, q1;
|
||||
@ -863,7 +866,8 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
STOREUV(q1, u, v, 1 * stride);
|
||||
}
|
||||
|
||||
static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
static void HFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
__m128i mask;
|
||||
__m128i t1, t2, p1, p0, q0, q1;
|
||||
|
@ -63,11 +63,15 @@ extern "C" {
|
||||
typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
||||
int do_two);
|
||||
typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
|
||||
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
|
||||
// TODO(jzern): merge these two typedefs after the encoder functions are
|
||||
// updated to use WEBP_RESTRICT.
|
||||
typedef void (*VP8FWHT)(const int16_t* in, int16_t* out);
|
||||
typedef void (*VP8IWHT)(const int16_t* WEBP_RESTRICT in,
|
||||
int16_t* WEBP_RESTRICT out);
|
||||
extern VP8Idct VP8ITransform;
|
||||
extern VP8Fdct VP8FTransform;
|
||||
extern VP8Fdct VP8FTransform2; // performs two transforms at a time
|
||||
extern VP8WHT VP8FTransformWHT;
|
||||
extern VP8FWHT VP8FTransformWHT;
|
||||
// Predictions
|
||||
// *dst is the destination block. *top and *left can be NULL.
|
||||
typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
|
||||
@ -194,15 +198,17 @@ void VP8SSIMDspInit(void);
|
||||
//------------------------------------------------------------------------------
|
||||
// Decoding
|
||||
|
||||
typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
|
||||
typedef void (*VP8DecIdct)(const int16_t* WEBP_RESTRICT coeffs,
|
||||
uint8_t* WEBP_RESTRICT dst);
|
||||
// when doing two transforms, coeffs is actually int16_t[2][16].
|
||||
typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
|
||||
typedef void (*VP8DecIdct2)(const int16_t* WEBP_RESTRICT coeffs,
|
||||
uint8_t* WEBP_RESTRICT dst, int do_two);
|
||||
extern VP8DecIdct2 VP8Transform;
|
||||
extern VP8DecIdct VP8TransformAC3;
|
||||
extern VP8DecIdct VP8TransformUV;
|
||||
extern VP8DecIdct VP8TransformDC;
|
||||
extern VP8DecIdct VP8TransformDCUV;
|
||||
extern VP8WHT VP8TransformWHT;
|
||||
extern VP8IWHT VP8TransformWHT;
|
||||
|
||||
#define WEBP_TRANSFORM_AC3_C1 20091
|
||||
#define WEBP_TRANSFORM_AC3_C2 35468
|
||||
@ -234,7 +240,8 @@ extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
|
||||
// regular filter (on both macroblock edges and inner edges)
|
||||
typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
|
||||
int thresh, int ithresh, int hev_t);
|
||||
typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
|
||||
typedef void (*VP8ChromaFilterFunc)(uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v, int stride,
|
||||
int thresh, int ithresh, int hev_t);
|
||||
// on outer edge
|
||||
extern VP8LumaFilterFunc VP8VFilter16;
|
||||
@ -254,8 +261,8 @@ extern VP8ChromaFilterFunc VP8HFilter8i;
|
||||
#define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1))
|
||||
#define VP8_DITHER_AMP_BITS 7
|
||||
#define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS)
|
||||
extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
|
||||
int dst_stride);
|
||||
extern void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
|
||||
uint8_t* WEBP_RESTRICT dst, int dst_stride);
|
||||
|
||||
// must be called before anything using the above
|
||||
void VP8DspInit(void);
|
||||
|
@ -720,7 +720,7 @@ VP8CHisto VP8CollectHistogram;
|
||||
VP8Idct VP8ITransform;
|
||||
VP8Fdct VP8FTransform;
|
||||
VP8Fdct VP8FTransform2;
|
||||
VP8WHT VP8FTransformWHT;
|
||||
VP8FWHT VP8FTransformWHT;
|
||||
VP8Intra4Preds VP8EncPredLuma4;
|
||||
VP8IntraPreds VP8EncPredLuma16;
|
||||
VP8IntraPreds VP8EncPredChroma8;
|
||||
|
Loading…
Reference in New Issue
Block a user