From 201894ef248f745288290e1a676c679b998d6673 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 3 Jul 2021 17:52:50 -0700 Subject: [PATCH] dsp/dec*: use WEBP_RESTRICT qualifier A minor improvement for arm targets with ndk r27/gcc-13 in H/VFilter8 (a couple fewer moves w/aarch64) and much better vectorization of DitherCombine8x8_C in most targets. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I03e73e6d6404261bb8408a9ae76a4b6ef142f8f0 --- src/dsp/dec.c | 47 ++++++++++++++++++++++----------------- src/dsp/dec_mips32.c | 22 +++++++++--------- src/dsp/dec_mips_dsp_r2.c | 28 +++++++++++++---------- src/dsp/dec_msa.c | 29 +++++++++++++++--------- src/dsp/dec_neon.c | 32 ++++++++++++++++---------- src/dsp/dec_sse2.c | 20 ++++++++++------- src/dsp/dsp.h | 23 ++++++++++++------- src/dsp/enc.c | 2 +- 8 files changed, 121 insertions(+), 82 deletions(-) diff --git a/src/dsp/dec.c b/src/dsp/dec.c index 451d649d..51067f45 100644 --- a/src/dsp/dec.c +++ b/src/dsp/dec.c @@ -38,7 +38,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) { } while (0) #if !WEBP_NEON_OMIT_C_CODE -static void TransformOne_C(const int16_t* in, uint8_t* dst) { +static void TransformOne_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int C[4 * 4], *tmp; int i; tmp = C; @@ -82,7 +83,8 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) { } // Simplified transform when only in[0], in[1] and in[4] are non-zero -static void TransformAC3_C(const int16_t* in, uint8_t* dst) { +static void TransformAC3_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int a = in[0] + 4; const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]); const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]); @@ -95,7 +97,8 @@ static void TransformAC3_C(const int16_t* in, uint8_t* dst) { } #undef STORE2 -static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) { +static void TransformTwo_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { TransformOne_C(in, dst); if (do_two) { TransformOne_C(in + 16, dst + 4); @@ -103,13 +106,15 @@ static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) { } #endif // !WEBP_NEON_OMIT_C_CODE -static void TransformUV_C(const int16_t* in, uint8_t* dst) { +static void TransformUV_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { VP8Transform(in + 0 * 16, dst, 1); VP8Transform(in + 2 * 16, dst + 4 * BPS, 1); } #if !WEBP_NEON_OMIT_C_CODE -static void TransformDC_C(const int16_t* in, uint8_t* dst) { +static void TransformDC_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int DC = in[0] + 4; int i, j; for (j = 0; j < 4; ++j) { @@ -120,7 +125,8 @@ static void TransformDC_C(const int16_t* in, uint8_t* dst) { } #endif // !WEBP_NEON_OMIT_C_CODE -static void TransformDCUV_C(const int16_t* in, uint8_t* dst) { +static void TransformDCUV_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst); if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4); if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS); @@ -133,7 +139,8 @@ static void TransformDCUV_C(const int16_t* in, uint8_t* dst) { // Paragraph 14.3 #if !WEBP_NEON_OMIT_C_CODE -static void TransformWHT_C(const int16_t* in, int16_t* out) { +static void TransformWHT_C(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { int tmp[16]; int i; for (i = 0; i < 4; ++i) { @@ -161,7 +168,7 @@ static void TransformWHT_C(const int16_t* in, int16_t* out) { } #endif // !WEBP_NEON_OMIT_C_CODE -void (*VP8TransformWHT)(const int16_t* in, int16_t* out); +VP8IWHT VP8TransformWHT; //------------------------------------------------------------------------------ // Intra predictions @@ -661,32 +668,32 @@ static void HFilter16i_C(uint8_t* p, int stride, #if !WEBP_NEON_OMIT_C_CODE // 8-pixels wide variant, for chroma filtering -static void VFilter8_C(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh); } #endif // !WEBP_NEON_OMIT_C_CODE #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC -static void HFilter8_C(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh); } #endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC #if !WEBP_NEON_OMIT_C_CODE -static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); } #endif // !WEBP_NEON_OMIT_C_CODE #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC -static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); } @@ -694,8 +701,8 @@ static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride, //------------------------------------------------------------------------------ -static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst, - int dst_stride) { +static void DitherCombine8x8_C(const uint8_t* WEBP_RESTRICT dither, + uint8_t* WEBP_RESTRICT dst, int dst_stride) { int i, j; for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) { @@ -730,8 +737,8 @@ VP8SimpleFilterFunc VP8SimpleHFilter16; VP8SimpleFilterFunc VP8SimpleVFilter16i; VP8SimpleFilterFunc VP8SimpleHFilter16i; -void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, - int dst_stride); +void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither, + uint8_t* WEBP_RESTRICT dst, int dst_stride); extern VP8CPUInfo VP8GetCPUInfo; extern void VP8DspInitSSE2(void); diff --git a/src/dsp/dec_mips32.c b/src/dsp/dec_mips32.c index f0e7de4a..89fe9009 100644 --- a/src/dsp/dec_mips32.c +++ b/src/dsp/dec_mips32.c @@ -133,26 +133,26 @@ static void HFilter16(uint8_t* p, int stride, } // 8-pixels wide variant, for chroma filtering -static void VFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh); } -static void HFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh); } -static void VFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); } -static void HFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); } @@ -215,7 +215,8 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { } } -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOne(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14; @@ -532,7 +533,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { ); } -static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { +static void TransformTwo(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { TransformOne(in, dst); if (do_two) { TransformOne(in + 16, dst + 4); diff --git a/src/dsp/dec_mips_dsp_r2.c b/src/dsp/dec_mips_dsp_r2.c index 0ba706a2..03b5f122 100644 --- a/src/dsp/dec_mips_dsp_r2.c +++ b/src/dsp/dec_mips_dsp_r2.c @@ -21,7 +21,8 @@ static const int kC1 = WEBP_TRANSFORM_AC3_C1; static const int kC2 = WEBP_TRANSFORM_AC3_C2; -static void TransformDC(const int16_t* in, uint8_t* dst) { +static void TransformDC(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10; __asm__ volatile ( @@ -45,7 +46,8 @@ static void TransformDC(const int16_t* in, uint8_t* dst) { ); } -static void TransformAC3(const int16_t* in, uint8_t* dst) { +static void TransformAC3(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int a = in[0] + 4; int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]); const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]); @@ -81,7 +83,8 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { ); } -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOne(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; @@ -148,7 +151,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { ); } -static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { +static void TransformTwo(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { TransformOne(in, dst); if (do_two) { TransformOne(in + 16, dst + 4); @@ -434,14 +438,14 @@ static void HFilter16(uint8_t* p, int stride, } // 8-pixels wide variant, for chroma filtering -static void VFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh); } -static void HFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh); } @@ -465,14 +469,14 @@ static void HFilter16i(uint8_t* p, int stride, } } -static void VFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); } -static void HFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); } diff --git a/src/dsp/dec_msa.c b/src/dsp/dec_msa.c index 58d17301..422b3632 100644 --- a/src/dsp/dec_msa.c +++ b/src/dsp/dec_msa.c @@ -38,7 +38,8 @@ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ } -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOne(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { v8i16 input0, input1; v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; v4i32 res0, res1, res2, res3; @@ -65,14 +66,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); } -static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { +static void TransformTwo(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { TransformOne(in, dst); if (do_two) { TransformOne(in + 16, dst + 4); } } -static void TransformWHT(const int16_t* in, int16_t* out) { +static void TransformWHT(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { v8i16 input0, input1; const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; @@ -114,13 +117,15 @@ static void TransformWHT(const int16_t* in, int16_t* out) { out[240] = __msa_copy_s_h(out1, 7); } -static void TransformDC(const int16_t* in, uint8_t* dst) { +static void TransformDC(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int DC = (in[0] + 4) >> 3; const v8i16 tmp0 = __msa_fill_h(DC); ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS); } -static void TransformAC3(const int16_t* in, uint8_t* dst) { +static void TransformAC3(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int a = in[0] + 4; const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]); const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]); @@ -475,8 +480,8 @@ static void HFilter16i(uint8_t* src_y, int stride, } // 8-pixels wide variants, for chroma filtering -static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride, - int b_limit_in, int limit_in, int thresh_in) { +static void VFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v, + int stride, int b_limit_in, int limit_in, int thresh_in) { uint8_t* ptmp_src_u = src_u - 4 * stride; uint8_t* ptmp_src_v = src_v - 4 * stride; uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; @@ -520,8 +525,8 @@ static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride, SD(q2_d, ptmp_src_v); } -static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride, - int b_limit_in, int limit_in, int thresh_in) { +static void HFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v, + int stride, int b_limit_in, int limit_in, int thresh_in) { uint8_t* ptmp_src_u = src_u - 4; uint8_t* ptmp_src_v = src_v - 4; v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev; @@ -556,7 +561,8 @@ static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride, ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride); } -static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride, +static void VFilter8i(uint8_t* WEBP_RESTRICT src_u, + uint8_t* WEBP_RESTRICT src_v, int stride, int b_limit_in, int limit_in, int thresh_in) { uint64_t p1_d, p0_d, q0_d, q1_d; v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev; @@ -587,7 +593,8 @@ static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride, SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride); } -static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride, +static void HFilter8i(uint8_t* WEBP_RESTRICT src_u, + uint8_t* WEBP_RESTRICT src_v, int stride, int b_limit_in, int limit_in, int thresh_in) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev; v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c index b4c46f70..f150692a 100644 --- a/src/dsp/dec_neon.c +++ b/src/dsp/dec_neon.c @@ -916,8 +916,8 @@ static void HFilter16i_NEON(uint8_t* p, int stride, #endif // !WORK_AROUND_GCC // 8-pixels wide variant, for chroma filtering -static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { @@ -932,7 +932,8 @@ static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride, Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride); } } -static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, +static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; u += 4 * stride; @@ -949,8 +950,8 @@ static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, } #if !defined(WORK_AROUND_GCC) -static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { @@ -964,7 +965,8 @@ static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride, } } -static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, +static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; u += 4; @@ -1041,7 +1043,8 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) { Transpose8x2_NEON(E0, E1, rows); } -static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { +static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int16x8x2_t rows; INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); TransformPass_NEON(&rows); @@ -1051,7 +1054,8 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { #else -static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { +static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int kBPS = BPS; // kC1, kC2. Padded because vld1.16 loads 8 bytes const int16_t constants[4] = { kC1, kC2, 0, 0 }; @@ -1184,14 +1188,16 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { #endif // WEBP_USE_INTRINSICS -static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) { +static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { TransformOne_NEON(in, dst); if (do_two) { TransformOne_NEON(in + 16, dst + 4); } } -static void TransformDC_NEON(const int16_t* in, uint8_t* dst) { +static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int16x8_t DC = vdupq_n_s16(in[0]); Add4x4_NEON(DC, DC, dst); } @@ -1205,7 +1211,8 @@ static void TransformDC_NEON(const int16_t* in, uint8_t* dst) { *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \ } while (0) -static void TransformWHT_NEON(const int16_t* in, int16_t* out) { +static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { int32x4x4_t tmp; { @@ -1256,7 +1263,8 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) { //------------------------------------------------------------------------------ -static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) { +static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int16x4_t A = vld1_dup_s16(in); const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4])); const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4])); diff --git a/src/dsp/dec_sse2.c b/src/dsp/dec_sse2.c index 5fd85e6f..b0faada8 100644 --- a/src/dsp/dec_sse2.c +++ b/src/dsp/dec_sse2.c @@ -30,7 +30,8 @@ //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) -static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { +static void Transform_SSE2(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -197,7 +198,8 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { #if (USE_TRANSFORM_AC3 == 1) -static void TransformAC3_SSE2(const int16_t* in, uint8_t* dst) { +static void TransformAC3_SSE2(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const __m128i A = _mm_set1_epi16(in[0] + 4); const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4])); const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4])); @@ -792,8 +794,8 @@ static void HFilter16i_SSE2(uint8_t* p, int stride, } // 8-pixels wide variant, for chroma filtering -static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, p2, p1, p0, q0, q1, q2; @@ -817,8 +819,8 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, STOREUV(q2, u, v, 2 * stride); } -static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i p3, p2, p1, p0, q0, q1, q2, q3; @@ -837,7 +839,8 @@ static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride); } -static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride, +static void VFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, t2, p1, p0, q0, q1; @@ -863,7 +866,8 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride, STOREUV(q1, u, v, 1 * stride); } -static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride, +static void HFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, t2, p1, p0, q0, q1; diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 82029533..bd126dea 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -63,11 +63,15 @@ extern "C" { typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst, int do_two); typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out); -typedef void (*VP8WHT)(const int16_t* in, int16_t* out); +// TODO(jzern): merge these two typedefs after the encoder functions are +// updated to use WEBP_RESTRICT. +typedef void (*VP8FWHT)(const int16_t* in, int16_t* out); +typedef void (*VP8IWHT)(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out); extern VP8Idct VP8ITransform; extern VP8Fdct VP8FTransform; extern VP8Fdct VP8FTransform2; // performs two transforms at a time -extern VP8WHT VP8FTransformWHT; +extern VP8FWHT VP8FTransformWHT; // Predictions // *dst is the destination block. *top and *left can be NULL. typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left, @@ -194,15 +198,17 @@ void VP8SSIMDspInit(void); //------------------------------------------------------------------------------ // Decoding -typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst); +typedef void (*VP8DecIdct)(const int16_t* WEBP_RESTRICT coeffs, + uint8_t* WEBP_RESTRICT dst); // when doing two transforms, coeffs is actually int16_t[2][16]. -typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two); +typedef void (*VP8DecIdct2)(const int16_t* WEBP_RESTRICT coeffs, + uint8_t* WEBP_RESTRICT dst, int do_two); extern VP8DecIdct2 VP8Transform; extern VP8DecIdct VP8TransformAC3; extern VP8DecIdct VP8TransformUV; extern VP8DecIdct VP8TransformDC; extern VP8DecIdct VP8TransformDCUV; -extern VP8WHT VP8TransformWHT; +extern VP8IWHT VP8TransformWHT; #define WEBP_TRANSFORM_AC3_C1 20091 #define WEBP_TRANSFORM_AC3_C2 35468 @@ -234,7 +240,8 @@ extern VP8SimpleFilterFunc VP8SimpleHFilter16i; // regular filter (on both macroblock edges and inner edges) typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride, int thresh, int ithresh, int hev_t); -typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride, +typedef void (*VP8ChromaFilterFunc)(uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int stride, int thresh, int ithresh, int hev_t); // on outer edge extern VP8LumaFilterFunc VP8VFilter16; @@ -254,8 +261,8 @@ extern VP8ChromaFilterFunc VP8HFilter8i; #define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1)) #define VP8_DITHER_AMP_BITS 7 #define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS) -extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, - int dst_stride); +extern void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither, + uint8_t* WEBP_RESTRICT dst, int dst_stride); // must be called before anything using the above void VP8DspInit(void); diff --git a/src/dsp/enc.c b/src/dsp/enc.c index cb44ba6f..95c623d9 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -720,7 +720,7 @@ VP8CHisto VP8CollectHistogram; VP8Idct VP8ITransform; VP8Fdct VP8FTransform; VP8Fdct VP8FTransform2; -VP8WHT VP8FTransformWHT; +VP8FWHT VP8FTransformWHT; VP8Intra4Preds VP8EncPredLuma4; VP8IntraPreds VP8EncPredLuma16; VP8IntraPreds VP8EncPredChroma8;