From 02eac8a741a3d6b9bfacbceb750c44fba7ed4662 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 3 Jul 2021 17:45:13 -0700 Subject: [PATCH 1/8] dsp/cost*: use WEBP_RESTRICT qualifier on SetResidualCoeffs_*. This results in some minor code reordering when targeting armv7 with ndk r27 and other recent versions of clang. No changes in the x86 compilations with clang-16 / gcc-13. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I7c3554ece848fafbc5ac9c4944f1dc85129f6fd8 --- src/dsp/cost.c | 4 ++-- src/dsp/cost_mips32.c | 4 ++-- src/dsp/cost_neon.c | 4 ++-- src/dsp/cost_sse2.c | 4 ++-- src/dsp/dsp.h | 5 +++-- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/dsp/cost.c b/src/dsp/cost.c index 73d21401..609f9264 100644 --- a/src/dsp/cost.c +++ b/src/dsp/cost.c @@ -354,8 +354,8 @@ static int GetResidualCost_C(int ctx0, const VP8Residual* const res) { return cost; } -static void SetResidualCoeffs_C(const int16_t* const coeffs, - VP8Residual* const res) { +static void SetResidualCoeffs_C(const int16_t* WEBP_RESTRICT const coeffs, + VP8Residual* WEBP_RESTRICT const res) { int n; res->last = -1; assert(res->first == 0 || coeffs[0] == 0); diff --git a/src/dsp/cost_mips32.c b/src/dsp/cost_mips32.c index 0500f88c..54586576 100644 --- a/src/dsp/cost_mips32.c +++ b/src/dsp/cost_mips32.c @@ -96,8 +96,8 @@ static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) { return cost; } -static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs, - VP8Residual* const res) { +static void SetResidualCoeffs_MIPS32(const int16_t* WEBP_RESTRICT const coeffs, + VP8Residual* WEBP_RESTRICT const res) { const int16_t* p_coeffs = (int16_t*)coeffs; int temp0, temp1, temp2, n, n1; assert(res->first == 0 || coeffs[0] == 0); diff --git a/src/dsp/cost_neon.c b/src/dsp/cost_neon.c index 6582669c..e1bf3657 100644 --- a/src/dsp/cost_neon.c +++ b/src/dsp/cost_neon.c @@ -19,8 +19,8 @@ static const uint8_t position[16] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; -static void SetResidualCoeffs_NEON(const int16_t* const coeffs, - VP8Residual* const res) { +static void SetResidualCoeffs_NEON(const int16_t* WEBP_RESTRICT const coeffs, + VP8Residual* WEBP_RESTRICT const res) { const int16x8_t minus_one = vdupq_n_s16(-1); const int16x8_t coeffs_0 = vld1q_s16(coeffs); const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8); diff --git a/src/dsp/cost_sse2.c b/src/dsp/cost_sse2.c index 487a0799..a869b48d 100644 --- a/src/dsp/cost_sse2.c +++ b/src/dsp/cost_sse2.c @@ -22,8 +22,8 @@ //------------------------------------------------------------------------------ -static void SetResidualCoeffs_SSE2(const int16_t* const coeffs, - VP8Residual* const res) { +static void SetResidualCoeffs_SSE2(const int16_t* WEBP_RESTRICT const coeffs, + VP8Residual* WEBP_RESTRICT const res) { const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0)); const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8)); // Use SSE2 to compare 16 values with a single instruction.
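For illustration only (not part of the patch series), a minimal sketch of the idea behind these changes, assuming WEBP_RESTRICT expands to the compiler's restrict keyword when one is available; the exact definition in libwebp may differ, and ExampleResidual / ExampleSetCoeffs below are hypothetical names rather than library functions:

#include <stdint.h>

/* Assumed definition: map WEBP_RESTRICT to a non-standard restrict keyword
 * where supported, otherwise make it a no-op. */
#ifndef WEBP_RESTRICT
#if defined(__GNUC__) || defined(__clang__)
#define WEBP_RESTRICT __restrict__
#elif defined(_MSC_VER)
#define WEBP_RESTRICT __restrict
#else
#define WEBP_RESTRICT
#endif
#endif

typedef struct { int first, last; } ExampleResidual;  /* hypothetical type */

/* Because 'coeffs' and 'res' are declared non-aliasing, the compiler may keep
 * coeffs[] values in registers across the stores to res->last and vectorize
 * the scan without conservative reloads; without the qualifier it must assume
 * a store through 'res' could modify the coefficient array. */
static void ExampleSetCoeffs(const int16_t* WEBP_RESTRICT coeffs,
                             ExampleResidual* WEBP_RESTRICT res) {
  int n;
  res->last = -1;
  for (n = 15; n >= 0; --n) {
    if (coeffs[n] != 0) {
      res->last = n;
      break;
    }
  }
}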
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 23bc2965..82029533 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -138,8 +138,9 @@ extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1]; extern const uint8_t VP8EncBands[16 + 1]; struct VP8Residual; -typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs, - struct VP8Residual* const res); +typedef void (*VP8SetResidualCoeffsFunc)( + const int16_t* WEBP_RESTRICT const coeffs, + struct VP8Residual* WEBP_RESTRICT const res); extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs; // Cost calculation function. From 201894ef248f745288290e1a676c679b998d6673 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 3 Jul 2021 17:52:50 -0700 Subject: [PATCH 2/8] dsp/dec*: use WEBP_RESTRICT qualifier A minor improvement for arm targets with ndk r27/gcc-13 in H/VFilter8 (a couple fewer moves w/aarch64) and much better vectorization of DitherCombine8x8_C in most targets. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I03e73e6d6404261bb8408a9ae76a4b6ef142f8f0 --- src/dsp/dec.c | 47 ++++++++++++++++++++++----------------- src/dsp/dec_mips32.c | 22 +++++++++--------- src/dsp/dec_mips_dsp_r2.c | 28 +++++++++++++---------- src/dsp/dec_msa.c | 29 +++++++++++++++--------- src/dsp/dec_neon.c | 32 ++++++++++++++++---------- src/dsp/dec_sse2.c | 20 ++++++++++------- src/dsp/dsp.h | 23 ++++++++++++------- src/dsp/enc.c | 2 +- 8 files changed, 121 insertions(+), 82 deletions(-) diff --git a/src/dsp/dec.c b/src/dsp/dec.c index 451d649d..51067f45 100644 --- a/src/dsp/dec.c +++ b/src/dsp/dec.c @@ -38,7 +38,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) { } while (0) #if !WEBP_NEON_OMIT_C_CODE -static void TransformOne_C(const int16_t* in, uint8_t* dst) { +static void TransformOne_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int C[4 * 4], *tmp; int i; tmp = C; @@ -82,7 +83,8 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) { } // Simplified transform when only in[0], in[1] and in[4] are non-zero -static void TransformAC3_C(const int16_t* in, uint8_t* dst) { +static void TransformAC3_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int a = in[0] + 4; const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]); const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]); @@ -95,7 +97,8 @@ static void TransformAC3_C(const int16_t* in, uint8_t* dst) { } #undef STORE2 -static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) { +static void TransformTwo_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { TransformOne_C(in, dst); if (do_two) { TransformOne_C(in + 16, dst + 4); @@ -103,13 +106,15 @@ static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) { } #endif // !WEBP_NEON_OMIT_C_CODE -static void TransformUV_C(const int16_t* in, uint8_t* dst) { +static void TransformUV_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { VP8Transform(in + 0 * 16, dst, 1); VP8Transform(in + 2 * 16, dst + 4 * BPS, 1); } #if !WEBP_NEON_OMIT_C_CODE -static void TransformDC_C(const int16_t* in, uint8_t* dst) { +static void TransformDC_C(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int DC = in[0] + 4; int i, j; for (j = 0; j < 4; ++j) { @@ -120,7 +125,8 @@ static void TransformDC_C(const int16_t* in, uint8_t* dst) { } #endif // !WEBP_NEON_OMIT_C_CODE -static void TransformDCUV_C(const int16_t* in, uint8_t* dst) { +static void TransformDCUV_C(const int16_t* WEBP_RESTRICT in, + uint8_t* 
WEBP_RESTRICT dst) { if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst); if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4); if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS); @@ -133,7 +139,8 @@ static void TransformDCUV_C(const int16_t* in, uint8_t* dst) { // Paragraph 14.3 #if !WEBP_NEON_OMIT_C_CODE -static void TransformWHT_C(const int16_t* in, int16_t* out) { +static void TransformWHT_C(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { int tmp[16]; int i; for (i = 0; i < 4; ++i) { @@ -161,7 +168,7 @@ static void TransformWHT_C(const int16_t* in, int16_t* out) { } #endif // !WEBP_NEON_OMIT_C_CODE -void (*VP8TransformWHT)(const int16_t* in, int16_t* out); +VP8IWHT VP8TransformWHT; //------------------------------------------------------------------------------ // Intra predictions @@ -661,32 +668,32 @@ static void HFilter16i_C(uint8_t* p, int stride, #if !WEBP_NEON_OMIT_C_CODE // 8-pixels wide variant, for chroma filtering -static void VFilter8_C(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh); } #endif // !WEBP_NEON_OMIT_C_CODE #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC -static void HFilter8_C(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh); } #endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC #if !WEBP_NEON_OMIT_C_CODE -static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); } #endif // !WEBP_NEON_OMIT_C_CODE #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC -static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); } @@ -694,8 +701,8 @@ static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride, //------------------------------------------------------------------------------ -static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst, - int dst_stride) { +static void DitherCombine8x8_C(const uint8_t* WEBP_RESTRICT dither, + uint8_t* WEBP_RESTRICT dst, int dst_stride) { int i, j; for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) { @@ -730,8 +737,8 @@ VP8SimpleFilterFunc VP8SimpleHFilter16; VP8SimpleFilterFunc VP8SimpleVFilter16i; VP8SimpleFilterFunc VP8SimpleHFilter16i; -void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, - int dst_stride); +void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither, + uint8_t* WEBP_RESTRICT dst, int dst_stride); extern VP8CPUInfo VP8GetCPUInfo; extern void 
VP8DspInitSSE2(void); diff --git a/src/dsp/dec_mips32.c b/src/dsp/dec_mips32.c index f0e7de4a..89fe9009 100644 --- a/src/dsp/dec_mips32.c +++ b/src/dsp/dec_mips32.c @@ -133,26 +133,26 @@ static void HFilter16(uint8_t* p, int stride, } // 8-pixels wide variant, for chroma filtering -static void VFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh); } -static void HFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh); } -static void VFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); } -static void HFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); } @@ -215,7 +215,8 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { } } -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOne(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14; @@ -532,7 +533,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { ); } -static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { +static void TransformTwo(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { TransformOne(in, dst); if (do_two) { TransformOne(in + 16, dst + 4); diff --git a/src/dsp/dec_mips_dsp_r2.c b/src/dsp/dec_mips_dsp_r2.c index 0ba706a2..03b5f122 100644 --- a/src/dsp/dec_mips_dsp_r2.c +++ b/src/dsp/dec_mips_dsp_r2.c @@ -21,7 +21,8 @@ static const int kC1 = WEBP_TRANSFORM_AC3_C1; static const int kC2 = WEBP_TRANSFORM_AC3_C2; -static void TransformDC(const int16_t* in, uint8_t* dst) { +static void TransformDC(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10; __asm__ volatile ( @@ -45,7 +46,8 @@ static void TransformDC(const int16_t* in, uint8_t* dst) { ); } -static void TransformAC3(const int16_t* in, uint8_t* dst) { +static void TransformAC3(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int a = in[0] + 4; int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]); const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]); @@ -81,7 +83,8 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { ); } -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOne(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int temp1, 
temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; @@ -148,7 +151,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { ); } -static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { +static void TransformTwo(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { TransformOne(in, dst); if (do_two) { TransformOne(in + 16, dst + 4); @@ -434,14 +438,14 @@ static void HFilter16(uint8_t* p, int stride, } // 8-pixels wide variant, for chroma filtering -static void VFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh); } -static void HFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh); } @@ -465,14 +469,14 @@ static void HFilter16i(uint8_t* p, int stride, } } -static void VFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); } -static void HFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); } diff --git a/src/dsp/dec_msa.c b/src/dsp/dec_msa.c index 58d17301..422b3632 100644 --- a/src/dsp/dec_msa.c +++ b/src/dsp/dec_msa.c @@ -38,7 +38,8 @@ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ } -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOne(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { v8i16 input0, input1; v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; v4i32 res0, res1, res2, res3; @@ -65,14 +66,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); } -static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { +static void TransformTwo(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { TransformOne(in, dst); if (do_two) { TransformOne(in + 16, dst + 4); } } -static void TransformWHT(const int16_t* in, int16_t* out) { +static void TransformWHT(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { v8i16 input0, input1; const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; @@ -114,13 +117,15 @@ static void TransformWHT(const int16_t* in, int16_t* out) { out[240] = __msa_copy_s_h(out1, 7); } -static void TransformDC(const int16_t* in, uint8_t* dst) { +static void TransformDC(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int DC = (in[0] + 4) >> 3; 
const v8i16 tmp0 = __msa_fill_h(DC); ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS); } -static void TransformAC3(const int16_t* in, uint8_t* dst) { +static void TransformAC3(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int a = in[0] + 4; const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]); const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]); @@ -475,8 +480,8 @@ static void HFilter16i(uint8_t* src_y, int stride, } // 8-pixels wide variants, for chroma filtering -static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride, - int b_limit_in, int limit_in, int thresh_in) { +static void VFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v, + int stride, int b_limit_in, int limit_in, int thresh_in) { uint8_t* ptmp_src_u = src_u - 4 * stride; uint8_t* ptmp_src_v = src_v - 4 * stride; uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; @@ -520,8 +525,8 @@ static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride, SD(q2_d, ptmp_src_v); } -static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride, - int b_limit_in, int limit_in, int thresh_in) { +static void HFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v, + int stride, int b_limit_in, int limit_in, int thresh_in) { uint8_t* ptmp_src_u = src_u - 4; uint8_t* ptmp_src_v = src_v - 4; v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev; @@ -556,7 +561,8 @@ static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride, ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride); } -static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride, +static void VFilter8i(uint8_t* WEBP_RESTRICT src_u, + uint8_t* WEBP_RESTRICT src_v, int stride, int b_limit_in, int limit_in, int thresh_in) { uint64_t p1_d, p0_d, q0_d, q1_d; v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev; @@ -587,7 +593,8 @@ static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride, SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride); } -static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride, +static void HFilter8i(uint8_t* WEBP_RESTRICT src_u, + uint8_t* WEBP_RESTRICT src_v, int stride, int b_limit_in, int limit_in, int thresh_in) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev; v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c index b4c46f70..f150692a 100644 --- a/src/dsp/dec_neon.c +++ b/src/dsp/dec_neon.c @@ -916,8 +916,8 @@ static void HFilter16i_NEON(uint8_t* p, int stride, #endif // !WORK_AROUND_GCC // 8-pixels wide variant, for chroma filtering -static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { @@ -932,7 +932,8 @@ static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride, Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride); } } -static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, +static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; u += 4 * stride; @@ -949,8 +950,8 @@ static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, } #if !defined(WORK_AROUND_GCC) -static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8_NEON(uint8_t* 
WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { @@ -964,7 +965,8 @@ static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride, } } -static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, +static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; u += 4; @@ -1041,7 +1043,8 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) { Transpose8x2_NEON(E0, E1, rows); } -static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { +static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int16x8x2_t rows; INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); TransformPass_NEON(&rows); @@ -1051,7 +1054,8 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { #else -static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { +static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int kBPS = BPS; // kC1, kC2. Padded because vld1.16 loads 8 bytes const int16_t constants[4] = { kC1, kC2, 0, 0 }; @@ -1184,14 +1188,16 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { #endif // WEBP_USE_INTRINSICS -static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) { +static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { TransformOne_NEON(in, dst); if (do_two) { TransformOne_NEON(in + 16, dst + 4); } } -static void TransformDC_NEON(const int16_t* in, uint8_t* dst) { +static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int16x8_t DC = vdupq_n_s16(in[0]); Add4x4_NEON(DC, DC, dst); } @@ -1205,7 +1211,8 @@ static void TransformDC_NEON(const int16_t* in, uint8_t* dst) { *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \ } while (0) -static void TransformWHT_NEON(const int16_t* in, int16_t* out) { +static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { int32x4x4_t tmp; { @@ -1256,7 +1263,8 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) { //------------------------------------------------------------------------------ -static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) { +static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int16x4_t A = vld1_dup_s16(in); const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4])); const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4])); diff --git a/src/dsp/dec_sse2.c b/src/dsp/dec_sse2.c index 5fd85e6f..b0faada8 100644 --- a/src/dsp/dec_sse2.c +++ b/src/dsp/dec_sse2.c @@ -30,7 +30,8 @@ //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) -static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { +static void Transform_SSE2(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -197,7 +198,8 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { #if (USE_TRANSFORM_AC3 == 1) -static void TransformAC3_SSE2(const int16_t* in, uint8_t* dst) { +static 
void TransformAC3_SSE2(const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const __m128i A = _mm_set1_epi16(in[0] + 4); const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4])); const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4])); @@ -792,8 +794,8 @@ static void HFilter16i_SSE2(uint8_t* p, int stride, } // 8-pixels wide variant, for chroma filtering -static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, p2, p1, p0, q0, q1, q2; @@ -817,8 +819,8 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, STOREUV(q2, u, v, 2 * stride); } -static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i p3, p2, p1, p0, q0, q1, q2, q3; @@ -837,7 +839,8 @@ static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride); } -static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride, +static void VFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, t2, p1, p0, q0, q1; @@ -863,7 +866,8 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride, STOREUV(q1, u, v, 1 * stride); } -static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride, +static void HFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int stride, int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, t2, p1, p0, q0, q1; diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 82029533..bd126dea 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -63,11 +63,15 @@ extern "C" { typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst, int do_two); typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out); -typedef void (*VP8WHT)(const int16_t* in, int16_t* out); +// TODO(jzern): merge these two typedefs after the encoder functions are +// updated to use WEBP_RESTRICT. +typedef void (*VP8FWHT)(const int16_t* in, int16_t* out); +typedef void (*VP8IWHT)(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out); extern VP8Idct VP8ITransform; extern VP8Fdct VP8FTransform; extern VP8Fdct VP8FTransform2; // performs two transforms at a time -extern VP8WHT VP8FTransformWHT; +extern VP8FWHT VP8FTransformWHT; // Predictions // *dst is the destination block. *top and *left can be NULL. typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left, @@ -194,15 +198,17 @@ void VP8SSIMDspInit(void); //------------------------------------------------------------------------------ // Decoding -typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst); +typedef void (*VP8DecIdct)(const int16_t* WEBP_RESTRICT coeffs, + uint8_t* WEBP_RESTRICT dst); // when doing two transforms, coeffs is actually int16_t[2][16]. 
-typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two); +typedef void (*VP8DecIdct2)(const int16_t* WEBP_RESTRICT coeffs, + uint8_t* WEBP_RESTRICT dst, int do_two); extern VP8DecIdct2 VP8Transform; extern VP8DecIdct VP8TransformAC3; extern VP8DecIdct VP8TransformUV; extern VP8DecIdct VP8TransformDC; extern VP8DecIdct VP8TransformDCUV; -extern VP8WHT VP8TransformWHT; +extern VP8IWHT VP8TransformWHT; #define WEBP_TRANSFORM_AC3_C1 20091 #define WEBP_TRANSFORM_AC3_C2 35468 @@ -234,7 +240,8 @@ extern VP8SimpleFilterFunc VP8SimpleHFilter16i; // regular filter (on both macroblock edges and inner edges) typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride, int thresh, int ithresh, int hev_t); -typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride, +typedef void (*VP8ChromaFilterFunc)(uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int stride, int thresh, int ithresh, int hev_t); // on outer edge extern VP8LumaFilterFunc VP8VFilter16; @@ -254,8 +261,8 @@ extern VP8ChromaFilterFunc VP8HFilter8i; #define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1)) #define VP8_DITHER_AMP_BITS 7 #define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS) -extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, - int dst_stride); +extern void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither, + uint8_t* WEBP_RESTRICT dst, int dst_stride); // must be called before anything using the above void VP8DspInit(void); diff --git a/src/dsp/enc.c b/src/dsp/enc.c index cb44ba6f..95c623d9 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -720,7 +720,7 @@ VP8CHisto VP8CollectHistogram; VP8Idct VP8ITransform; VP8Fdct VP8FTransform; VP8Fdct VP8FTransform2; -VP8WHT VP8FTransformWHT; +VP8FWHT VP8FTransformWHT; VP8Intra4Preds VP8EncPredLuma4; VP8IntraPreds VP8EncPredLuma16; VP8IntraPreds VP8EncPredChroma8; From b1cb37e65921649514b7a4fad46eb8d28dcbfb10 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 3 Jul 2021 17:59:44 -0700 Subject: [PATCH 3/8] dsp/enc*: use WEBP_RESTRICT qualifier This allows for better vectorization of the C code, inlining of TrueMotion_SSE2, better load usage in aarch64 and other minor reordering with ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I07e9944d5c0aa5a079b22883ac5a2d649695e4a0 --- src/dsp/dec.c | 2 +- src/dsp/dsp.h | 55 ++++++----- src/dsp/enc.c | 126 +++++++++++++++--------- src/dsp/enc_mips32.c | 40 +++++--- src/dsp/enc_mips_dsp_r2.c | 101 +++++++++++-------- src/dsp/enc_msa.c | 125 ++++++++++++++--------- src/dsp/enc_neon.c | 82 +++++++++------- src/dsp/enc_sse2.c | 202 +++++++++++++++++++++++--------------- src/dsp/enc_sse41.c | 21 ++-- 9 files changed, 457 insertions(+), 297 deletions(-) diff --git a/src/dsp/dec.c b/src/dsp/dec.c index 51067f45..dc1a7625 100644 --- a/src/dsp/dec.c +++ b/src/dsp/dec.c @@ -168,7 +168,7 @@ static void TransformWHT_C(const int16_t* WEBP_RESTRICT in, } #endif // !WEBP_NEON_OMIT_C_CODE -VP8IWHT VP8TransformWHT; +VP8WHT VP8TransformWHT; //------------------------------------------------------------------------------ // Intra predictions diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index bd126dea..ec0302f2 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -60,57 +60,66 @@ extern "C" { // Transforms // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms // will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4). 
-typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two); -typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out); -// TODO(jzern): merge these two typedefs after the encoder functions are -// updated to use WEBP_RESTRICT. -typedef void (*VP8FWHT)(const int16_t* in, int16_t* out); -typedef void (*VP8IWHT)(const int16_t* WEBP_RESTRICT in, +typedef void (*VP8Idct)(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two); +typedef void (*VP8Fdct)(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, int16_t* WEBP_RESTRICT out); +typedef void (*VP8WHT)(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out); extern VP8Idct VP8ITransform; extern VP8Fdct VP8FTransform; extern VP8Fdct VP8FTransform2; // performs two transforms at a time -extern VP8FWHT VP8FTransformWHT; +extern VP8WHT VP8FTransformWHT; // Predictions // *dst is the destination block. *top and *left can be NULL. -typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left, - const uint8_t* top); -typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top); +typedef void (*VP8IntraPreds)(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top); +typedef void (*VP8Intra4Preds)(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top); extern VP8Intra4Preds VP8EncPredLuma4; extern VP8IntraPreds VP8EncPredLuma16; extern VP8IntraPreds VP8EncPredChroma8; -typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref); +typedef int (*VP8Metric)(const uint8_t* WEBP_RESTRICT pix, + const uint8_t* WEBP_RESTRICT ref); extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4; -typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref, - const uint16_t* const weights); +typedef int (*VP8WMetric)(const uint8_t* WEBP_RESTRICT pix, + const uint8_t* WEBP_RESTRICT ref, + const uint16_t* WEBP_RESTRICT const weights); // The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major // 4 by 4 symmetric matrix. extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; // Compute the average (DC) of four 4x4 blocks. // Each sub-4x4 block #i sum is stored in dc[i]. -typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]); +typedef void (*VP8MeanMetric)(const uint8_t* WEBP_RESTRICT ref, + uint32_t dc[4]); extern VP8MeanMetric VP8Mean16x4; -typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); +typedef void (*VP8BlockCopy)(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst); extern VP8BlockCopy VP8Copy4x4; extern VP8BlockCopy VP8Copy16x8; // Quantization struct VP8Matrix; // forward declaration -typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16], - const struct VP8Matrix* const mtx); +typedef int (*VP8QuantizeBlock)( + int16_t in[16], int16_t out[16], + const struct VP8Matrix* WEBP_RESTRICT const mtx); // Same as VP8QuantizeBlock, but quantizes two consecutive blocks. 
-typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32], - const struct VP8Matrix* const mtx); +typedef int (*VP8Quantize2Blocks)( + int16_t in[32], int16_t out[32], + const struct VP8Matrix* WEBP_RESTRICT const mtx); extern VP8QuantizeBlock VP8EncQuantizeBlock; extern VP8Quantize2Blocks VP8EncQuantize2Blocks; // specific to 2nd transform: -typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16], - const struct VP8Matrix* const mtx); +typedef int (*VP8QuantizeBlockWHT)( + int16_t in[16], int16_t out[16], + const struct VP8Matrix* WEBP_RESTRICT const mtx); extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT; extern const int VP8DspScan[16 + 4 + 4]; @@ -208,7 +217,7 @@ extern VP8DecIdct VP8TransformAC3; extern VP8DecIdct VP8TransformUV; extern VP8DecIdct VP8TransformDC; extern VP8DecIdct VP8TransformDCUV; -extern VP8IWHT VP8TransformWHT; +extern VP8WHT VP8TransformWHT; #define WEBP_TRANSFORM_AC3_C1 20091 #define WEBP_TRANSFORM_AC3_C2 35468 diff --git a/src/dsp/enc.c b/src/dsp/enc.c index 95c623d9..b177031d 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -59,9 +59,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1], } #if !WEBP_NEON_OMIT_C_CODE -static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred, +static void CollectHistogram_C(const uint8_t* WEBP_RESTRICT ref, + const uint8_t* WEBP_RESTRICT pred, int start_block, int end_block, - VP8Histogram* const histo) { + VP8Histogram* WEBP_RESTRICT const histo) { int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; for (j = start_block; j < end_block; ++j) { @@ -109,8 +110,9 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) { #define STORE(x, y, v) \ dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) -static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int C[4 * 4], *tmp; int i; tmp = C; @@ -146,7 +148,9 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, } } -static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst, +static void ITransform_C(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { ITransformOne(ref, in, dst); if (do_two) { @@ -154,7 +158,9 @@ static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst, } } -static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_C(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { int i; int tmp[16]; for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { @@ -184,14 +190,16 @@ static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) { } #endif // !WEBP_NEON_OMIT_C_CODE -static void FTransform2_C(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform2_C(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { VP8FTransform(src, ref, out); VP8FTransform(src + 4, ref + 4, out + 16); } #if !WEBP_NEON_OMIT_C_CODE -static void FTransformWHT_C(const int16_t* in, int16_t* out) { +static void FTransformWHT_C(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { // input is 12b signed int32_t tmp[16]; int i; @@ -234,8 +242,9 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) { } } -static WEBP_INLINE void 
VerticalPred(uint8_t* dst, - const uint8_t* top, int size) { +static WEBP_INLINE void VerticalPred(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top, + int size) { int j; if (top != NULL) { for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size); @@ -244,8 +253,9 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst, } } -static WEBP_INLINE void HorizontalPred(uint8_t* dst, - const uint8_t* left, int size) { +static WEBP_INLINE void HorizontalPred(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + int size) { if (left != NULL) { int j; for (j = 0; j < size; ++j) { @@ -256,8 +266,9 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst, } } -static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, - const uint8_t* top, int size) { +static WEBP_INLINE void TrueMotion(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top, int size) { int y; if (left != NULL) { if (top != NULL) { @@ -286,8 +297,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left, - const uint8_t* top, +static WEBP_INLINE void DCMode(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top, int size, int round, int shift) { int DC = 0; int j; @@ -312,8 +324,9 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_C(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { // U block DCMode(C8DC8 + dst, left, top, 8, 8, 4); VerticalPred(C8VE8 + dst, top, 8); @@ -333,8 +346,9 @@ static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left, // luma 16x16 prediction (paragraph 12.3) #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 -static void Intra16Preds_C(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { DCMode(I16DC16 + dst, left, top, 16, 16, 5); VerticalPred(I16VE16 + dst, top, 16); HorizontalPred(I16HE16 + dst, left, 16); @@ -351,7 +365,8 @@ static void Intra16Preds_C(uint8_t* dst, #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2)) #define AVG2(a, b) (((a) + (b) + 1) >> 1) -static void VE4(uint8_t* dst, const uint8_t* top) { // vertical +// vertical +static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const uint8_t vals[4] = { AVG3(top[-1], top[0], top[1]), AVG3(top[ 0], top[1], top[2]), @@ -364,7 +379,8 @@ static void VE4(uint8_t* dst, const uint8_t* top) { // vertical } } -static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal +// horizontal +static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -376,14 +392,14 @@ static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); } -static void DC4(uint8_t* dst, const uint8_t* top) { +static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { uint32_t dc = 4; int i; for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; Fill(dst, dc >> 3, 4); } -static void RD4(uint8_t* dst, const uint8_t* 
top) { +static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -402,7 +418,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) { DST(3, 0) = AVG3(D, C, B); } -static void LD4(uint8_t* dst, const uint8_t* top) { +static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int A = top[0]; const int B = top[1]; const int C = top[2]; @@ -420,7 +436,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) { DST(3, 3) = AVG3(G, H, H); } -static void VR4(uint8_t* dst, const uint8_t* top) { +static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -442,7 +458,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) { DST(3, 1) = AVG3(B, C, D); } -static void VL4(uint8_t* dst, const uint8_t* top) { +static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int A = top[0]; const int B = top[1]; const int C = top[2]; @@ -464,7 +480,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) { DST(3, 3) = AVG3(F, G, H); } -static void HU4(uint8_t* dst, const uint8_t* top) { +static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int I = top[-2]; const int J = top[-3]; const int K = top[-4]; @@ -479,7 +495,7 @@ static void HU4(uint8_t* dst, const uint8_t* top) { DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } -static void HD4(uint8_t* dst, const uint8_t* top) { +static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -502,7 +518,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) { DST(1, 3) = AVG3(L, K, J); } -static void TM4(uint8_t* dst, const uint8_t* top) { +static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int x, y; const uint8_t* const clip = clip1 + 255 - top[-1]; for (y = 0; y < 4; ++y) { @@ -520,7 +536,8 @@ static void TM4(uint8_t* dst, const uint8_t* top) { // Left samples are top[-5 .. 
-2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -539,7 +556,8 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) { // Metric #if !WEBP_NEON_OMIT_C_CODE -static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b, +static WEBP_INLINE int GetSSE(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b, int w, int h) { int count = 0; int y, x; @@ -554,21 +572,25 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b, return count; } -static int SSE16x16_C(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_C(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return GetSSE(a, b, 16, 16); } -static int SSE16x8_C(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_C(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return GetSSE(a, b, 16, 8); } -static int SSE8x8_C(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_C(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return GetSSE(a, b, 8, 8); } -static int SSE4x4_C(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_C(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return GetSSE(a, b, 4, 4); } #endif // !WEBP_NEON_OMIT_C_CODE -static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) { +static void Mean16x4_C(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) { int k, x, y; for (k = 0; k < 4; ++k) { uint32_t avg = 0; @@ -592,7 +614,8 @@ static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) { // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. 
-static int TTransform(const uint8_t* in, const uint16_t* w) { +static int TTransform(const uint8_t* WEBP_RESTRICT in, + const uint16_t* WEBP_RESTRICT w) { int sum = 0; int tmp[16]; int i; @@ -626,15 +649,17 @@ static int TTransform(const uint8_t* in, const uint16_t* w) { return sum; } -static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_C(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { const int sum1 = TTransform(a, w); const int sum2 = TTransform(b, w); return abs(sum2 - sum1) >> 5; } -static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_C(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -657,7 +682,7 @@ static const uint8_t kZigzag[16] = { // Simple quantization static int QuantizeBlock_C(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int last = -1; int n; for (n = 0; n < 16; ++n) { @@ -683,7 +708,7 @@ static int QuantizeBlock_C(int16_t in[16], int16_t out[16], } static int Quantize2Blocks_C(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -694,7 +719,8 @@ static int Quantize2Blocks_C(int16_t in[32], int16_t out[32], //------------------------------------------------------------------------------ // Block copy -static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) { +static WEBP_INLINE void Copy(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst, int w, int h) { int y; for (y = 0; y < h; ++y) { memcpy(dst, src, w); @@ -703,11 +729,13 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) { } } -static void Copy4x4_C(const uint8_t* src, uint8_t* dst) { +static void Copy4x4_C(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst) { Copy(src, dst, 4, 4); } -static void Copy16x8_C(const uint8_t* src, uint8_t* dst) { +static void Copy16x8_C(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst) { Copy(src, dst, 16, 8); } @@ -720,7 +748,7 @@ VP8CHisto VP8CollectHistogram; VP8Idct VP8ITransform; VP8Fdct VP8FTransform; VP8Fdct VP8FTransform2; -VP8FWHT VP8FTransformWHT; +VP8WHT VP8FTransformWHT; VP8Intra4Preds VP8EncPredLuma4; VP8IntraPreds VP8EncPredLuma16; VP8IntraPreds VP8EncPredChroma8; diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c index 50518a5f..6cd8c93d 100644 --- a/src/dsp/enc_mips32.c +++ b/src/dsp/enc_mips32.c @@ -109,9 +109,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2; "sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" // Does one or two inverse transforms. 
-static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref, - const int16_t* in, - uint8_t* dst) { +static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6; int temp7, temp8, temp9, temp10, temp11, temp12, temp13; int temp14, temp15, temp16, temp17, temp18, temp19, temp20; @@ -141,8 +141,9 @@ static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref, ); } -static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in, - uint8_t* dst, int do_two) { +static void ITransform_MIPS32(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { ITransformOne_MIPS32(ref, in, dst); if (do_two) { ITransformOne_MIPS32(ref + 4, in + 16, dst + 4); @@ -236,7 +237,7 @@ static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16], } static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -358,8 +359,9 @@ static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32], "msub %[temp6], %[temp0] \n\t" \ "msub %[temp7], %[temp1] \n\t" -static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_MIPS32(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int tmp[32]; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; @@ -393,8 +395,9 @@ static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b, #undef VERTICAL_PASS #undef HORIZONTAL_PASS -static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MIPS32(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -475,8 +478,9 @@ static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b, "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t" -static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_MIPS32(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16; int temp17, temp18, temp19, temp20; @@ -537,7 +541,8 @@ static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref, GET_SSE_INNER(C, C + 1, C + 2, C + 3) \ GET_SSE_INNER(D, D + 1, D + 2, D + 3) -static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MIPS32(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -571,7 +576,8 @@ static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MIPS32(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -597,7 +603,8 @@ static int SSE16x8_MIPS32(const uint8_t* a, const 
uint8_t* b) { return count; } -static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MIPS32(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -619,7 +626,8 @@ static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MIPS32(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; diff --git a/src/dsp/enc_mips_dsp_r2.c b/src/dsp/enc_mips_dsp_r2.c index e1431f3b..4d808960 100644 --- a/src/dsp/enc_mips_dsp_r2.c +++ b/src/dsp/enc_mips_dsp_r2.c @@ -141,8 +141,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2; "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t" -static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { const int c2217 = 2217; const int c5352 = 5352; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; @@ -171,8 +172,9 @@ static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref, #undef VERTICAL_PASS #undef HORIZONTAL_PASS -static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; @@ -239,16 +241,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, ); } -static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in, - uint8_t* dst, int do_two) { +static void ITransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { ITransformOne(ref, in, dst); if (do_two) { ITransformOne(ref + 4, in + 16, dst + 4); } } -static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17; @@ -314,9 +318,9 @@ static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b, return abs(temp3 - temp17) >> 5; } -static int Disto16x16_MIPSdspR2(const uint8_t* const a, - const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -367,8 +371,8 @@ static int Disto16x16_MIPSdspR2(const uint8_t* const a, } while (0) #define VERTICAL_PRED(DST, TOP, SIZE) \ -static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST), \ - const uint8_t* (TOP)) { \ +static WEBP_INLINE void VerticalPred##SIZE( \ + uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (TOP)) { \ int j; \ if ((TOP)) { \ for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE)); \ @@ -383,8 +387,8 @@ VERTICAL_PRED(dst, top, 16) #undef 
VERTICAL_PRED #define HORIZONTAL_PRED(DST, LEFT, SIZE) \ -static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST), \ - const uint8_t* (LEFT)) { \ +static WEBP_INLINE void HorizontalPred##SIZE( \ + uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (LEFT)) { \ if (LEFT) { \ int j; \ for (j = 0; j < (SIZE); ++j) { \ @@ -451,8 +455,9 @@ HORIZONTAL_PRED(dst, left, 16) } while (0) #define TRUE_MOTION(DST, LEFT, TOP, SIZE) \ -static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\ - const uint8_t* (TOP)) { \ +static WEBP_INLINE void TrueMotion##SIZE(uint8_t* WEBP_RESTRICT (DST), \ + const uint8_t* WEBP_RESTRICT (LEFT), \ + const uint8_t* WEBP_RESTRICT (TOP)) { \ if ((LEFT) != NULL) { \ if ((TOP) != NULL) { \ CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE)); \ @@ -480,8 +485,9 @@ TRUE_MOTION(dst, left, top, 16) #undef CLIP_8B_TO_DST #undef CLIPPING -static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DCMode16(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { int DC, DC1; int temp0, temp1, temp2, temp3; @@ -543,8 +549,9 @@ static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left, FILL_8_OR_16(dst, DC, 16); } -static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DCMode8(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { int DC, DC1; int temp0, temp1, temp2, temp3; @@ -588,7 +595,7 @@ static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left, FILL_8_OR_16(dst, DC, 8); } -static void DC4(uint8_t* dst, const uint8_t* top) { +static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1; __asm__ volatile( "ulw %[temp0], 0(%[top]) \n\t" @@ -609,7 +616,7 @@ static void DC4(uint8_t* dst, const uint8_t* top) { ); } -static void TM4(uint8_t* dst, const uint8_t* top) { +static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5; const int c35 = 0xff00ff; __asm__ volatile ( @@ -664,7 +671,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) { ); } -static void VE4(uint8_t* dst, const uint8_t* top) { +static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6; __asm__ volatile( "ulw %[temp0], -1(%[top]) \n\t" @@ -695,7 +702,7 @@ static void VE4(uint8_t* dst, const uint8_t* top) { ); } -static void HE4(uint8_t* dst, const uint8_t* top) { +static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6; __asm__ volatile( "ulw %[temp0], -4(%[top]) \n\t" @@ -731,7 +738,7 @@ static void HE4(uint8_t* dst, const uint8_t* top) { ); } -static void RD4(uint8_t* dst, const uint8_t* top) { +static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4, temp5; int temp6, temp7, temp8, temp9, temp10, temp11; __asm__ volatile( @@ -780,7 +787,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) { ); } -static void VR4(uint8_t* dst, const uint8_t* top) { +static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; __asm__ volatile ( @@ -830,7 +837,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) { ); } -static void LD4(uint8_t* dst, const 
uint8_t* top) { +static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4, temp5; int temp6, temp7, temp8, temp9, temp10, temp11; __asm__ volatile( @@ -877,7 +884,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) { ); } -static void VL4(uint8_t* dst, const uint8_t* top) { +static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; __asm__ volatile ( @@ -926,7 +933,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) { ); } -static void HD4(uint8_t* dst, const uint8_t* top) { +static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; __asm__ volatile ( @@ -974,7 +981,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) { ); } -static void HU4(uint8_t* dst, const uint8_t* top) { +static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; __asm__ volatile ( "ulw %[temp0], -5(%[top]) \n\t" @@ -1013,8 +1020,9 @@ static void HU4(uint8_t* dst, const uint8_t* top) { //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { // U block DCMode8(C8DC8 + dst, left, top); VerticalPred8(C8VE8 + dst, top); @@ -1033,8 +1041,9 @@ static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) -static void Intra16Preds_MIPSdspR2(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { DCMode16(I16DC16 + dst, left, top); VerticalPred16(I16VE16 + dst, top); HorizontalPred16(I16HE16 + dst, left); @@ -1043,7 +1052,8 @@ static void Intra16Preds_MIPSdspR2(uint8_t* dst, // Left samples are top[-5 .. 
-2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -1079,7 +1089,8 @@ static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) { GET_SSE_INNER(C) \ GET_SSE_INNER(D) -static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1109,7 +1120,8 @@ static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1131,7 +1143,8 @@ static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1149,7 +1162,8 @@ static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1273,7 +1287,7 @@ static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) { "3: \n\t" static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int temp0, temp1, temp2, temp3, temp4, temp5,temp6; int sign, coeff, level; int max_level = MAX_LEVEL; @@ -1314,7 +1328,7 @@ static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16], } static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -1360,7 +1374,8 @@ static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32], "usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \ "usw %[" #TEMP6 "], " #D "(%[out]) \n\t" -static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) { +static void FTransformWHT_MIPSdspR2(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; diff --git a/src/dsp/enc_msa.c b/src/dsp/enc_msa.c index 6f85add4..31ecb942 100644 --- a/src/dsp/enc_msa.c +++ b/src/dsp/enc_msa.c @@ -41,8 +41,9 @@ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ } while (0) -static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { v8i16 input0, input1; v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; v4i32 res0, res1, res2, res3; @@ -69,16 +70,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, 
BPS); } -static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform_MSA(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { ITransformOne(ref, in, dst); if (do_two) { ITransformOne(ref + 4, in + 16, dst + 4); } } -static void FTransform_MSA(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_MSA(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { uint64_t out0, out1, out2, out3; uint32_t in0, in1, in2, in3; v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; @@ -131,7 +134,8 @@ static void FTransform_MSA(const uint8_t* src, const uint8_t* ref, SD4(out0, out1, out2, out3, out, 8); } -static void FTransformWHT_MSA(const int16_t* in, int16_t* out) { +static void FTransformWHT_MSA(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { v8i16 in0 = { 0 }; v8i16 in1 = { 0 }; v8i16 tmp0, tmp1, tmp2, tmp3; @@ -168,7 +172,8 @@ static void FTransformWHT_MSA(const int16_t* in, int16_t* out) { ST_SH2(out0, out1, out, 8); } -static int TTransform_MSA(const uint8_t* in, const uint16_t* w) { +static int TTransform_MSA(const uint8_t* WEBP_RESTRICT in, + const uint16_t* WEBP_RESTRICT w) { int sum; uint32_t in0_m, in1_m, in2_m, in3_m; v16i8 src0 = { 0 }; @@ -200,15 +205,17 @@ static int TTransform_MSA(const uint8_t* in, const uint16_t* w) { return sum; } -static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_MSA(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { const int sum1 = TTransform_MSA(a, w); const int sum2 = TTransform_MSA(b, w); return abs(sum2 - sum1) >> 5; } -static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MSA(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -259,7 +266,9 @@ static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred, #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) #define AVG2(a, b) (((a) + (b) + 1) >> 1) -static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical +// vertical +static WEBP_INLINE void VE4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const v16u8 A1 = { 0 }; const uint64_t val_m = LD(top - 1); const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m); @@ -272,7 +281,9 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical SW4(out, out, out, out, dst, BPS); } -static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal +// horizontal +static WEBP_INLINE void HE4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -284,7 +295,8 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); } -static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { uint32_t dc = 4; int i; for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; @@ -293,7 +305,8 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) { SW4(dc, dc, dc, dc, dst, BPS); } -static WEBP_INLINE 
void RD4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void RD4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const v16u8 A2 = { 0 }; const uint64_t val_m = LD(top - 5); const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m); @@ -313,7 +326,8 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { SW4(val3, val2, val1, val0, dst, BPS); } -static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void LD4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const v16u8 A1 = { 0 }; const uint64_t val_m = LD(top); const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m); @@ -333,7 +347,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { SW4(val0, val1, val2, val3, dst, BPS); } -static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VR4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -354,7 +369,8 @@ static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) { DST(3, 1) = AVG3(B, C, D); } -static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VL4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int A = top[0]; const int B = top[1]; const int C = top[2]; @@ -375,7 +391,8 @@ static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) { DST(3, 3) = AVG3(F, G, H); } -static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void HU4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int I = top[-2]; const int J = top[-3]; const int K = top[-4]; @@ -390,7 +407,8 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) { DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } -static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void HD4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -411,7 +429,8 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) { DST(1, 3) = AVG3(L, K, J); } -static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void TM4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const v16i8 zero = { 0 }; const v8i16 TL = (v8i16)__msa_fill_h(top[-1]); const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]); @@ -431,7 +450,8 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { #undef AVG3 #undef AVG2 -static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_MSA(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -451,7 +471,8 @@ static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) { ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \ } while (0) -static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VerticalPred16x16(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { if (top != NULL) { const v16u8 out = LD_UB(top); STORE16x16(out, dst); @@ -461,8 +482,8 @@ static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) { } } -static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst, - const uint8_t* left) { +static WEBP_INLINE void HorizontalPred16x16(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { if (left != NULL) { int j; for (j = 0; 
j < 16; j += 4) { @@ -480,8 +501,9 @@ static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst, } } -static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void TrueMotion16x16(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { if (left != NULL) { if (top != NULL) { int j; @@ -519,8 +541,9 @@ static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DCMode16x16(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { int DC; v16u8 out; if (top != NULL && left != NULL) { @@ -548,8 +571,9 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left, STORE16x16(out, dst); } -static void Intra16Preds_MSA(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_MSA(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { DCMode16x16(I16DC16 + dst, left, top); VerticalPred16x16(I16VE16 + dst, top); HorizontalPred16x16(I16HE16 + dst, left); @@ -574,7 +598,8 @@ static void Intra16Preds_MSA(uint8_t* dst, SD4(out, out, out, out, dst + 4 * BPS, BPS); \ } while (0) -static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VerticalPred8x8(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { if (top != NULL) { const uint64_t out = LD(top); STORE8x8(out, dst); @@ -584,7 +609,8 @@ static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) { } } -static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void HorizontalPred8x8(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { if (left != NULL) { int j; for (j = 0; j < 8; j += 4) { @@ -606,8 +632,9 @@ static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) { } } -static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void TrueMotion8x8(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { if (left != NULL) { if (top != NULL) { int j; @@ -646,8 +673,9 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DCMode8x8(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { uint64_t out; v16u8 src = { 0 }; if (top != NULL && left != NULL) { @@ -670,8 +698,9 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left, STORE8x8(out, dst); } -static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_MSA(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { // U block DCMode8x8(C8DC8 + dst, left, top); VerticalPred8x8(C8VE8 + dst, top); @@ -712,7 +741,8 @@ static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left, DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \ } while (0) -static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MSA(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, 
ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -739,7 +769,8 @@ static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MSA(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -758,7 +789,8 @@ static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MSA(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -778,7 +810,8 @@ static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MSA(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32_t sum = 0; uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1; @@ -801,7 +834,7 @@ static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) { // Quantization static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int sum; v8i16 in0, in1, sh0, sh1, out0, out1; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1; @@ -854,7 +887,7 @@ static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16], } static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 7ba5b2d6..30a66fc5 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -60,8 +60,8 @@ static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst, static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01, const int16x8_t row23, - const uint8_t* const ref, - uint8_t* const dst) { + const uint8_t* WEBP_RESTRICT const ref, + uint8_t* WEBP_RESTRICT const dst) { uint32x2_t dst01 = vdup_n_u32(0); uint32x2_t dst23 = vdup_n_u32(0); @@ -120,8 +120,9 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) { Transpose8x2_NEON(E0, E1, rows); } -static void ITransformOne_NEON(const uint8_t* ref, - const int16_t* in, uint8_t* dst) { +static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int16x8x2_t rows; INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); TransformPass_NEON(&rows); @@ -131,8 +132,9 @@ static void ITransformOne_NEON(const uint8_t* ref, #else -static void ITransformOne_NEON(const uint8_t* ref, - const int16_t* in, uint8_t* dst) { +static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int kBPS = BPS; const int16_t kC1C2[] = { kC1, kC2, 0, 0 }; @@ -247,8 +249,9 @@ static void ITransformOne_NEON(const uint8_t* ref, #endif // WEBP_USE_INTRINSICS -static void ITransform_NEON(const uint8_t* ref, - const int16_t* in, uint8_t* dst, int do_two) { +static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { ITransformOne_NEON(ref, in, dst); if (do_two) 
{ ITransformOne_NEON(ref + 4, in + 16, dst + 4); @@ -294,8 +297,9 @@ static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a, return vreinterpretq_s16_u16(vsubl_u8(a, b)); } -static void FTransform_NEON(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { int16x8_t d0d1, d3d2; // working 4x4 int16 variables { const uint8x16_t S0 = Load4x4_NEON(src); @@ -364,8 +368,9 @@ static const int32_t kCoeff32[] = { 51000, 51000, 51000, 51000 }; -static void FTransform_NEON(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { const int kBPS = BPS; const uint8_t* src_ptr = src; const uint8_t* ref_ptr = ref; @@ -484,7 +489,8 @@ static void FTransform_NEON(const uint8_t* src, const uint8_t* ref, src += stride; \ } while (0) -static void FTransformWHT_NEON(const int16_t* src, int16_t* out) { +static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src, + int16_t* WEBP_RESTRICT out) { const int stride = 16; const int16x4_t zero = vdup_n_s16(0); int32x4x4_t tmp0; @@ -659,8 +665,9 @@ static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in, // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { uint32x2_t d_in_ab_0123 = vdup_n_u32(0); uint32x2_t d_in_ab_4567 = vdup_n_u32(0); uint32x2_t d_in_ab_89ab = vdup_n_u32(0); @@ -701,8 +708,9 @@ static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b, } #undef LOAD_LANE_32b -static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -715,9 +723,10 @@ static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b, //------------------------------------------------------------------------------ -static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred, +static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref, + const uint8_t* WEBP_RESTRICT pred, int start_block, int end_block, - VP8Histogram* const histo) { + VP8Histogram* WEBP_RESTRICT const histo) { const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; @@ -747,9 +756,9 @@ static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred, //------------------------------------------------------------------------------ -static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a, - const uint8_t* const b, - uint32x4_t* const sum) { +static WEBP_INLINE void AccumulateSSE16_NEON( + const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b, + uint32x4_t* const sum) { const uint8x16_t a0 = vld1q_u8(a); const uint8x16_t b0 = vld1q_u8(b); const uint8x16_t abs_diff = vabdq_u8(a0, b0); @@ -775,7 +784,8 @@ static int SumToInt_NEON(uint32x4_t sum) { #endif } -static int SSE16x16_NEON(const 
uint8_t* a, const uint8_t* b) { +static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 16; ++y) { @@ -784,7 +794,8 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) { return SumToInt_NEON(sum); } -static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 8; ++y) { @@ -793,7 +804,8 @@ static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) { return SumToInt_NEON(sum); } -static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 8; ++y) { @@ -806,7 +818,8 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) { return SumToInt_NEON(sum); } -static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { const uint8x16_t a0 = Load4x4_NEON(a); const uint8x16_t b0 = Load4x4_NEON(b); const uint8x16_t abs_diff = vabdq_u8(a0, b0); @@ -825,8 +838,9 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) { // Compilation with gcc-4.6.x is problematic for now. #if !defined(WORK_AROUND_GCC) -static int16x8_t Quantize_NEON(int16_t* const in, - const VP8Matrix* const mtx, int offset) { +static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in, + const VP8Matrix* WEBP_RESTRICT const mtx, + int offset) { const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); @@ -860,7 +874,7 @@ static const uint8_t kShuffles[4][8] = { }; static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { const int16x8_t out0 = Quantize_NEON(in, mtx, 0); const int16x8_t out1 = Quantize_NEON(in, mtx, 8); uint8x8x4_t shuffles; @@ -902,7 +916,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16], } static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -930,7 +944,8 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32], vst1q_u8(dst, r); \ } while (0) -static void Intra4Preds_NEON(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 // L K J I X A B C D E F G H // -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 @@ -1162,8 +1177,9 @@ static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left, } } -static void Intra16Preds_NEON(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { DCMode_NEON(I16DC16 + dst, left, top); VerticalPred16_NEON(I16VE16 + dst, top); HorizontalPred16_NEON(I16HE16 + dst, left); diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index 010624a2..588a6292 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -26,8 +26,9 @@ // Transforms (Paragraph 14.4) // Does one inverse transform. 
-static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static void ITransform_One_SSE2(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -177,8 +178,9 @@ static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in, } // Does two inverse transforms. -static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static void ITransform_Two_SSE2(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -316,7 +318,9 @@ static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in, } // Does one or two inverse transforms. -static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, +static void ITransform_SSE2(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { if (do_two) { ITransform_Two_SSE2(ref, in, dst); @@ -373,7 +377,7 @@ static void FTransformPass1_SSE2(const __m128i* const in01, static void FTransformPass2_SSE2(const __m128i* const v01, const __m128i* const v32, - int16_t* out) { + int16_t* WEBP_RESTRICT out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, @@ -424,8 +428,9 @@ static void FTransformPass2_SSE2(const __m128i* const v01, _mm_storeu_si128((__m128i*)&out[8], d2_f3); } -static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_SSE2(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { const __m128i zero = _mm_setzero_si128(); // Load src. const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); @@ -468,8 +473,9 @@ static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref, FTransformPass2_SSE2(&v01, &v32, out); } -static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform2_SSE2(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { const __m128i zero = _mm_setzero_si128(); // Load src and convert to 16b. @@ -517,7 +523,8 @@ static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref, FTransformPass2_SSE2(&v01h, &v32h, out + 16); } -static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) { +static void FTransformWHTRow_SSE2(const int16_t* WEBP_RESTRICT const in, + __m128i* const out) { const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1); const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]); @@ -533,7 +540,8 @@ static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) { *out = _mm_madd_epi16(D, kMult); } -static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) { +static void FTransformWHT_SSE2(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { // Input is 12b signed. __m128i row0, row1, row2, row3; // Rows are 14b signed. 
@@ -566,9 +574,10 @@ static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) { // Compute susceptibility based on DCT-coeff histograms: // the higher, the "easier" the macroblock is to compress. -static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred, +static void CollectHistogram_SSE2(const uint8_t* WEBP_RESTRICT ref, + const uint8_t* WEBP_RESTRICT pred, int start_block, int end_block, - VP8Histogram* const histo) { + VP8Histogram* WEBP_RESTRICT const histo) { const __m128i zero = _mm_setzero_si128(); const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); int j; @@ -640,7 +649,8 @@ static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) { } } -static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VE8uv_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { int j; const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); for (j = 0; j < 8; ++j) { @@ -648,7 +658,8 @@ static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) { } } -static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VE16_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i top_values = _mm_load_si128((const __m128i*)top); int j; for (j = 0; j < 16; ++j) { @@ -656,8 +667,9 @@ static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) { } } -static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst, - const uint8_t* top, int size) { +static WEBP_INLINE void VerticalPred_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top, + int size) { if (top != NULL) { if (size == 8) { VE8uv_SSE2(dst, top); @@ -669,7 +681,8 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst, } } -static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void HE8uv_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { int j; for (j = 0; j < 8; ++j) { const __m128i values = _mm_set1_epi8((char)left[j]); @@ -678,7 +691,8 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { } } -static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void HE16_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { int j; for (j = 0; j < 16; ++j) { const __m128i values = _mm_set1_epi8((char)left[j]); @@ -687,8 +701,9 @@ static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) { } } -static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst, - const uint8_t* left, int size) { +static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + int size) { if (left != NULL) { if (size == 8) { HE8uv_SSE2(dst, left); @@ -700,8 +715,9 @@ static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst, } } -static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top, int size) { +static WEBP_INLINE void TM_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top, int size) { const __m128i zero = _mm_setzero_si128(); int y; if (size == 8) { @@ -728,8 +744,10 @@ static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top, int size) { +static WEBP_INLINE void TrueMotion_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top, + int size) { if (left != NULL) { if 
(top != NULL) { TM_SSE2(dst, left, top, size); @@ -749,8 +767,9 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC8uv_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); const __m128i left_values = _mm_loadl_epi64((const __m128i*)left); const __m128i combined = _mm_unpacklo_epi64(top_values, left_values); @@ -758,7 +777,8 @@ static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left, Put8x8uv_SSE2(DC >> 4, dst); } -static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i zero = _mm_setzero_si128(); const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); const __m128i sum = _mm_sad_epu8(top_values, zero); @@ -766,7 +786,8 @@ static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) { Put8x8uv_SSE2(DC >> 3, dst); } -static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { // 'left' is contiguous so we can reuse the top summation. DC8uvNoLeft_SSE2(dst, left); } @@ -775,8 +796,9 @@ static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) { Put8x8uv_SSE2(0x80, dst); } -static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { if (top != NULL) { if (left != NULL) { // top and left present DC8uv_SSE2(dst, left, top); @@ -790,8 +812,9 @@ static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC16_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { const __m128i top_row = _mm_load_si128((const __m128i*)top); const __m128i left_row = _mm_load_si128((const __m128i*)left); const int DC = @@ -799,13 +822,15 @@ static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left, Put16_SSE2(DC >> 5, dst); } -static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i top_row = _mm_load_si128((const __m128i*)top); const int DC = VP8HorizontalAdd8b(&top_row) + 8; Put16_SSE2(DC >> 4, dst); } -static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { // 'left' is contiguous so we can reuse the top summation. 
DC16NoLeft_SSE2(dst, left); } @@ -814,8 +839,9 @@ static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) { Put16_SSE2(0x80, dst); } -static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC16Mode_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { if (top != NULL) { if (left != NULL) { // top and left present DC16_SSE2(dst, left, top); @@ -844,8 +870,9 @@ static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left, // where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1 // and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1 -static WEBP_INLINE void VE4_SSE2(uint8_t* dst, - const uint8_t* top) { // vertical +// vertical +static WEBP_INLINE void VE4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1)); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); @@ -861,8 +888,9 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst, } } -static WEBP_INLINE void HE4_SSE2(uint8_t* dst, - const uint8_t* top) { // horizontal +// horizontal +static WEBP_INLINE void HE4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -874,15 +902,17 @@ static WEBP_INLINE void HE4_SSE2(uint8_t* dst, WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); } -static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { uint32_t dc = 4; int i; for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; Fill_SSE2(dst, dc >> 3, 4); } -static WEBP_INLINE void LD4_SSE2(uint8_t* dst, - const uint8_t* top) { // Down-Left +// Down-Left +static WEBP_INLINE void LD4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); @@ -898,8 +928,9 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst, WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } -static WEBP_INLINE void VR4_SSE2(uint8_t* dst, - const uint8_t* top) { // Vertical-Right +// Vertical-Right +static WEBP_INLINE void VR4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i one = _mm_set1_epi8(1); const int I = top[-2]; const int J = top[-3]; @@ -924,8 +955,9 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst, DST(0, 3) = AVG3(K, J, I); } -static WEBP_INLINE void VL4_SSE2(uint8_t* dst, - const uint8_t* top) { // Vertical-Left +// Vertical-Left +static WEBP_INLINE void VL4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top); const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1); @@ -951,8 +983,9 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst, DST(3, 3) = (extra_out >> 8) & 0xff; } -static WEBP_INLINE void RD4_SSE2(uint8_t* dst, - const uint8_t* top) { // Down-right +// Down-right +static WEBP_INLINE void RD4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i one = _mm_set1_epi8(1); const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5)); const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4); @@ -968,7 +1001,8 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* 
dst, WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } -static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void HU4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int I = top[-2]; const int J = top[-3]; const int K = top[-4]; @@ -983,7 +1017,8 @@ static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) { DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } -static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void HD4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -1006,7 +1041,8 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) { DST(1, 3) = AVG3(L, K, J); } -static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void TM4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i zero = _mm_setzero_si128(); const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); @@ -1028,7 +1064,8 @@ static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) { // Left samples are top[-5 .. -2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { DC4_SSE2(I4DC4 + dst, top); TM4_SSE2(I4TM4 + dst, top); VE4_SSE2(I4VE4 + dst, top); @@ -1044,8 +1081,9 @@ static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) { //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { // U block DC8uvMode_SSE2(C8DC8 + dst, left, top); VerticalPred_SSE2(C8VE8 + dst, top, 8); @@ -1064,8 +1102,9 @@ static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) -static void Intra16Preds_SSE2(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { DC16Mode_SSE2(I16DC16 + dst, left, top); VerticalPred_SSE2(I16VE16 + dst, top, 16); HorizontalPred_SSE2(I16HE16 + dst, left, 16); @@ -1092,7 +1131,8 @@ static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a, *sum = _mm_add_epi32(sum1, sum2); } -static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b, +static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b, int num_pairs) { __m128i sum = _mm_setzero_si128(); int32_t tmp[4]; @@ -1114,18 +1154,21 @@ static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b, return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); } -static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_SSE2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return SSE_16xN_SSE2(a, b, 8); } -static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_SSE2(const uint8_t* WEBP_RESTRICT a, + const 
uint8_t* WEBP_RESTRICT b) { return SSE_16xN_SSE2(a, b, 4); } #define LOAD_8x16b(ptr) \ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero) -static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_SSE2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { const __m128i zero = _mm_setzero_si128(); int num_pairs = 4; __m128i sum = zero; @@ -1152,7 +1195,8 @@ static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) { } #undef LOAD_8x16b -static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_SSE2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { const __m128i zero = _mm_setzero_si128(); // Load values. Note that we read 8 pixels instead of 4, @@ -1189,7 +1233,7 @@ static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) { //------------------------------------------------------------------------------ -static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) { +static void Mean16x4_SSE2(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) { const __m128i mask = _mm_set1_epi16(0x00ff); const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]); @@ -1227,8 +1271,9 @@ static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) { // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB, - const uint16_t* const w) { +static int TTransform_SSE2(const uint8_t* WEBP_RESTRICT inA, + const uint8_t* WEBP_RESTRICT inB, + const uint16_t* WEBP_RESTRICT const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); @@ -1328,14 +1373,16 @@ static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB, return sum[0] + sum[1] + sum[2] + sum[3]; } -static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_SSE2(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { const int diff_sum = TTransform_SSE2(a, b, w); return abs(diff_sum) >> 5; } -static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_SSE2(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -1350,9 +1397,10 @@ static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b, // Quantization // -static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16], - const uint16_t* const sharpen, - const VP8Matrix* const mtx) { +static WEBP_INLINE int DoQuantizeBlock_SSE2( + int16_t in[16], int16_t out[16], + const uint16_t* WEBP_RESTRICT const sharpen, + const VP8Matrix* WEBP_RESTRICT const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i coeff0, coeff8; @@ -1463,17 +1511,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16], } static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx); } static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const 
VP8Matrix* WEBP_RESTRICT const mtx) { return DoQuantizeBlock_SSE2(in, out, NULL, mtx); } static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; const uint16_t* const sharpen = &mtx->sharpen_[0]; nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; diff --git a/src/dsp/enc_sse41.c b/src/dsp/enc_sse41.c index 924035a6..613c44cf 100644 --- a/src/dsp/enc_sse41.c +++ b/src/dsp/enc_sse41.c @@ -23,9 +23,10 @@ //------------------------------------------------------------------------------ // Compute susceptibility based on DCT-coeff histograms. -static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred, +static void CollectHistogram_SSE41(const uint8_t* WEBP_RESTRICT ref, + const uint8_t* WEBP_RESTRICT pred, int start_block, int end_block, - VP8Histogram* const histo) { + VP8Histogram* WEBP_RESTRICT const histo) { const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; @@ -168,14 +169,16 @@ static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB, return sum[0] + sum[1] + sum[2] + sum[3]; } -static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_SSE41(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { const int diff_sum = TTransform_SSE41(a, b, w); return abs(diff_sum) >> 5; } -static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_SSE41(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -301,17 +304,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16], #undef PSHUFB_CST static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx); } static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { return DoQuantizeBlock_SSE41(in, out, NULL, mtx); } static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; const uint16_t* const sharpen = &mtx->sharpen_[0]; nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; From 04d4b4f38781490fc1831a3876d3c3cb8a287e24 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 16 Aug 2024 13:52:32 -0700 Subject: [PATCH 4/8] dsp/filters*: use WEBP_RESTRICT qualifier Better stack/register usage in SSE2/NEON code and improved vectorization of the C code with ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. 
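For background (not part of the diff itself): WEBP_RESTRICT maps to the
compiler's restrict qualifier when one is available and is otherwise empty.
The sketch below is only an illustration of why the annotation helps; the
'AddLines' helper is hypothetical and does not exist in libwebp. With both
pointers marked restrict, the compiler may assume 'src' and 'dst' never
alias, so the loop can be vectorized without runtime overlap checks:

    /* Hypothetical example, not part of this change. */
    static void AddLines(const uint8_t* WEBP_RESTRICT src,
                         uint8_t* WEBP_RESTRICT dst, int len) {
      int i;
      /* src and dst are assumed not to overlap, so the loop can be
         auto-vectorized without aliasing guards. */
      for (i = 0; i < len; ++i) dst[i] = (uint8_t)(dst[i] + src[i]);
    }

The assert((in) != (out)) added to the DCHECK macros below documents the
same non-aliasing assumption at runtime in debug builds.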
Change-Id: I32b53dd38bfc7e2231d875409e7dfda7c513cfb6 --- src/dsp/dsp.h | 5 +++-- src/dsp/filters.c | 33 +++++++++++++++------------ src/dsp/filters_mips_dsp_r2.c | 40 ++++++++++++++++++--------------- src/dsp/filters_msa.c | 27 +++++++++++++--------- src/dsp/filters_neon.c | 42 ++++++++++++++++++++--------------- src/dsp/filters_sse2.c | 41 ++++++++++++++++++++-------------- 6 files changed, 108 insertions(+), 80 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index ec0302f2..13df7dfa 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -497,8 +497,9 @@ typedef enum { // Filter types. WEBP_FILTER_FAST } WEBP_FILTER_TYPE; -typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height, - int stride, uint8_t* out); +typedef void (*WebPFilterFunc)(const uint8_t* WEBP_RESTRICT in, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT out); // In-place un-filtering. // Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'. typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds, diff --git a/src/dsp/filters.c b/src/dsp/filters.c index 9c1fa919..f5e1e5f9 100644 --- a/src/dsp/filters.c +++ b/src/dsp/filters.c @@ -23,14 +23,16 @@ do { \ assert((in) != NULL); \ assert((out) != NULL); \ + assert((in) != (out)); \ assert(width > 0); \ assert(height > 0); \ assert(stride >= width); \ } while (0) #if !WEBP_NEON_OMIT_C_CODE -static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length) { +static WEBP_INLINE void PredictLine_C(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT pred, + uint8_t* WEBP_RESTRICT dst, int length) { int i; for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]); } @@ -38,9 +40,9 @@ static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred, //------------------------------------------------------------------------------ // Horizontal filter. -static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in, +static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, - uint8_t* out) { + uint8_t* WEBP_RESTRICT out) { const uint8_t* preds = in; int row; DCHECK(in, out); @@ -66,9 +68,9 @@ static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in, //------------------------------------------------------------------------------ // Vertical filter. 
-static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in, +static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, - uint8_t* out) { + uint8_t* WEBP_RESTRICT out) { const uint8_t* preds = in; int row; DCHECK(in, out); @@ -99,9 +101,9 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) { } #if !WEBP_NEON_OMIT_C_CODE -static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in, +static WEBP_INLINE void DoGradientFilter_C(const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, - uint8_t* out) { + uint8_t* WEBP_RESTRICT out) { const uint8_t* preds = in; int row; DCHECK(in, out); @@ -136,18 +138,21 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in, //------------------------------------------------------------------------------ #if !WEBP_NEON_OMIT_C_CODE -static void HorizontalFilter_C(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void HorizontalFilter_C(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoHorizontalFilter_C(data, width, height, stride, filtered_data); } -static void VerticalFilter_C(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void VerticalFilter_C(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoVerticalFilter_C(data, width, height, stride, filtered_data); } -static void GradientFilter_C(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void GradientFilter_C(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoGradientFilter_C(data, width, height, stride, filtered_data); } #endif // !WEBP_NEON_OMIT_C_CODE diff --git a/src/dsp/filters_mips_dsp_r2.c b/src/dsp/filters_mips_dsp_r2.c index 2c2c63c0..c62bb872 100644 --- a/src/dsp/filters_mips_dsp_r2.c +++ b/src/dsp/filters_mips_dsp_r2.c @@ -26,8 +26,9 @@ #define DCHECK(in, out) \ do { \ - assert(in != NULL); \ - assert(out != NULL); \ + assert((in) != NULL); \ + assert((out) != NULL); \ + assert((in) != (out)); \ assert(width > 0); \ assert(height > 0); \ assert(stride >= width); \ @@ -101,7 +102,8 @@ ); \ } while (0) -static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst, +static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst, int length) { DO_PREDICT_LINE(src, dst, length, 0); } @@ -191,9 +193,9 @@ static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst, } \ } while (0) -static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in, - int width, int height, - int stride, uint8_t* out) { +static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2( + const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, + uint8_t* WEBP_RESTRICT out) { const uint8_t* preds = in; int row; DCHECK(in, out); @@ -210,9 +212,9 @@ static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in, } #undef FILTER_LINE_BY_LINE -static void HorizontalFilter_MIPSdspR2(const uint8_t* data, - int width, int height, - int stride, uint8_t* filtered_data) { +static void HorizontalFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoHorizontalFilter_MIPSdspR2(data, width, height, stride, filtered_data); } @@ -228,9 +230,9 @@ static 
void HorizontalFilter_MIPSdspR2(const uint8_t* data, } \ } while (0) -static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in, - int width, int height, - int stride, uint8_t* out) { +static WEBP_INLINE void DoVerticalFilter_MIPSdspR2( + const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, + uint8_t* WEBP_RESTRICT out) { const uint8_t* preds = in; int row; DCHECK(in, out); @@ -247,8 +249,9 @@ static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in, } #undef FILTER_LINE_BY_LINE -static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void VerticalFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoVerticalFilter_MIPSdspR2(data, width, height, stride, filtered_data); } @@ -284,9 +287,9 @@ static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) { } \ } while (0) -static void DoGradientFilter_MIPSdspR2(const uint8_t* in, +static void DoGradientFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, - uint8_t* out) { + uint8_t* WEBP_RESTRICT out) { const uint8_t* preds = in; int row; DCHECK(in, out); @@ -303,8 +306,9 @@ static void DoGradientFilter_MIPSdspR2(const uint8_t* in, } #undef FILTER_LINE_BY_LINE -static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void GradientFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoGradientFilter_MIPSdspR2(data, width, height, stride, filtered_data); } diff --git a/src/dsp/filters_msa.c b/src/dsp/filters_msa.c index 33a1b20b..ae3d3699 100644 --- a/src/dsp/filters_msa.c +++ b/src/dsp/filters_msa.c @@ -21,7 +21,8 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length) { + uint8_t* WEBP_RESTRICT dst, + int length) { v16u8 src0, pred0, dst0; assert(length >= 0); while (length >= 32) { @@ -58,8 +59,9 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src, #define DCHECK(in, out) \ do { \ - assert(in != NULL); \ - assert(out != NULL); \ + assert((in) != NULL); \ + assert((out) != NULL); \ + assert((in) != (out)); \ assert(width > 0); \ assert(height > 0); \ assert(stride >= width); \ @@ -68,8 +70,9 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src, //------------------------------------------------------------------------------ // Horrizontal filter -static void HorizontalFilter_MSA(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void HorizontalFilter_MSA(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { const uint8_t* preds = data; const uint8_t* in = data; uint8_t* out = filtered_data; @@ -99,8 +102,8 @@ static void HorizontalFilter_MSA(const uint8_t* data, int width, int height, static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput, const uint8_t* ppred, - uint8_t* poutput, int stride, - int size) { + uint8_t* WEBP_RESTRICT poutput, + int stride, int size) { int w; const v16i8 zero = { 0 }; while (size >= 16) { @@ -131,8 +134,9 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput, } -static void GradientFilter_MSA(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void GradientFilter_MSA(const uint8_t* 
WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { const uint8_t* in = data; const uint8_t* preds = data; uint8_t* out = filtered_data; @@ -159,8 +163,9 @@ static void GradientFilter_MSA(const uint8_t* data, int width, int height, //------------------------------------------------------------------------------ // Vertical filter -static void VerticalFilter_MSA(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void VerticalFilter_MSA(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { const uint8_t* in = data; const uint8_t* preds = data; uint8_t* out = filtered_data; diff --git a/src/dsp/filters_neon.c b/src/dsp/filters_neon.c index 0b0a8421..4df10172 100644 --- a/src/dsp/filters_neon.c +++ b/src/dsp/filters_neon.c @@ -23,8 +23,9 @@ #define DCHECK(in, out) \ do { \ - assert(in != NULL); \ - assert(out != NULL); \ + assert((in) != NULL); \ + assert((out) != NULL); \ + assert((in) != (out)); \ assert(width > 0); \ assert(height > 0); \ assert(stride >= width); \ @@ -44,7 +45,7 @@ #define ROTATE_RIGHT_N(A, N) vext_u8((A), (A), (8 - (N)) % 8) static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length) { + uint8_t* WEBP_RESTRICT dst, int length) { int i; assert(length >= 0); for (i = 0; i + 16 <= length; i += 16) { @@ -57,16 +58,17 @@ static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred, } // Special case for left-based prediction (when preds==dst-1 or preds==src-1). -static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) { +static void PredictLineLeft_NEON(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst, int length) { PredictLine_NEON(src, src - 1, dst, length); } //------------------------------------------------------------------------------ // Horizontal filter. -static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in, - int width, int height, - int stride, uint8_t* out) { +static WEBP_INLINE void DoHorizontalFilter_NEON( + const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, + uint8_t* WEBP_RESTRICT out) { int row; DCHECK(in, out); @@ -86,17 +88,18 @@ static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in, } } -static void HorizontalFilter_NEON(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void HorizontalFilter_NEON(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoHorizontalFilter_NEON(data, width, height, stride, filtered_data); } //------------------------------------------------------------------------------ // Vertical filter. 
-static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in, +static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, - uint8_t* out) { + uint8_t* WEBP_RESTRICT out) { int row; DCHECK(in, out); @@ -115,8 +118,9 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in, } } -static void VerticalFilter_NEON(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void VerticalFilter_NEON(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoVerticalFilter_NEON(data, width, height, stride, filtered_data); } @@ -130,7 +134,8 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) { static void GradientPredictDirect_NEON(const uint8_t* const row, const uint8_t* const top, - uint8_t* const out, int length) { + uint8_t* WEBP_RESTRICT const out, + int length) { int i; for (i = 0; i + 8 <= length; i += 8) { const uint8x8_t A = vld1_u8(&row[i - 1]); @@ -146,9 +151,9 @@ static void GradientPredictDirect_NEON(const uint8_t* const row, } } -static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in, +static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, - uint8_t* out) { + uint8_t* WEBP_RESTRICT out) { int row; DCHECK(in, out); @@ -167,8 +172,9 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in, } } -static void GradientFilter_NEON(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void GradientFilter_NEON(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoGradientFilter_NEON(data, width, height, stride, filtered_data); } diff --git a/src/dsp/filters_sse2.c b/src/dsp/filters_sse2.c index add4f493..d2ba7894 100644 --- a/src/dsp/filters_sse2.c +++ b/src/dsp/filters_sse2.c @@ -27,13 +27,15 @@ do { \ assert((in) != NULL); \ assert((out) != NULL); \ + assert((in) != (out)); \ assert(width > 0); \ assert(height > 0); \ assert(stride >= width); \ } while (0) -static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length) { +static void PredictLineTop_SSE2(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT pred, + uint8_t* WEBP_RESTRICT dst, int length) { int i; const int max_pos = length & ~31; assert(length >= 0); @@ -51,7 +53,8 @@ static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred, } // Special case for left-based prediction (when preds==dst-1 or preds==src-1). -static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) { +static void PredictLineLeft_SSE2(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst, int length) { int i; const int max_pos = length & ~31; assert(length >= 0); @@ -71,9 +74,9 @@ static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) { //------------------------------------------------------------------------------ // Horizontal filter. 
-static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in, - int width, int height, - int stride, uint8_t* out) { +static WEBP_INLINE void DoHorizontalFilter_SSE2( + const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, + uint8_t* WEBP_RESTRICT out) { int row; DCHECK(in, out); @@ -96,9 +99,9 @@ static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in, //------------------------------------------------------------------------------ // Vertical filter. -static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in, +static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, - uint8_t* out) { + uint8_t* WEBP_RESTRICT out) { int row; DCHECK(in, out); @@ -127,7 +130,8 @@ static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) { static void GradientPredictDirect_SSE2(const uint8_t* const row, const uint8_t* const top, - uint8_t* const out, int length) { + uint8_t* WEBP_RESTRICT const out, + int length) { const int max_pos = length & ~7; int i; const __m128i zero = _mm_setzero_si128(); @@ -151,9 +155,9 @@ static void GradientPredictDirect_SSE2(const uint8_t* const row, } } -static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in, +static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* WEBP_RESTRICT in, int width, int height, int stride, - uint8_t* out) { + uint8_t* WEBP_RESTRICT out) { int row; DCHECK(in, out); @@ -176,18 +180,21 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in, //------------------------------------------------------------------------------ -static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void HorizontalFilter_SSE2(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoHorizontalFilter_SSE2(data, width, height, stride, filtered_data); } -static void VerticalFilter_SSE2(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void VerticalFilter_SSE2(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoVerticalFilter_SSE2(data, width, height, stride, filtered_data); } -static void GradientFilter_SSE2(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void GradientFilter_SSE2(const uint8_t* WEBP_RESTRICT data, + int width, int height, int stride, + uint8_t* WEBP_RESTRICT filtered_data) { DoGradientFilter_SSE2(data, width, height, stride, filtered_data); } From a32b436bd5878265286c3eb9c3284715c60913b2 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 16 Aug 2024 16:51:11 -0700 Subject: [PATCH 5/8] dsp/lossless*: use WEBP_RESTRICT qualifier lossless_enc: better vectorization, most benefits seen in AddVector/Eq w/ndk r27/gcc-13/clang-16 lossless: minor reordering and some improvement to PredictorAdd5_SSE2 w/gcc-13 This only affects non-vector pointers; any vector pointers are left as a follow up. 
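
Illustrative aside, not part of the patch: the speedup comes from the no-aliasing promise the qualifier gives the compiler. A minimal sketch of the AddVector-style loop this change targets, assuming WEBP_RESTRICT expands to the compiler's restrict/__restrict keyword where supported (and to nothing otherwise); the fallback define and the AddVectorSketch name below are only for the example:

  #include <stdint.h>
  #ifndef WEBP_RESTRICT
  #define WEBP_RESTRICT   /* assumption: maps to __restrict on gcc/clang/MSVC */
  #endif

  /* Sketch mirroring AddVector_C: with all three pointers qualified the
   * compiler may assume 'a', 'b' and 'out' never overlap, so stores to
   * out[i] cannot clobber later loads of a[] or b[]. */
  static void AddVectorSketch(const uint32_t* WEBP_RESTRICT a,
                              const uint32_t* WEBP_RESTRICT b,
                              uint32_t* WEBP_RESTRICT out, int size) {
    int i;
    for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
  }

Without the qualifier the optimizer must either prove non-overlap or guard a vectorized version with runtime overlap checks; with it, the loop can be vectorized directly, which is the effect reported for AddVector/AddVectorEq above.
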
Change-Id: I2356e314f391ee2f2c71f00bc6ee10097d3881e7 --- src/dsp/dsp.h | 5 ++- src/dsp/lossless.c | 28 ++++++------- src/dsp/lossless.h | 65 +++++++++++++++++------------- src/dsp/lossless_common.h | 18 ++++----- src/dsp/lossless_enc.c | 63 ++++++++++++++++------------- src/dsp/lossless_enc_mips32.c | 35 +++++++++------- src/dsp/lossless_enc_mips_dsp_r2.c | 16 ++++---- src/dsp/lossless_enc_msa.c | 4 +- src/dsp/lossless_enc_neon.c | 5 ++- src/dsp/lossless_enc_sse2.c | 42 +++++++++++-------- src/dsp/lossless_enc_sse41.c | 11 +++-- src/dsp/lossless_neon.c | 46 ++++++++++----------- src/dsp/lossless_sse2.c | 38 +++++++++-------- src/dsp/lossless_sse41.c | 8 ++-- 14 files changed, 213 insertions(+), 171 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 13df7dfa..30a1d43f 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -131,9 +131,10 @@ typedef struct { int max_value; int last_non_zero; } VP8Histogram; -typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred, +typedef void (*VP8CHisto)(const uint8_t* WEBP_RESTRICT ref, + const uint8_t* WEBP_RESTRICT pred, int start_block, int end_block, - VP8Histogram* const histo); + VP8Histogram* WEBP_RESTRICT const histo); extern VP8CHisto VP8CollectHistogram; // General-purpose util function to help VP8CollectHistogram(). void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1], diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c index e7cbfd9e..a02443f1 100644 --- a/src/dsp/lossless.c +++ b/src/dsp/lossless.c @@ -182,13 +182,13 @@ uint32_t VP8LPredictor13_C(const uint32_t* const left, } static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int x; (void)upper; for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK); } static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; uint32_t left = out[-1]; (void)upper; @@ -441,8 +441,8 @@ static int is_big_endian(void) { return (tmp.b[0] != 1); } -void VP8LConvertBGRAToRGB_C(const uint32_t* src, - int num_pixels, uint8_t* dst) { +void VP8LConvertBGRAToRGB_C(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const src_end = src + num_pixels; while (src < src_end) { const uint32_t argb = *src++; @@ -452,8 +452,8 @@ void VP8LConvertBGRAToRGB_C(const uint32_t* src, } } -void VP8LConvertBGRAToRGBA_C(const uint32_t* src, - int num_pixels, uint8_t* dst) { +void VP8LConvertBGRAToRGBA_C(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const src_end = src + num_pixels; while (src < src_end) { const uint32_t argb = *src++; @@ -464,8 +464,8 @@ void VP8LConvertBGRAToRGBA_C(const uint32_t* src, } } -void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src, - int num_pixels, uint8_t* dst) { +void VP8LConvertBGRAToRGBA4444_C(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const src_end = src + num_pixels; while (src < src_end) { const uint32_t argb = *src++; @@ -481,8 +481,8 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src, } } -void VP8LConvertBGRAToRGB565_C(const uint32_t* src, - int num_pixels, uint8_t* dst) { +void VP8LConvertBGRAToRGB565_C(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const src_end = src + num_pixels; while (src < src_end) { const uint32_t argb = 
*src++; @@ -498,8 +498,8 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src, } } -void VP8LConvertBGRAToBGR_C(const uint32_t* src, - int num_pixels, uint8_t* dst) { +void VP8LConvertBGRAToBGR_C(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const src_end = src + num_pixels; while (src < src_end) { const uint32_t argb = *src++; @@ -509,8 +509,8 @@ void VP8LConvertBGRAToBGR_C(const uint32_t* src, } } -static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst, - int swap_on_big_endian) { +static void CopyOrSwap(const uint32_t* WEBP_RESTRICT src, int num_pixels, + uint8_t* WEBP_RESTRICT dst, int swap_on_big_endian) { if (is_big_endian() == swap_on_big_endian) { const uint32_t* const src_end = src + num_pixels; while (src < src_end) { diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h index 53278cda..a72e3b2a 100644 --- a/src/dsp/lossless.h +++ b/src/dsp/lossless.h @@ -18,6 +18,7 @@ #include "src/webp/types.h" #include "src/webp/decode.h" +#include "src/dsp/dsp.h" #include "src/enc/histogram_enc.h" #include "src/utils/utils.h" @@ -60,7 +61,7 @@ uint32_t VP8LPredictor13_C(const uint32_t* const left, // These Add/Sub function expects upper[-1] and out[-1] to be readable. typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in, const uint32_t* upper, int num_pixels, - uint32_t* out); + uint32_t* WEBP_RESTRICT out); extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16]; extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16]; @@ -91,8 +92,8 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform, const uint32_t* const in, uint32_t* const out); // Color space conversion. -typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels, - uint8_t* dst); +typedef void (*VP8LConvertFunc)(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst); extern VP8LConvertFunc VP8LConvertBGRAToRGB; extern VP8LConvertFunc VP8LConvertBGRAToRGBA; extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444; @@ -145,29 +146,33 @@ void VP8LDspInit(void); typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels); extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed; -typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m, - uint32_t* dst, int num_pixels); +typedef void (*VP8LTransformColorFunc)( + const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT dst, + int num_pixels); extern VP8LTransformColorFunc VP8LTransformColor; typedef void (*VP8LCollectColorBlueTransformsFunc)( - const uint32_t* argb, int stride, + const uint32_t* WEBP_RESTRICT argb, int stride, int tile_width, int tile_height, int green_to_blue, int red_to_blue, uint32_t histo[]); extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms; typedef void (*VP8LCollectColorRedTransformsFunc)( - const uint32_t* argb, int stride, + const uint32_t* WEBP_RESTRICT argb, int stride, int tile_width, int tile_height, int green_to_red, uint32_t histo[]); extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms; // Expose some C-only fallback functions -void VP8LTransformColor_C(const VP8LMultipliers* const m, - uint32_t* data, int num_pixels); -void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels); -void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride, +void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m, + uint32_t* WEBP_RESTRICT data, int num_pixels); +void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* 
WEBP_RESTRICT argb_data, + int num_pixels); +void VP8LCollectColorRedTransforms_C(const uint32_t* WEBP_RESTRICT argb, + int stride, int tile_width, int tile_height, int green_to_red, uint32_t histo[]); -void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride, +void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb, + int stride, int tile_width, int tile_height, int green_to_blue, int red_to_blue, uint32_t histo[]); @@ -179,7 +184,8 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16]; // Huffman-cost related functions. typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length); -typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y, +typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* WEBP_RESTRICT X, + const uint32_t* WEBP_RESTRICT Y, int length); typedef uint64_t (*VP8LCombinedShannonEntropyFunc)(const uint32_t X[256], const uint32_t Y[256]); @@ -210,26 +216,30 @@ void VP8LBitEntropyInit(VP8LBitEntropy* const entropy); // codec specific heuristics. typedef void (*VP8LGetCombinedEntropyUnrefinedFunc)( const uint32_t X[], const uint32_t Y[], int length, - VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats); + VP8LBitEntropy* WEBP_RESTRICT const bit_entropy, + VP8LStreaks* WEBP_RESTRICT const stats); extern VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined; // Get the entropy for the distribution 'X'. -typedef void (*VP8LGetEntropyUnrefinedFunc)(const uint32_t X[], int length, - VP8LBitEntropy* const bit_entropy, - VP8LStreaks* const stats); +typedef void (*VP8LGetEntropyUnrefinedFunc)( + const uint32_t X[], int length, + VP8LBitEntropy* WEBP_RESTRICT const bit_entropy, + VP8LStreaks* WEBP_RESTRICT const stats); extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined; void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n, VP8LBitEntropy* const entropy); -typedef void (*VP8LAddVectorFunc)(const uint32_t* a, const uint32_t* b, - uint32_t* out, int size); +typedef void (*VP8LAddVectorFunc)(const uint32_t* WEBP_RESTRICT a, + const uint32_t* WEBP_RESTRICT b, + uint32_t* WEBP_RESTRICT out, int size); extern VP8LAddVectorFunc VP8LAddVector; -typedef void (*VP8LAddVectorEqFunc)(const uint32_t* a, uint32_t* out, int size); +typedef void (*VP8LAddVectorEqFunc)(const uint32_t* WEBP_RESTRICT a, + uint32_t* WEBP_RESTRICT out, int size); extern VP8LAddVectorEqFunc VP8LAddVectorEq; -void VP8LHistogramAdd(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out); +void VP8LHistogramAdd(const VP8LHistogram* WEBP_RESTRICT const a, + const VP8LHistogram* WEBP_RESTRICT const b, + VP8LHistogram* WEBP_RESTRICT const out); // ----------------------------------------------------------------------------- // PrefixEncode() @@ -239,11 +249,12 @@ typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1, // Returns the first index where array1 and array2 are different. 
extern VP8LVectorMismatchFunc VP8LVectorMismatch; -typedef void (*VP8LBundleColorMapFunc)(const uint8_t* const row, int width, - int xbits, uint32_t* dst); +typedef void (*VP8LBundleColorMapFunc)(const uint8_t* WEBP_RESTRICT const row, + int width, int xbits, + uint32_t* WEBP_RESTRICT dst); extern VP8LBundleColorMapFunc VP8LBundleColorMap; -void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits, - uint32_t* dst); +void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row, + int width, int xbits, uint32_t* WEBP_RESTRICT dst); // Must be called before calling any of the above methods. void VP8LEncDspInit(void); diff --git a/src/dsp/lossless_common.h b/src/dsp/lossless_common.h index 33f2c4dc..66eadf1f 100644 --- a/src/dsp/lossless_common.h +++ b/src/dsp/lossless_common.h @@ -194,15 +194,15 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) { // The predictor is added to the output pixel (which // is therefore considered as a residual) to get the final prediction. -#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD) \ -static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \ - int num_pixels, uint32_t* out) { \ - int x; \ - assert(upper != NULL); \ - for (x = 0; x < num_pixels; ++x) { \ - const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x); \ - out[x] = VP8LAddPixels(in[x], pred); \ - } \ +#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD) \ +static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \ + int num_pixels, uint32_t* WEBP_RESTRICT out) { \ + int x; \ + assert(upper != NULL); \ + for (x = 0; x < num_pixels; ++x) { \ + const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x); \ + out[x] = VP8LAddPixels(in[x], pred); \ + } \ } #ifdef __cplusplus diff --git a/src/dsp/lossless_enc.c b/src/dsp/lossless_enc.c index 6ba1b373..7e621a71 100644 --- a/src/dsp/lossless_enc.c +++ b/src/dsp/lossless_enc.c @@ -359,8 +359,8 @@ void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) { entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM; } -void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n, - VP8LBitEntropy* const entropy) { +void VP8LBitsEntropyUnrefined(const uint32_t* WEBP_RESTRICT const array, int n, + VP8LBitEntropy* WEBP_RESTRICT const entropy) { int i; VP8LBitEntropyInit(entropy); @@ -380,8 +380,10 @@ void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n, } static WEBP_INLINE void GetEntropyUnrefinedHelper( - uint32_t val, int i, uint32_t* const val_prev, int* const i_prev, - VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) { + uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev, + int* WEBP_RESTRICT const i_prev, + VP8LBitEntropy* WEBP_RESTRICT const bit_entropy, + VP8LStreaks* WEBP_RESTRICT const stats) { const int streak = i - *i_prev; // Gather info for the bit entropy. 
@@ -403,9 +405,10 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper( *i_prev = i; } -static void GetEntropyUnrefined_C(const uint32_t X[], int length, - VP8LBitEntropy* const bit_entropy, - VP8LStreaks* const stats) { +static void GetEntropyUnrefined_C( + const uint32_t X[], int length, + VP8LBitEntropy* WEBP_RESTRICT const bit_entropy, + VP8LStreaks* WEBP_RESTRICT const stats) { int i; int i_prev = 0; uint32_t x_prev = X[0]; @@ -424,11 +427,10 @@ static void GetEntropyUnrefined_C(const uint32_t X[], int length, bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy; } -static void GetCombinedEntropyUnrefined_C(const uint32_t X[], - const uint32_t Y[], - int length, - VP8LBitEntropy* const bit_entropy, - VP8LStreaks* const stats) { +static void GetCombinedEntropyUnrefined_C( + const uint32_t X[], const uint32_t Y[], int length, + VP8LBitEntropy* WEBP_RESTRICT const bit_entropy, + VP8LStreaks* WEBP_RESTRICT const stats) { int i = 1; int i_prev = 0; uint32_t xy_prev = X[0] + Y[0]; @@ -468,8 +470,8 @@ static WEBP_INLINE int8_t U32ToS8(uint32_t v) { return (int8_t)(v & 0xff); } -void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data, - int num_pixels) { +void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m, + uint32_t* WEBP_RESTRICT data, int num_pixels) { int i; for (i = 0; i < num_pixels; ++i) { const uint32_t argb = data[i]; @@ -505,7 +507,8 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue, return (new_blue & 0xff); } -void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride, +void VP8LCollectColorRedTransforms_C(const uint32_t* WEBP_RESTRICT argb, + int stride, int tile_width, int tile_height, int green_to_red, uint32_t histo[]) { while (tile_height-- > 0) { @@ -517,7 +520,8 @@ void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride, } } -void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride, +void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb, + int stride, int tile_width, int tile_height, int green_to_blue, int red_to_blue, uint32_t histo[]) { @@ -544,8 +548,8 @@ static int VectorMismatch_C(const uint32_t* const array1, } // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. 
-void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits, - uint32_t* dst) { +void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row, + int width, int xbits, uint32_t* WEBP_RESTRICT dst) { int x; if (xbits > 0) { const int bit_depth = 1 << (3 - xbits); @@ -576,7 +580,8 @@ static uint32_t ExtraCost_C(const uint32_t* population, int length) { return cost; } -static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, +static uint32_t ExtraCostCombined_C(const uint32_t* WEBP_RESTRICT X, + const uint32_t* WEBP_RESTRICT Y, int length) { int i; uint32_t cost = X[4] + Y[4] + X[5] + Y[5]; @@ -591,13 +596,15 @@ static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, //------------------------------------------------------------------------------ -static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out, - int size) { +static void AddVector_C(const uint32_t* WEBP_RESTRICT a, + const uint32_t* WEBP_RESTRICT b, + uint32_t* WEBP_RESTRICT out, int size) { int i; for (i = 0; i < size; ++i) out[i] = a[i] + b[i]; } -static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) { +static void AddVectorEq_C(const uint32_t* WEBP_RESTRICT a, + uint32_t* WEBP_RESTRICT out, int size) { int i; for (i = 0; i < size; ++i) out[i] += a[i]; } @@ -626,8 +633,9 @@ static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) { } \ } while (0) -void VP8LHistogramAdd(const VP8LHistogram* const a, - const VP8LHistogram* const b, VP8LHistogram* const out) { +void VP8LHistogramAdd(const VP8LHistogram* WEBP_RESTRICT const a, + const VP8LHistogram* WEBP_RESTRICT const b, + VP8LHistogram* WEBP_RESTRICT const out) { int i; const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); assert(a->palette_code_bits_ == b->palette_code_bits_); @@ -657,14 +665,14 @@ void VP8LHistogramAdd(const VP8LHistogram* const a, // Image transforms. 
static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK); (void)upper; } static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]); (void)upper; @@ -675,7 +683,8 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper, #define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \ static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \ const uint32_t* upper, \ - int num_pixels, uint32_t* out) { \ + int num_pixels, \ + uint32_t* WEBP_RESTRICT out) { \ int x; \ assert(upper != NULL); \ for (x = 0; x < num_pixels; ++x) { \ diff --git a/src/dsp/lossless_enc_mips32.c b/src/dsp/lossless_enc_mips32.c index 58529f9a..8e9d7358 100644 --- a/src/dsp/lossless_enc_mips32.c +++ b/src/dsp/lossless_enc_mips32.c @@ -149,8 +149,9 @@ static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) { // pY += 2; // } // return cost; -static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X, - const uint32_t* const Y, int length) { +static uint32_t ExtraCostCombined_MIPS32(const uint32_t* WEBP_RESTRICT const X, + const uint32_t* WEBP_RESTRICT const Y, + int length) { int i, temp0, temp1, temp2, temp3; const uint32_t* pX = &X[4]; const uint32_t* pY = &Y[4]; @@ -215,8 +216,10 @@ static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X, // Returns the various RLE counts static WEBP_INLINE void GetEntropyUnrefinedHelper( - uint32_t val, int i, uint32_t* const val_prev, int* const i_prev, - VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) { + uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev, + int* WEBP_RESTRICT const i_prev, + VP8LBitEntropy* WEBP_RESTRICT const bit_entropy, + VP8LStreaks* WEBP_RESTRICT const stats) { int* const pstreaks = &stats->streaks[0][0]; int* const pcnts = &stats->counts[0]; int temp0, temp1, temp2, temp3; @@ -241,9 +244,10 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper( *i_prev = i; } -static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length, - VP8LBitEntropy* const bit_entropy, - VP8LStreaks* const stats) { +static void GetEntropyUnrefined_MIPS32( + const uint32_t X[], int length, + VP8LBitEntropy* WEBP_RESTRICT const bit_entropy, + VP8LStreaks* WEBP_RESTRICT const stats) { int i; int i_prev = 0; uint32_t x_prev = X[0]; @@ -262,11 +266,10 @@ static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length, bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy; } -static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[], - const uint32_t Y[], - int length, - VP8LBitEntropy* const entropy, - VP8LStreaks* const stats) { +static void GetCombinedEntropyUnrefined_MIPS32( + const uint32_t X[], const uint32_t Y[], int length, + VP8LBitEntropy* WEBP_RESTRICT const entropy, + VP8LStreaks* WEBP_RESTRICT const stats) { int i = 1; int i_prev = 0; uint32_t xy_prev = X[0] + Y[0]; @@ -344,8 +347,9 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[], ASM_END_COMMON_0 \ ASM_END_COMMON_1 -static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb, - uint32_t* pout, int size) { +static void AddVector_MIPS32(const uint32_t* WEBP_RESTRICT pa, + const uint32_t* WEBP_RESTRICT pb, + uint32_t* WEBP_RESTRICT pout, int 
size) { uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; const int end = ((size) / 4) * 4; const uint32_t* const LoopEnd = pa + end; @@ -356,7 +360,8 @@ static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb, for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i]; } -static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) { +static void AddVectorEq_MIPS32(const uint32_t* WEBP_RESTRICT pa, + uint32_t* WEBP_RESTRICT pout, int size) { uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; const int end = ((size) / 4) * 4; const uint32_t* const LoopEnd = pa + end; diff --git a/src/dsp/lossless_enc_mips_dsp_r2.c b/src/dsp/lossless_enc_mips_dsp_r2.c index 6eaab0af..e10b8f7e 100644 --- a/src/dsp/lossless_enc_mips_dsp_r2.c +++ b/src/dsp/lossless_enc_mips_dsp_r2.c @@ -78,8 +78,9 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred, return (uint32_t)((int)(color_pred) * color) >> 5; } -static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m, - uint32_t* data, int num_pixels) { +static void TransformColor_MIPSdspR2( + const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT data, + int num_pixels) { int temp0, temp1, temp2, temp3, temp4, temp5; uint32_t argb, argb1, new_red, new_red1; const uint32_t G_to_R = m->green_to_red_; @@ -172,7 +173,8 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue, } static void CollectColorBlueTransforms_MIPSdspR2( - const uint32_t* argb, int stride, int tile_width, int tile_height, + const uint32_t* WEBP_RESTRICT argb, int stride, + int tile_width, int tile_height, int green_to_blue, int red_to_blue, uint32_t histo[]) { const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff); const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff); @@ -221,11 +223,9 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red, return (new_red & 0xff); } -static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb, - int stride, int tile_width, - int tile_height, - int green_to_red, - uint32_t histo[]) { +static void CollectColorRedTransforms_MIPSdspR2( + const uint32_t* WEBP_RESTRICT argb, int stride, + int tile_width, int tile_height, int green_to_red, uint32_t histo[]) { const int gtr = (green_to_red << 16) | (green_to_red & 0xffff); while (tile_height-- > 0) { int x; diff --git a/src/dsp/lossless_enc_msa.c b/src/dsp/lossless_enc_msa.c index 600dddfb..6d835ab7 100644 --- a/src/dsp/lossless_enc_msa.c +++ b/src/dsp/lossless_enc_msa.c @@ -48,8 +48,8 @@ dst = VSHF_UB(src, t0, mask1); \ } while (0) -static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data, - int num_pixels) { +static void TransformColor_MSA(const VP8LMultipliers* WEBP_RESTRICT const m, + uint32_t* WEBP_RESTRICT data, int num_pixels) { v16u8 src0, dst0; const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ | (m->green_to_red_ << 16)); diff --git a/src/dsp/lossless_enc_neon.c b/src/dsp/lossless_enc_neon.c index e32c7961..838204a7 100644 --- a/src/dsp/lossless_enc_neon.c +++ b/src/dsp/lossless_enc_neon.c @@ -72,8 +72,9 @@ static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data, //------------------------------------------------------------------------------ // Color Transform -static void TransformColor_NEON(const VP8LMultipliers* const m, - uint32_t* argb_data, int num_pixels) { +static void TransformColor_NEON(const VP8LMultipliers* WEBP_RESTRICT const m, + uint32_t* WEBP_RESTRICT argb_data, + int num_pixels) { // sign-extended multiplying 
constants, pre-shifted by 6. #define CST(X) (((int16_t)(m->X << 8)) >> 6) const int16_t rb[8] = { diff --git a/src/dsp/lossless_enc_sse2.c b/src/dsp/lossless_enc_sse2.c index 530acc37..f6706dd5 100644 --- a/src/dsp/lossless_enc_sse2.c +++ b/src/dsp/lossless_enc_sse2.c @@ -49,8 +49,9 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data, #define MK_CST_16(HI, LO) \ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) -static void TransformColor_SSE2(const VP8LMultipliers* const m, - uint32_t* argb_data, int num_pixels) { +static void TransformColor_SSE2(const VP8LMultipliers* WEBP_RESTRICT const m, + uint32_t* WEBP_RESTRICT argb_data, + int num_pixels) { const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_)); const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0); @@ -79,7 +80,8 @@ static void TransformColor_SSE2(const VP8LMultipliers* const m, //------------------------------------------------------------------------------ #define SPAN 8 -static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride, +static void CollectColorBlueTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb, + int stride, int tile_width, int tile_height, int green_to_blue, int red_to_blue, uint32_t histo[]) { @@ -126,7 +128,8 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride, } } -static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride, +static void CollectColorRedTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb, + int stride, int tile_width, int tile_height, int green_to_red, uint32_t histo[]) { const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red)); @@ -173,8 +176,9 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride, // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But // that's ok since the histogram values are less than 1<<28 (max picture size). #define LINE_SIZE 16 // 8 or 16 -static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out, - int size) { +static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a, + const uint32_t* WEBP_RESTRICT b, + uint32_t* WEBP_RESTRICT out, int size) { int i; for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); @@ -201,7 +205,8 @@ static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out, } } -static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) { +static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a, + uint32_t* WEBP_RESTRICT out, int size) { int i; for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); @@ -333,8 +338,9 @@ static int VectorMismatch_SSE2(const uint32_t* const array1, } // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. -static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits, - uint32_t* dst) { +static void BundleColorMap_SSE2(const uint8_t* WEBP_RESTRICT const row, + int width, int xbits, + uint32_t* WEBP_RESTRICT dst) { int x; assert(xbits >= 0); assert(xbits <= 3); @@ -423,7 +429,7 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0, // Predictor0: ARGB_BLACK. 
static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const __m128i black = _mm_set1_epi32((int)ARGB_BLACK); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -440,7 +446,8 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, #define GENERATE_PREDICTOR_1(X, IN) \ static void PredictorSub##X##_SSE2(const uint32_t* const in, \ const uint32_t* const upper, \ - int num_pixels, uint32_t* const out) { \ + int num_pixels, \ + uint32_t* WEBP_RESTRICT const out) { \ int i; \ for (i = 0; i + 4 <= num_pixels; i += 4) { \ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ @@ -462,7 +469,7 @@ GENERATE_PREDICTOR_1(4, upper[i - 1]) // Predictor4: TL // Predictor5: avg2(avg2(L, TR), T) static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); @@ -482,7 +489,8 @@ static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper, #define GENERATE_PREDICTOR_2(X, A, B) \ static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ - int num_pixels, uint32_t* out) { \ + int num_pixels, \ + uint32_t* WEBP_RESTRICT out) { \ int i; \ for (i = 0; i + 4 <= num_pixels; i += 4) { \ const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \ @@ -506,7 +514,7 @@ GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1]) // Predictor9: average(T, TR) // Predictor10: avg(avg(L,TL), avg(T, TR)). static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); @@ -541,7 +549,7 @@ static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B, } static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); @@ -567,7 +575,7 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper, // Predictor12: ClampedSubSubtractFull. 
static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const __m128i zero = _mm_setzero_si128(); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -596,7 +604,7 @@ static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper, // Predictors13: ClampedAddSubtractHalf static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const __m128i zero = _mm_setzero_si128(); for (i = 0; i + 2 <= num_pixels; i += 2) { diff --git a/src/dsp/lossless_enc_sse41.c b/src/dsp/lossless_enc_sse41.c index 9a0dcf9b..87ed056f 100644 --- a/src/dsp/lossless_enc_sse41.c +++ b/src/dsp/lossless_enc_sse41.c @@ -44,8 +44,9 @@ static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) { return HorizontalSum_SSE41(cost); } -static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a, - const uint32_t* const b, int length) { +static uint32_t ExtraCostCombined_SSE41(const uint32_t* WEBP_RESTRICT const a, + const uint32_t* WEBP_RESTRICT const b, + int length) { int i; __m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]), _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4])); @@ -95,7 +96,8 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data, #define MK_CST_16(HI, LO) \ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) -static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride, +static void CollectColorBlueTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb, + int stride, int tile_width, int tile_height, int green_to_blue, int red_to_blue, uint32_t histo[]) { @@ -141,7 +143,8 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride, } } -static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride, +static void CollectColorRedTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb, + int stride, int tile_width, int tile_height, int green_to_red, uint32_t histo[]) { diff --git a/src/dsp/lossless_neon.c b/src/dsp/lossless_neon.c index e9960db3..93f41cef 100644 --- a/src/dsp/lossless_neon.c +++ b/src/dsp/lossless_neon.c @@ -26,8 +26,8 @@ #if !defined(WORK_AROUND_GCC) // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for // gcc-4.8.x at least. 
-static void ConvertBGRAToRGBA_NEON(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const end = src + (num_pixels & ~15); for (; src < end; src += 16) { uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); @@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA_NEON(const uint32_t* src, VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs } -static void ConvertBGRAToBGR_NEON(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const end = src + (num_pixels & ~15); for (; src < end; src += 16) { const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); @@ -53,8 +53,8 @@ static void ConvertBGRAToBGR_NEON(const uint32_t* src, VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs } -static void ConvertBGRAToRGB_NEON(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const end = src + (num_pixels & ~15); for (; src < end; src += 16) { const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); @@ -71,8 +71,8 @@ static void ConvertBGRAToRGB_NEON(const uint32_t* src, static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 }; -static void ConvertBGRAToRGBA_NEON(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const end = src + (num_pixels & ~1); const uint8x8_t shuffle = vld1_u8(kRGBAShuffle); for (; src < end; src += 2) { @@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = { { 21, 22, 24, 25, 26, 28, 29, 30 } }; -static void ConvertBGRAToBGR_NEON(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const end = src + (num_pixels & ~7); const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]); const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]); @@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = { { 21, 20, 26, 25, 24, 30, 29, 28 } }; -static void ConvertBGRAToRGB_NEON(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const uint32_t* const end = src + (num_pixels & ~7); const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]); const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]); @@ -209,7 +209,7 @@ static uint32_t Predictor13_NEON(const uint32_t* const left, // Predictor0: ARGB_BLACK. static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK)); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -222,7 +222,7 @@ static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper, // Predictor1: left. 
static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const uint8x16_t zero = LOADQ_U32_AS_U8(0); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -248,7 +248,7 @@ static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper, #define GENERATE_PREDICTOR_1(X, IN) \ static void PredictorAdd##X##_NEON(const uint32_t* in, \ const uint32_t* upper, int num_pixels, \ - uint32_t* out) { \ + uint32_t* WEBP_RESTRICT out) { \ int i; \ for (i = 0; i + 4 <= num_pixels; i += 4) { \ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \ @@ -276,7 +276,7 @@ GENERATE_PREDICTOR_1(4, upper[i - 1]) } while (0) static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -301,7 +301,7 @@ static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper, // Predictor6: average(left, TL) static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -317,7 +317,7 @@ static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper, // Predictor7: average(left, T) static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -335,7 +335,7 @@ static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper, #define GENERATE_PREDICTOR_2(X, IN) \ static void PredictorAdd##X##_NEON(const uint32_t* in, \ const uint32_t* upper, int num_pixels, \ - uint32_t* out) { \ + uint32_t* WEBP_RESTRICT out) { \ int i; \ for (i = 0; i + 4 <= num_pixels; i += 4) { \ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \ @@ -363,7 +363,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1]) } while (0) static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -394,7 +394,7 @@ static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper, } while (0) static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -427,7 +427,7 @@ static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper, } while (0) static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1])); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -468,7 +468,7 @@ static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper, } while (0) static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { diff --git a/src/dsp/lossless_sse2.c 
b/src/dsp/lossless_sse2.c index 4b6a532c..5b68d1cf 100644 --- a/src/dsp/lossless_sse2.c +++ b/src/dsp/lossless_sse2.c @@ -186,7 +186,7 @@ static uint32_t Predictor13_SSE2(const uint32_t* const left, // Predictor0: ARGB_BLACK. static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const __m128i black = _mm_set1_epi32((int)ARGB_BLACK); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -202,7 +202,7 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, // Predictor1: left. static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; __m128i prev = _mm_set1_epi32((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -230,7 +230,8 @@ static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper, // per 8 bit channel. #define GENERATE_PREDICTOR_1(X, IN) \ static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ - int num_pixels, uint32_t* out) { \ + int num_pixels, \ + uint32_t* WEBP_RESTRICT out) { \ int i; \ for (i = 0; i + 4 <= num_pixels; i += 4) { \ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ @@ -259,7 +260,8 @@ GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2) #define GENERATE_PREDICTOR_2(X, IN) \ static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ - int num_pixels, uint32_t* out) { \ + int num_pixels, \ + uint32_t* WEBP_RESTRICT out) { \ int i; \ for (i = 0; i + 4 <= num_pixels; i += 4) { \ const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \ @@ -297,7 +299,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1]) } while (0) static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; __m128i L = _mm_cvtsi32_si128((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { @@ -344,7 +346,7 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, } while (0) static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; __m128i pa; __m128i L = _mm_cvtsi32_si128((int)out[-1]); @@ -395,7 +397,7 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, } while (0) static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { + int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const __m128i zero = _mm_setzero_si128(); const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]); @@ -490,8 +492,8 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m, //------------------------------------------------------------------------------ // Color-space conversion functions -static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels, - uint8_t* dst) { +static void ConvertBGRAToRGB_SSE2(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; @@ -526,8 +528,8 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels, } } -static void ConvertBGRAToRGBA_SSE2(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_SSE2(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const __m128i red_blue_mask = 
_mm_set1_epi32(0x00ff00ff); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; @@ -554,8 +556,9 @@ static void ConvertBGRAToRGBA_SSE2(const uint32_t* src, } } -static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* WEBP_RESTRICT src, + int num_pixels, + uint8_t* WEBP_RESTRICT dst) { const __m128i mask_0x0f = _mm_set1_epi8(0x0f); const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0); const __m128i* in = (const __m128i*)src; @@ -590,8 +593,9 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src, } } -static void ConvertBGRAToRGB565_SSE2(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src, + int num_pixels, + uint8_t* WEBP_RESTRICT dst) { const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0); const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8); const __m128i mask_0x07 = _mm_set1_epi8(0x07); @@ -631,8 +635,8 @@ static void ConvertBGRAToRGB565_SSE2(const uint32_t* src, } } -static void ConvertBGRAToBGR_SSE2(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_SSE2(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff); const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0); const __m128i* in = (const __m128i*)src; diff --git a/src/dsp/lossless_sse41.c b/src/dsp/lossless_sse41.c index bb7ce761..a2d19144 100644 --- a/src/dsp/lossless_sse41.c +++ b/src/dsp/lossless_sse41.c @@ -77,8 +77,8 @@ static void TransformColorInverse_SSE41(const VP8LMultipliers* const m, } \ } while (0) -static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels, - uint8_t* dst) { +static void ConvertBGRAToRGB_SSE41(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, @@ -95,8 +95,8 @@ static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels, } } -static void ConvertBGRAToBGR_SSE41(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_SSE41(const uint32_t* WEBP_RESTRICT src, + int num_pixels, uint8_t* WEBP_RESTRICT dst) { const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, From 35915b389eab7fa08c6db4feeb52aa0ff2dbde03 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 16 Aug 2024 16:55:00 -0700 Subject: [PATCH 6/8] dsp/rescaler*: use WEBP_RESTRICT qualifier Some improvement in the C code. No changes in NEON or SSE2 w/ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I2316122db893f48f0afda90a147c83cac7f07526 --- src/dsp/dsp.h | 20 ++++++++++++-------- src/dsp/rescaler.c | 11 ++++++----- src/dsp/rescaler_mips32.c | 8 ++++---- src/dsp/rescaler_msa.c | 27 ++++++++++++++------------- src/dsp/rescaler_neon.c | 4 ++-- src/dsp/rescaler_sse2.c | 18 ++++++++---------- 6 files changed, 46 insertions(+), 42 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 30a1d43f..b7c2acf0 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -366,8 +366,9 @@ struct WebPRescaler; // Import a row of data and save its contribution in the rescaler. // 'channel' denotes the channel number to be imported. 
'Expand' corresponds to // the wrk->x_expand case. Otherwise, 'Shrink' is to be used. -typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk, - const uint8_t* src); +typedef void (*WebPRescalerImportRowFunc)( + struct WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src); extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand; extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink; @@ -380,16 +381,19 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand; extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink; // Plain-C implementation, as fall-back. -extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk, - const uint8_t* src); -extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk, - const uint8_t* src); +extern void WebPRescalerImportRowExpand_C( + struct WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src); +extern void WebPRescalerImportRowShrink_C( + struct WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src); extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk); extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk); // Main entry calls: -extern void WebPRescalerImportRow(struct WebPRescaler* const wrk, - const uint8_t* src); +extern void WebPRescalerImportRow( + struct WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src); // Export one row (starting at x_out position) from rescaler. extern void WebPRescalerExportRow(struct WebPRescaler* const wrk); diff --git a/src/dsp/rescaler.c b/src/dsp/rescaler.c index 325d8be1..a96ca669 100644 --- a/src/dsp/rescaler.c +++ b/src/dsp/rescaler.c @@ -26,8 +26,8 @@ //------------------------------------------------------------------------------ // Row import -void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk, - const uint8_t* src) { +void WebPRescalerImportRowExpand_C(WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; int channel; @@ -59,8 +59,8 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk, } } -void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk, - const uint8_t* src) { +void WebPRescalerImportRowShrink_C(WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; int channel; @@ -158,7 +158,8 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) { //------------------------------------------------------------------------------ // Main entry calls -void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) { +void WebPRescalerImportRow(WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src) { assert(!WebPRescalerInputDone(wrk)); if (!wrk->x_expand) { WebPRescalerImportRowShrink(wrk, src); diff --git a/src/dsp/rescaler_mips32.c b/src/dsp/rescaler_mips32.c index 61f63c61..b5168caa 100644 --- a/src/dsp/rescaler_mips32.c +++ b/src/dsp/rescaler_mips32.c @@ -21,8 +21,8 @@ //------------------------------------------------------------------------------ // Row import -static void ImportRowShrink_MIPS32(WebPRescaler* const wrk, - const uint8_t* src) { +static void ImportRowShrink_MIPS32(WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * 
wrk->num_channels; const int fx_scale = wrk->fx_scale; @@ -81,8 +81,8 @@ static void ImportRowShrink_MIPS32(WebPRescaler* const wrk, } } -static void ImportRowExpand_MIPS32(WebPRescaler* const wrk, - const uint8_t* src) { +static void ImportRowExpand_MIPS32(WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; const int x_add = wrk->x_add; diff --git a/src/dsp/rescaler_msa.c b/src/dsp/rescaler_msa.c index 256dbdd4..954d0fdf 100644 --- a/src/dsp/rescaler_msa.c +++ b/src/dsp/rescaler_msa.c @@ -114,9 +114,9 @@ dst = __msa_copy_s_w((v4i32)t0, 0); \ } while (0) -static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst, - int length, - WebPRescaler* const wrk) { +static WEBP_INLINE void ExportRowExpand_0( + const uint32_t* WEBP_RESTRICT frow, uint8_t* WEBP_RESTRICT dst, int length, + WebPRescaler* WEBP_RESTRICT const wrk) { const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale); const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); const v4i32 zero = { 0 }; @@ -171,9 +171,10 @@ static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst, } } -static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow, - uint8_t* dst, int length, - WebPRescaler* const wrk) { +static WEBP_INLINE void ExportRowExpand_1( + const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow, + uint8_t* WEBP_RESTRICT dst, int length, + WebPRescaler* WEBP_RESTRICT const wrk) { const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub); const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B); const v4i32 B1 = __msa_fill_w(B); @@ -262,10 +263,10 @@ static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { } #if 0 // disabled for now. 
TODO(skal): make match the C-code -static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, - uint8_t* dst, int length, - const uint32_t yscale, - WebPRescaler* const wrk) { +static WEBP_INLINE void ExportRowShrink_0( + const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow, + uint8_t* WEBP_RESTRICT dst, int length, const uint32_t yscale, + WebPRescaler* WEBP_RESTRICT const wrk) { const v4u32 y_scale = (v4u32)__msa_fill_w(yscale); const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale); const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); @@ -348,9 +349,9 @@ static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, } } -static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst, - int length, - WebPRescaler* const wrk) { +static WEBP_INLINE void ExportRowShrink_1( + uint32_t* WEBP_RESTRICT irow, uint8_t* WEBP_RESTRICT dst, int length, + WebPRescaler* WEBP_RESTRICT const wrk) { const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale); const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); const v4i32 zero = { 0 }; diff --git a/src/dsp/rescaler_neon.c b/src/dsp/rescaler_neon.c index 957a92db..ab4ddc00 100644 --- a/src/dsp/rescaler_neon.c +++ b/src/dsp/rescaler_neon.c @@ -45,8 +45,8 @@ #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work" #endif -static uint32x4_t Interpolate_NEON(const rescaler_t* const frow, - const rescaler_t* const irow, +static uint32x4_t Interpolate_NEON(const rescaler_t* WEBP_RESTRICT const frow, + const rescaler_t* WEBP_RESTRICT const irow, uint32_t A, uint32_t B) { LOAD_32x4(frow, A0); LOAD_32x4(irow, B0); diff --git a/src/dsp/rescaler_sse2.c b/src/dsp/rescaler_sse2.c index 3f18e94e..e898e2ac 100644 --- a/src/dsp/rescaler_sse2.c +++ b/src/dsp/rescaler_sse2.c @@ -43,8 +43,8 @@ static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) { *out = _mm_unpacklo_epi8(A, zero); } -static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, - const uint8_t* src) { +static void RescalerImportRowExpand_SSE2(WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src) { rescaler_t* frow = wrk->frow; const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels; const int x_add = wrk->x_add; @@ -109,8 +109,8 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, assert(accum == 0); } -static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, - const uint8_t* src) { +static void RescalerImportRowShrink_SSE2(WebPRescaler* WEBP_RESTRICT const wrk, + const uint8_t* WEBP_RESTRICT src) { const int x_sub = wrk->x_sub; int accum = 0; const __m128i zero = _mm_setzero_si128(); @@ -168,12 +168,10 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, // Row export // load *src as epi64, multiply by mult and store result in [out0 ... 
out3] -static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src, - const __m128i* const mult, - __m128i* const out0, - __m128i* const out1, - __m128i* const out2, - __m128i* const out3) { +static WEBP_INLINE void LoadDispatchAndMult_SSE2( + const rescaler_t* WEBP_RESTRICT const src, const __m128i* const mult, + __m128i* const out0, __m128i* const out1, __m128i* const out2, + __m128i* const out3) { const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0)); const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4)); const __m128i A2 = _mm_srli_epi64(A0, 32); From 23bbafbeb8ea2bbd701738d712855a5bb7330838 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 16 Aug 2024 19:02:19 -0700 Subject: [PATCH 7/8] dsp/upsampling*: use WEBP_RESTRICT qualifier Better vectorization in the C code, fewer instructions in NEON, and some code reordering / better register usage in SSE2/SSE4 w/ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: Ib29980f778ad3dbb952178ad8dee39b8673c4ff8 --- src/dsp/dsp.h | 29 +++++++++-------- src/dsp/upsampling.c | 36 ++++++++++++++------- src/dsp/upsampling_mips_dsp_r2.c | 18 +++++++---- src/dsp/upsampling_msa.c | 55 +++++++++++++++++++++----------- src/dsp/upsampling_neon.c | 17 ++++++---- src/dsp/upsampling_sse2.c | 29 +++++++++++------ src/dsp/upsampling_sse41.c | 29 +++++++++++------ 7 files changed, 137 insertions(+), 76 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index b7c2acf0..8e0b4143 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -285,10 +285,10 @@ void VP8DspInit(void); // Convert a pair of y/u/v lines together to the output rgb/a colorspace. // bottom_y can be NULL if only one line of output is needed (at top/bottom). typedef void (*WebPUpsampleLinePairFunc)( - const uint8_t* top_y, const uint8_t* bottom_y, - const uint8_t* top_u, const uint8_t* top_v, - const uint8_t* cur_u, const uint8_t* cur_v, - uint8_t* top_dst, uint8_t* bottom_dst, int len); + const uint8_t* WEBP_RESTRICT top_y, const uint8_t* WEBP_RESTRICT bottom_y, + const uint8_t* WEBP_RESTRICT top_u, const uint8_t* WEBP_RESTRICT top_v, + const uint8_t* WEBP_RESTRICT cur_u, const uint8_t* WEBP_RESTRICT cur_v, + uint8_t* WEBP_RESTRICT top_dst, uint8_t* WEBP_RESTRICT bottom_dst, int len); #ifdef FANCY_UPSAMPLING @@ -298,13 +298,15 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; #endif // FANCY_UPSAMPLING // Per-row point-sampling methods. 
-typedef void (*WebPSamplerRowFunc)(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len); +typedef void (*WebPSamplerRowFunc)(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len); // Generic function to apply 'WebPSamplerRowFunc' to the whole plane: -void WebPSamplerProcessPlane(const uint8_t* y, int y_stride, - const uint8_t* u, const uint8_t* v, int uv_stride, - uint8_t* dst, int dst_stride, +void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, int uv_stride, + uint8_t* WEBP_RESTRICT dst, int dst_stride, int width, int height, WebPSamplerRowFunc func); // Sampling functions to convert rows of YUV to RGB(A) @@ -316,9 +318,10 @@ extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */]; WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last); // YUV444->RGB converters -typedef void (*WebPYUV444Converter)(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len); +typedef void (*WebPYUV444Converter)(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len); extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; diff --git a/src/dsp/upsampling.c b/src/dsp/upsampling.c index 983b9c42..5953fe48 100644 --- a/src/dsp/upsampling.c +++ b/src/dsp/upsampling.c @@ -35,10 +35,14 @@ WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST]; #define LOAD_UV(u, v) ((u) | ((v) << 16)) #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bottom_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bottom_dst, int len) { \ int x; \ const int last_pixel_pair = (len - 1) >> 1; \ uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ @@ -136,10 +140,14 @@ static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y, #if !defined(FANCY_UPSAMPLING) #define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* bot_u, const uint8_t* bot_v, \ - uint8_t* top_dst, uint8_t* bot_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bot_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT bot_u, \ + const uint8_t* WEBP_RESTRICT bot_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bot_dst, int len) { \ const int half_len = len >> 1; \ int x; \ assert(top_dst != NULL); \ @@ -178,10 +186,14 @@ WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) { // YUV444 converter #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \ -extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len); \ -void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +extern void 
FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len); \ +void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i; \ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \ } diff --git a/src/dsp/upsampling_mips_dsp_r2.c b/src/dsp/upsampling_mips_dsp_r2.c index 10d499d7..cbe8e71d 100644 --- a/src/dsp/upsampling_mips_dsp_r2.c +++ b/src/dsp/upsampling_mips_dsp_r2.c @@ -143,10 +143,14 @@ static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, #define LOAD_UV(u, v) ((u) | ((v) << 16)) #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bottom_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bottom_dst, int len) { \ int x; \ const int last_pixel_pair = (len - 1) >> 1; \ uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ @@ -241,8 +245,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) { // YUV444 converter #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i; \ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \ } diff --git a/src/dsp/upsampling_msa.c b/src/dsp/upsampling_msa.c index f2e03e85..72a526bc 100644 --- a/src/dsp/upsampling_msa.c +++ b/src/dsp/upsampling_msa.c @@ -320,8 +320,10 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) { } #if !defined(WEBP_REDUCE_CSP) -static void YuvToRgbLine(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToRgbLine(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B; while (length >= 16) { CALC_RGB16(y, u, v, R, G, B); @@ -347,8 +349,10 @@ static void YuvToRgbLine(const uint8_t* y, const uint8_t* u, } } -static void YuvToBgrLine(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToBgrLine(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B; while (length >= 16) { CALC_RGB16(y, u, v, R, G, B); @@ -375,8 +379,10 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u, } #endif // WEBP_REDUCE_CSP -static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToRgbaLine(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B; const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); while (length >= 16) { @@ -403,8 +409,10 @@ 
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u, } } -static void YuvToBgraLine(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToBgraLine(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B; const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); while (length >= 16) { @@ -432,8 +440,10 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u, } #if !defined(WEBP_REDUCE_CSP) -static void YuvToArgbLine(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToArgbLine(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B; const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); while (length >= 16) { @@ -460,8 +470,10 @@ static void YuvToArgbLine(const uint8_t* y, const uint8_t* u, } } -static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToRgba4444Line(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B, RG, BA, tmp0, tmp1; while (length >= 16) { #if (WEBP_SWAP_16BIT_CSP == 1) @@ -496,8 +508,10 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u, } } -static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToRgb565Line(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B, RG, GB, tmp0, tmp1; while (length >= 16) { #if (WEBP_SWAP_16BIT_CSP == 1) @@ -564,11 +578,14 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, } while (0) #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bot_dst, int len) \ -{ \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bot_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bot_dst, int len) { \ int size = (len - 1) >> 1; \ uint8_t temp_u[64]; \ uint8_t temp_v[64]; \ diff --git a/src/dsp/upsampling_neon.c b/src/dsp/upsampling_neon.c index f39d75e0..2bd3e931 100644 --- a/src/dsp/upsampling_neon.c +++ b/src/dsp/upsampling_neon.c @@ -58,8 +58,9 @@ } while (0) // Turn the macro into a function for reducing code-size when non-critical -static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2, - uint8_t* out) { +static void Upsample16Pixels_NEON(const uint8_t* WEBP_RESTRICT const r1, + const uint8_t* WEBP_RESTRICT const r2, + uint8_t* WEBP_RESTRICT const out) { UPSAMPLE_16PIXELS(r1, r2, out); } @@ -190,10 +191,14 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 }; } #define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ +static void 
FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bottom_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bottom_dst, int len) { \ int block; \ /* 16 byte aligned array to cache reconstructed u and v */ \ uint8_t uv_buf[2 * 32 + 15]; \ diff --git a/src/dsp/upsampling_sse2.c b/src/dsp/upsampling_sse2.c index 77b4f722..36226fb1 100644 --- a/src/dsp/upsampling_sse2.c +++ b/src/dsp/upsampling_sse2.c @@ -88,8 +88,9 @@ } while (0) // Turn the macro into a function for reducing code-size when non-critical -static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[], - uint8_t* const out) { +static void Upsample32Pixels_SSE2(const uint8_t* WEBP_RESTRICT const r1, + const uint8_t* WEBP_RESTRICT const r2, + uint8_t* WEBP_RESTRICT const out) { UPSAMPLE_32PIXELS(r1, r2, out); } @@ -114,10 +115,14 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[], } while (0) #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bottom_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bottom_dst, int len) { \ int uv_pos, pos; \ /* 16byte-aligned array to cache reconstructed u and v */ \ uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ @@ -215,10 +220,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; extern void WebPInitYUV444ConvertersSSE2(void); #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \ -extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len); \ -static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len); \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i; \ const int max_len = len & ~31; \ for (i = 0; i < max_len; i += 32) { \ diff --git a/src/dsp/upsampling_sse41.c b/src/dsp/upsampling_sse41.c index a880974a..823633c4 100644 --- a/src/dsp/upsampling_sse41.c +++ b/src/dsp/upsampling_sse41.c @@ -90,8 +90,9 @@ } while (0) // Turn the macro into a function for reducing code-size when non-critical -static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[], - uint8_t* const out) { +static void Upsample32Pixels_SSE41(const uint8_t* WEBP_RESTRICT const r1, + const uint8_t* WEBP_RESTRICT const r2, + uint8_t* WEBP_RESTRICT const out) { UPSAMPLE_32PIXELS(r1, r2, out); } @@ -116,10 +117,14 @@ static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[], } while (0) #define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* 
cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bottom_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bottom_dst, int len) { \ int uv_pos, pos; \ /* 16byte-aligned array to cache reconstructed u and v */ \ uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ @@ -202,10 +207,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; extern void WebPInitYUV444ConvertersSSE41(void); #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \ -extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len); \ -static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len); \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i; \ const int max_len = len & ~31; \ for (i = 0; i < max_len; i += 32) { \ From 2dd5eb9862d4822d2746cfb55f4b59b53bda99eb Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 16 Aug 2024 19:26:36 -0700 Subject: [PATCH 8/8] dsp/yuv*: use WEBP_RESTRICT qualifier Better vectorization in the C code, fewer instructions / comparisons in NEON, and fewer reloads in SSE2/SSE4 w/ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I07a7e36a2dce8632c71c0fbbeef94dc51453eaf7 --- src/dsp/dsp.h | 27 +++++--- src/dsp/yuv.c | 48 ++++++++------ src/dsp/yuv.h | 54 ++++++++++------ src/dsp/yuv_mips32.c | 7 +- src/dsp/yuv_mips_dsp_r2.c | 7 +- src/dsp/yuv_neon.c | 18 ++++-- src/dsp/yuv_sse2.c | 130 ++++++++++++++++++++++---------------- src/dsp/yuv_sse41.c | 65 +++++++++++-------- 8 files changed, 218 insertions(+), 138 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 8e0b4143..1b37ef4b 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -337,26 +337,35 @@ void WebPInitYUV444Converters(void); // ARGB -> YUV converters // Convert ARGB samples to luma Y. -extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width); +extern void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width); // Convert ARGB samples to U/V with downsampling. do_store should be '1' for // even lines and '0' for odd ones. 'src_width' is the original width, not // the U/V one. 
-extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v, +extern void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int src_width, int do_store); // Convert a row of accumulated (four-values) of rgba32 toward U/V -extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width); +extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width); // Convert RGB or BGR to Y -extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width); -extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width); +extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width); +extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width); // used for plain-C fallback. -extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v, +extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int src_width, int do_store); -extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width); +extern void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width); // Must be called before using the above. void WebPInitConvertARGBToYUV(void); diff --git a/src/dsp/yuv.c b/src/dsp/yuv.c index 8a04b85d..c1320f28 100644 --- a/src/dsp/yuv.c +++ b/src/dsp/yuv.c @@ -20,9 +20,10 @@ // Plain-C version #define ROW_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* y, \ - const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ const uint8_t* const end = dst + (len & ~1) * (XSTEP); \ while (dst != end) { \ FUNC(y[0], u[0], v[0], dst); \ @@ -49,9 +50,10 @@ ROW_FUNC(YuvToRgb565Row, VP8YuvToRgb565, 2) #undef ROW_FUNC // Main call for processing a plane with a WebPSamplerRowFunc function: -void WebPSamplerProcessPlane(const uint8_t* y, int y_stride, - const uint8_t* u, const uint8_t* v, int uv_stride, - uint8_t* dst, int dst_stride, +void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, int uv_stride, + uint8_t* WEBP_RESTRICT dst, int dst_stride, int width, int height, WebPSamplerRowFunc func) { int j; for (j = 0; j < height; ++j) { @@ -117,7 +119,8 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) { //----------------------------------------------------------------------------- // ARGB -> YUV converters -static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_C(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i < width; ++i) { const uint32_t p = argb[i]; @@ -126,7 +129,8 @@ static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) { } } -void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v, +void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, int src_width, int do_store) { // No rounding. Last pixel is dealt with separately. 
const int uv_width = src_width >> 1; @@ -169,22 +173,25 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v, //----------------------------------------------------------------------------- -static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_C(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i < width; ++i, rgb += 3) { y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); } } -static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_C(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i < width; ++i, bgr += 3) { y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); } } -void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width) { +void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int width) { int i; for (i = 0; i < width; i += 1, rgb += 4) { const int r = rgb[0], g = rgb[1], b = rgb[2]; @@ -195,13 +202,18 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, //----------------------------------------------------------------------------- -void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width); -void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width); -void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width); +void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width); +void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width); +void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width); -void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width); -void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v, +void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width); +void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, int src_width, int do_store); extern void WebPInitConvertARGBToYUVSSE2(void); diff --git a/src/dsp/yuv.h b/src/dsp/yuv.h index 66a397d1..91fdba12 100644 --- a/src/dsp/yuv.h +++ b/src/dsp/yuv.h @@ -149,20 +149,34 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v, #if defined(WEBP_USE_SSE2) // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst. 
-void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst); -void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); +void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); #endif // WEBP_USE_SSE2 @@ -172,10 +186,14 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, #if defined(WEBP_USE_SSE41) // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst. 
-void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); +void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); #endif // WEBP_USE_SSE41 diff --git a/src/dsp/yuv_mips32.c b/src/dsp/yuv_mips32.c index 9d0a8878..1f634858 100644 --- a/src/dsp/yuv_mips32.c +++ b/src/dsp/yuv_mips32.c @@ -22,9 +22,10 @@ // simple point-sampling #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \ -static void FUNC_NAME(const uint8_t* y, \ - const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i, r, g, b; \ int temp0, temp1, temp2, temp3, temp4; \ for (i = 0; i < (len >> 1); i++) { \ diff --git a/src/dsp/yuv_mips_dsp_r2.c b/src/dsp/yuv_mips_dsp_r2.c index cc8afcc7..816340fe 100644 --- a/src/dsp/yuv_mips_dsp_r2.c +++ b/src/dsp/yuv_mips_dsp_r2.c @@ -69,9 +69,10 @@ : "memory", "hi", "lo" \ #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \ -static void FUNC_NAME(const uint8_t* y, \ - const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i; \ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \ const int t_con_1 = 26149; \ diff --git a/src/dsp/yuv_neon.c b/src/dsp/yuv_neon.c index ff77b009..b1b7c604 100644 --- a/src/dsp/yuv_neon.c +++ b/src/dsp/yuv_neon.c @@ -46,7 +46,8 @@ static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, return vqmovn_u16(Y2); } -static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_NEON(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) { const uint8x8x3_t RGB = vld3_u8(rgb); @@ -58,7 +59,8 @@ static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) { } } -static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_NEON(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) { const uint8x8x3_t BGR = vld3_u8(bgr); @@ -70,7 +72,8 @@ static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) { } } -static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_NEON(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i + 8 <= width; i += 8) { const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]); @@ -114,8 +117,9 @@ static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) { MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \ } while (0) -static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width) { +static void ConvertRGBA32ToUV_NEON(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width) { int i; for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) 
{ const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb); @@ -131,7 +135,9 @@ static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb, } } -static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v, +static void ConvertARGBToUV_NEON(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int src_width, int do_store) { int i; for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) { diff --git a/src/dsp/yuv_sse2.c b/src/dsp/yuv_sse2.c index 01a48f9a..a96b4522 100644 --- a/src/dsp/yuv_sse2.c +++ b/src/dsp/yuv_sse2.c @@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) { } // Convert 32 samples of YUV444 to R/G/B -static void YUV444ToRGB_SSE2(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, +static void YUV444ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y, + const uint8_t* WEBP_RESTRICT const u, + const uint8_t* WEBP_RESTRICT const v, __m128i* const R, __m128i* const G, __m128i* const B) { const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u), @@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE2(const uint8_t* const y, } // Convert 32 samples of YUV420 to R/G/B -static void YUV420ToRGB_SSE2(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, +static void YUV420ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y, + const uint8_t* WEBP_RESTRICT const u, + const uint8_t* WEBP_RESTRICT const v, __m128i* const R, __m128i* const G, __m128i* const B) { const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u), @@ -108,7 +108,7 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R, const __m128i* const G, const __m128i* const B, const __m128i* const A, - uint8_t* const dst) { + uint8_t* WEBP_RESTRICT const dst) { const __m128i rb = _mm_packus_epi16(*R, *B); const __m128i ga = _mm_packus_epi16(*G, *A); const __m128i rg = _mm_unpacklo_epi8(rb, ga); @@ -120,11 +120,9 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R, } // Pack R/G/B/A results into 16b output. -static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - const __m128i* const A, - uint8_t* const dst) { +static WEBP_INLINE void PackAndStore4444_SSE2( + const __m128i* const R, const __m128i* const G, const __m128i* const B, + const __m128i* const A, uint8_t* WEBP_RESTRICT const dst) { #if (WEBP_SWAP_16BIT_CSP == 0) const __m128i rg0 = _mm_packus_epi16(*R, *G); const __m128i ba0 = _mm_packus_epi16(*B, *A); @@ -145,7 +143,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R, static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R, const __m128i* const G, const __m128i* const B, - uint8_t* const dst) { + uint8_t* WEBP_RESTRICT const dst) { const __m128i r0 = _mm_packus_epi16(*R, *R); const __m128i g0 = _mm_packus_epi16(*G, *G); const __m128i b0 = _mm_packus_epi16(*B, *B); @@ -170,7 +168,7 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R, static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1, __m128i* const in2, __m128i* const in3, __m128i* const in4, __m128i* const in5, - uint8_t* const rgb) { + uint8_t* WEBP_RESTRICT const rgb) { // The input is 6 registers of sixteen 8b but for the sake of explanation, // let's take 6 registers of four 8b values. 
// To pack, we will keep taking one every two 8b integer and move it @@ -193,8 +191,10 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1, _mm_storeu_si128((__m128i*)(rgb + 80), *in5); } -void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { @@ -204,8 +204,10 @@ void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { @@ -215,8 +217,10 @@ void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { @@ -226,8 +230,10 @@ void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst) { +void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 16) { @@ -237,8 +243,10 @@ void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u, } } -void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { int n; for (n = 0; n < 32; n += 8, dst += 16) { __m128i R, G, B; @@ -247,8 +255,10 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; @@ -269,8 +279,10 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); } -void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; @@ -294,9 +306,10 @@ void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, //----------------------------------------------------------------------------- // Arbitrary-length row conversion functions 
-static void YuvToRgbaRow_SSE2(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToRgbaRow_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { @@ -316,9 +329,10 @@ static void YuvToRgbaRow_SSE2(const uint8_t* y, } } -static void YuvToBgraRow_SSE2(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToBgraRow_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { @@ -338,9 +352,10 @@ static void YuvToBgraRow_SSE2(const uint8_t* y, } } -static void YuvToArgbRow_SSE2(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToArgbRow_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { @@ -360,9 +375,10 @@ static void YuvToArgbRow_SSE2(const uint8_t* y, } } -static void YuvToRgbRow_SSE2(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToRgbRow_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; @@ -397,9 +413,10 @@ static void YuvToRgbRow_SSE2(const uint8_t* y, } } -static void YuvToBgrRow_SSE2(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToBgrRow_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; @@ -471,7 +488,7 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2( // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // Similar to PlanarTo24bHelper(), but in reverse order. 
static WEBP_INLINE void RGB24PackedToPlanar_SSE2( - const uint8_t* const rgb, __m128i* const out /*out[6]*/) { + const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) { __m128i tmp[6]; tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0)); tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16)); @@ -488,8 +505,8 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE2( } // Convert 8 packed ARGB to r[], g[], b[] -static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb, - __m128i* const rgb /*in[6]*/) { +static WEBP_INLINE void RGB32PackedToPlanar_SSE2( + const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) { const __m128i zero = _mm_setzero_si128(); __m128i a0 = LOAD_16(argb + 0); __m128i a1 = LOAD_16(argb + 4); @@ -562,7 +579,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R, #undef MK_CST_16 #undef TRANSFORM -static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; rgb += 3 * 16 * 2) { @@ -596,7 +614,8 @@ static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) { } } -static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; bgr += 3 * 16 * 2) { @@ -630,7 +649,8 @@ static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) { } } -static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~15; int i; for (i = 0; i < max_width; i += 16) { @@ -658,8 +678,9 @@ static void HorizontalAddPack_SSE2(const __m128i* const A, *out = _mm_packs_epi32(C, D); } -static void ConvertARGBToUV_SSE2(const uint32_t* argb, - uint8_t* u, uint8_t* v, +static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int src_width, int do_store) { const int max_width = src_width & ~31; int i; @@ -695,7 +716,7 @@ static void ConvertARGBToUV_SSE2(const uint32_t* argb, // Convert 16 packed ARGB 16b-values to r[], g[], b[] static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2( - const uint16_t* const rgbx, + const uint16_t* WEBP_RESTRICT const rgbx, __m128i* const r, __m128i* const g, __m128i* const b) { const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x @@ -715,8 +736,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2( *b = _mm_unpacklo_epi64(B1, B3); } -static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width) { +static void ConvertRGBA32ToUV_SSE2(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width) { const int max_width = width & ~15; const uint16_t* const last_rgb = rgb + 4 * max_width; while (rgb < last_rgb) { diff --git a/src/dsp/yuv_sse41.c b/src/dsp/yuv_sse41.c index f79b802e..071e4908 100644 --- a/src/dsp/yuv_sse41.c +++ b/src/dsp/yuv_sse41.c @@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) { } // Convert 32 samples of YUV444 to R/G/B -static void YUV444ToRGB_SSE41(const uint8_t* 
const y, - const uint8_t* const u, - const uint8_t* const v, +static void YUV444ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y, + const uint8_t* WEBP_RESTRICT const u, + const uint8_t* WEBP_RESTRICT const v, __m128i* const R, __m128i* const G, __m128i* const B) { const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u), @@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE41(const uint8_t* const y, } // Convert 32 samples of YUV420 to R/G/B -static void YUV420ToRGB_SSE41(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, +static void YUV420ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y, + const uint8_t* WEBP_RESTRICT const u, + const uint8_t* WEBP_RESTRICT const v, __m128i* const R, __m128i* const G, __m128i* const B) { const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u), @@ -109,7 +109,7 @@ static void YUV420ToRGB_SSE41(const uint8_t* const y, static WEBP_INLINE void PlanarTo24b_SSE41( __m128i* const in0, __m128i* const in1, __m128i* const in2, __m128i* const in3, __m128i* const in4, __m128i* const in5, - uint8_t* const rgb) { + uint8_t* WEBP_RESTRICT const rgb) { // The input is 6 registers of sixteen 8b but for the sake of explanation, // let's take 6 registers of four 8b values. // To pack, we will keep taking one every two 8b integer and move it @@ -132,8 +132,10 @@ static WEBP_INLINE void PlanarTo24b_SSE41( _mm_storeu_si128((__m128i*)(rgb + 80), *in5); } -void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; @@ -154,8 +156,10 @@ void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); } -void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; @@ -179,9 +183,10 @@ void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, //----------------------------------------------------------------------------- // Arbitrary-length row conversion functions -static void YuvToRgbRow_SSE41(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToRgbRow_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; @@ -216,9 +221,10 @@ static void YuvToRgbRow_SSE41(const uint8_t* y, } } -static void YuvToBgrRow_SSE41(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToBgrRow_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; @@ -290,7 +296,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) { // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... 
// Similar to PlanarTo24bHelper(), but in reverse order. static WEBP_INLINE void RGB24PackedToPlanar_SSE41( - const uint8_t* const rgb, __m128i* const out /*out[6]*/) { + const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) { const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0)); const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16)); const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32)); @@ -334,7 +340,7 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE41( // Convert 8 packed ARGB to r[], g[], b[] static WEBP_INLINE void RGB32PackedToPlanar_SSE41( - const uint32_t* const argb, __m128i* const rgb /*in[6]*/) { + const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) { const __m128i zero = _mm_setzero_si128(); __m128i a0 = LOAD_16(argb + 0); __m128i a1 = LOAD_16(argb + 4); @@ -407,7 +413,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R, #undef MK_CST_16 #undef TRANSFORM -static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; rgb += 3 * 16 * 2) { @@ -441,7 +448,8 @@ static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) { } } -static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; bgr += 3 * 16 * 2) { @@ -475,7 +483,8 @@ static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) { } } -static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~15; int i; for (i = 0; i < max_width; i += 16) { @@ -503,8 +512,9 @@ static void HorizontalAddPack_SSE41(const __m128i* const A, *out = _mm_packs_epi32(C, D); } -static void ConvertARGBToUV_SSE41(const uint32_t* argb, - uint8_t* u, uint8_t* v, +static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int src_width, int do_store) { const int max_width = src_width & ~31; int i; @@ -540,7 +550,7 @@ static void ConvertARGBToUV_SSE41(const uint32_t* argb, // Convert 16 packed ARGB 16b-values to r[], g[], b[] static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41( - const uint16_t* const rgbx, + const uint16_t* WEBP_RESTRICT const rgbx, __m128i* const r, __m128i* const g, __m128i* const b) { const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x @@ -570,8 +580,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41( *b = _mm_unpackhi_epi64(B1, B3); } -static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width) { +static void ConvertRGBA32ToUV_SSE41(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width) { const int max_width = width & ~15; const uint16_t* const last_rgb = rgb + 4 * max_width; while (rgb < last_rgb) {
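
Background note on the qualifier used throughout this series: WEBP_RESTRICT is defined by libwebp's headers to map to the compiler's restrict support (e.g. __restrict) when available, so annotating the non-vector pointer parameters tells the optimizer the rows do not alias. The standalone sketch below is illustrative only and is not part of the patches above; the file and function names (restrict_sketch.c, AddRows_*) are made up for the example, and the local WEBP_RESTRICT definition is a stand-in for the real one. It shows the kind of per-row routine these changes target: without restrict the compiler has to assume a store through dst may modify what src points at (uint8_t data can alias anything), forcing conservative reloads and hurting vectorization; with restrict the rows are guaranteed disjoint and the loop can be turned into straight SIMD loads and stores.

/* restrict_sketch.c -- illustrative only, not part of libwebp.
 * Build check: cc -O2 -c restrict_sketch.c
 * Stand-in for the WEBP_RESTRICT definition provided by libwebp's headers. */
#include <stdint.h>

#if defined(__GNUC__) || defined(_MSC_VER)
#define WEBP_RESTRICT __restrict
#else
#define WEBP_RESTRICT
#endif

/* Without restrict: each store to dst[i] could alias src, so the compiler
 * must re-read src[i] after every store and vectorizes conservatively. */
void AddRows_NoRestrict(const uint8_t* src, uint8_t* dst, int len) {
  int i;
  for (i = 0; i < len; ++i) {
    const int v = dst[i] + src[i];
    dst[i] = (v > 255) ? 255 : (uint8_t)v;   /* saturating add of two rows */
  }
}

/* With restrict-qualified pointers: src and dst are promised not to overlap,
 * so values can stay in registers and the loop maps cleanly onto SIMD. */
void AddRows_Restrict(const uint8_t* WEBP_RESTRICT src,
                      uint8_t* WEBP_RESTRICT dst, int len) {
  int i;
  for (i = 0; i < len; ++i) {
    const int v = dst[i] + src[i];
    dst[i] = (v > 255) ? 255 : (uint8_t)v;
  }
}

Comparing the generated assembly of the two functions (e.g. with -O2 and the compilers named in the commit messages) reproduces, in miniature, the effect these patches aim for in the C fallbacks.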