Merge changes I07a7e36a,Ib29980f7,I2316122d,I2356e314,I32b53dd3, ... into main

* changes:
  dsp/yuv*: use WEBP_RESTRICT qualifier
  dsp/upsampling*: use WEBP_RESTRICT qualifier
  dsp/rescaler*: use WEBP_RESTRICT qualifier
  dsp/lossless*: use WEBP_RESTRICT qualifier
  dsp/filters*: use WEBP_RESTRICT qualifier
  dsp/enc*: use WEBP_RESTRICT qualifier
  dsp/dec*: use WEBP_RESTRICT qualifier
  dsp/cost*: use WEBP_RESTRICT qualifier
dsp/yuv*: use WEBP_RESTRICT qualifier
2025-07-01 00:24:29 +02:00 · 2024-10-03 17:01:02 +00:00 · 2024-10-02 14:55:15 -07:00 · 2024-10-02 14:55:15 -07:00 · 2024-10-02 14:55:14 -07:00 · 2024-10-02 14:55:14 -07:00
54 changed files with 1300 additions and 885 deletions
--- a/src/dsp/cost.c
+++ b/src/dsp/cost.c
@ -354,8 +354,8 @@ static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
  return cost;
 }
-static void SetResidualCoeffs_C(const int16_t* const coeffs,
+static void SetResidualCoeffs_C(const int16_t* WEBP_RESTRICT const coeffs,
-                                VP8Residual* const res) {
+                                VP8Residual* WEBP_RESTRICT const res) {
  int n;
  res->last = -1;
  assert(res->first == 0 || coeffs[0] == 0);
--- a/src/dsp/cost_mips32.c
+++ b/src/dsp/cost_mips32.c
@ -96,8 +96,8 @@ static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
  return cost;
 }
-static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
+static void SetResidualCoeffs_MIPS32(const int16_t* WEBP_RESTRICT const coeffs,
-                                     VP8Residual* const res) {
+                                     VP8Residual* WEBP_RESTRICT const res) {
  const int16_t* p_coeffs = (int16_t*)coeffs;
  int temp0, temp1, temp2, n, n1;
  assert(res->first == 0 || coeffs[0] == 0);
--- a/src/dsp/cost_neon.c
+++ b/src/dsp/cost_neon.c
@ -19,8 +19,8 @@
 static const uint8_t position[16] = { 1, 2,  3,  4,  5,  6,  7,  8,
                                      9, 10, 11, 12, 13, 14, 15, 16 };
-static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
+static void SetResidualCoeffs_NEON(const int16_t* WEBP_RESTRICT const coeffs,
-                                   VP8Residual* const res) {
+                                   VP8Residual* WEBP_RESTRICT const res) {
  const int16x8_t minus_one = vdupq_n_s16(-1);
  const int16x8_t coeffs_0 = vld1q_s16(coeffs);
  const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);
--- a/src/dsp/cost_sse2.c
+++ b/src/dsp/cost_sse2.c
@ -22,8 +22,8 @@
 //------------------------------------------------------------------------------
-static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
+static void SetResidualCoeffs_SSE2(const int16_t* WEBP_RESTRICT const coeffs,
-                                   VP8Residual* const res) {
+                                   VP8Residual* WEBP_RESTRICT const res) {
  const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
  // Use SSE2 to compare 16 values with a single instruction.
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -38,7 +38,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 } while (0)
 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformOne_C(const int16_t* in, uint8_t* dst) {
+static void TransformOne_C(const int16_t* WEBP_RESTRICT in,
                           uint8_t* WEBP_RESTRICT dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
@ -82,7 +83,8 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
 }
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_C(const int16_t* WEBP_RESTRICT in,
                           uint8_t* WEBP_RESTRICT dst) {
  const int a = in[0] + 4;
  const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@ -95,7 +97,8 @@ static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
 }
 #undef STORE2
-static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo_C(const int16_t* WEBP_RESTRICT in,
                           uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne_C(in, dst);
  if (do_two) {
    TransformOne_C(in + 16, dst + 4);
@ -103,13 +106,15 @@ static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
-static void TransformUV_C(const int16_t* in, uint8_t* dst) {
+static void TransformUV_C(const int16_t* WEBP_RESTRICT in,
                          uint8_t* WEBP_RESTRICT dst) {
  VP8Transform(in + 0 * 16, dst, 1);
  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }
 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformDC_C(const int16_t* in, uint8_t* dst) {
+static void TransformDC_C(const int16_t* WEBP_RESTRICT in,
                          uint8_t* WEBP_RESTRICT dst) {
  const int DC = in[0] + 4;
  int i, j;
  for (j = 0; j < 4; ++j) {
@ -120,7 +125,8 @@ static void TransformDC_C(const int16_t* in, uint8_t* dst) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
-static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
+static void TransformDCUV_C(const int16_t* WEBP_RESTRICT in,
                            uint8_t* WEBP_RESTRICT dst) {
  if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
  if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
  if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@ -133,7 +139,8 @@ static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
 // Paragraph 14.3
 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformWHT_C(const int16_t* in, int16_t* out) {
+static void TransformWHT_C(const int16_t* WEBP_RESTRICT in,
                           int16_t* WEBP_RESTRICT out) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {
@ -161,7 +168,7 @@ static void TransformWHT_C(const int16_t* in, int16_t* out) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
-void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+VP8WHT VP8TransformWHT;
 //------------------------------------------------------------------------------
 // Intra predictions
@ -661,32 +668,32 @@ static void HFilter16i_C(uint8_t* p, int stride,
 #if !WEBP_NEON_OMIT_C_CODE
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                       int thresh, int ithresh, int hev_thresh) {
+                       int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                       int thresh, int ithresh, int hev_thresh) {
+                       int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 #if !WEBP_NEON_OMIT_C_CODE
-static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                        int thresh, int ithresh, int hev_thresh) {
+                        int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                        int thresh, int ithresh, int hev_thresh) {
+                        int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
@ -694,8 +701,8 @@ static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
 //------------------------------------------------------------------------------
-static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
+static void DitherCombine8x8_C(const uint8_t* WEBP_RESTRICT dither,
-                               int dst_stride) {
+                               uint8_t* WEBP_RESTRICT dst, int dst_stride) {
  int i, j;
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) {
@ -730,8 +737,8 @@ VP8SimpleFilterFunc VP8SimpleHFilter16;
 VP8SimpleFilterFunc VP8SimpleVFilter16i;
 VP8SimpleFilterFunc VP8SimpleHFilter16i;
-void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
+void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
-                            int dst_stride);
+                            uint8_t* WEBP_RESTRICT dst, int dst_stride);
 extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8DspInitSSE2(void);
--- a/src/dsp/dec_mips32.c
+++ b/src/dsp/dec_mips32.c
@ -133,26 +133,26 @@ static void HFilter16(uint8_t* p, int stride,
 }
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                     int thresh, int ithresh, int hev_thresh) {
+                     int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                     int thresh, int ithresh, int hev_thresh) {
+                     int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                      int thresh, int ithresh, int hev_thresh) {
+                      int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                      int thresh, int ithresh, int hev_thresh) {
+                      int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
@ -215,7 +215,8 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  }
 }
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14;
@ -532,7 +533,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
  );
 }
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@ -21,7 +21,8 @@
 static const int kC1 = WEBP_TRANSFORM_AC3_C1;
 static const int kC2 = WEBP_TRANSFORM_AC3_C2;
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC(const int16_t* WEBP_RESTRICT in,
                        uint8_t* WEBP_RESTRICT dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
  __asm__ volatile (
@ -45,7 +46,8 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
  );
 }
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst) {
  const int a = in[0] + 4;
  int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@ -81,7 +83,8 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  );
 }
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
@ -148,7 +151,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
  );
 }
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
@ -434,14 +438,14 @@ static void HFilter16(uint8_t* p, int stride,
 }
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                     int thresh, int ithresh, int hev_thresh) {
+                     int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                     int thresh, int ithresh, int hev_thresh) {
+                     int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
@ -465,14 +469,14 @@ static void HFilter16i(uint8_t* p, int stride,
  }
 }
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                      int thresh, int ithresh, int hev_thresh) {
+                      int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                      int thresh, int ithresh, int hev_thresh) {
+                      int stride, int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
--- a/src/dsp/dec_msa.c
+++ b/src/dsp/dec_msa.c
@ -38,7 +38,8 @@
  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
 }
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst) {
  v8i16 input0, input1;
  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
  v4i32 res0, res1, res2, res3;
@ -65,14 +66,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
  }
 }
-static void TransformWHT(const int16_t* in, int16_t* out) {
+static void TransformWHT(const int16_t* WEBP_RESTRICT in,
                         int16_t* WEBP_RESTRICT out) {
  v8i16 input0, input1;
  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
@ -114,13 +117,15 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
  out[240] = __msa_copy_s_h(out1, 7);
 }
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC(const int16_t* WEBP_RESTRICT in,
                        uint8_t* WEBP_RESTRICT dst) {
  const int DC = (in[0] + 4) >> 3;
  const v8i16 tmp0 = __msa_fill_h(DC);
  ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
 }
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst) {
  const int a = in[0] + 4;
  const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@ -475,8 +480,8 @@ static void HFilter16i(uint8_t* src_y, int stride,
 }
 // 8-pixels wide variants, for chroma filtering
-static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
+static void VFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
-                     int b_limit_in, int limit_in, int thresh_in) {
+                     int stride, int b_limit_in, int limit_in, int thresh_in) {
  uint8_t* ptmp_src_u = src_u - 4 * stride;
  uint8_t* ptmp_src_v = src_v - 4 * stride;
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
@ -520,8 +525,8 @@ static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
  SD(q2_d, ptmp_src_v);
 }
-static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
+static void HFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
-                     int b_limit_in, int limit_in, int thresh_in) {
+                     int stride, int b_limit_in, int limit_in, int thresh_in) {
  uint8_t* ptmp_src_u = src_u - 4;
  uint8_t* ptmp_src_v = src_v - 4;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@ -556,7 +561,8 @@ static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
  ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
 }
-static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+static void VFilter8i(uint8_t* WEBP_RESTRICT src_u,
                      uint8_t* WEBP_RESTRICT src_v, int stride,
                      int b_limit_in, int limit_in, int thresh_in) {
  uint64_t p1_d, p0_d, q0_d, q1_d;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@ -587,7 +593,8 @@ static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
  SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
 }
-static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+static void HFilter8i(uint8_t* WEBP_RESTRICT src_u,
                      uint8_t* WEBP_RESTRICT src_v, int stride,
                      int b_limit_in, int limit_in, int thresh_in) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@ -916,8 +916,8 @@ static void HFilter16i_NEON(uint8_t* p, int stride,
 #endif  // !WORK_AROUND_GCC
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                          int thresh, int ithresh, int hev_thresh) {
+                          int stride, int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
@ -932,7 +932,8 @@ static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
 }
-static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                           int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
@ -949,8 +950,8 @@ static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
 }
 #if !defined(WORK_AROUND_GCC)
-static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                          int thresh, int ithresh, int hev_thresh) {
+                          int stride, int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
@ -964,7 +965,8 @@ static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
  }
 }
-static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                           int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
@ -1041,7 +1043,8 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  Transpose8x2_NEON(E0, E1, rows);
 }
-static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
@ -1051,7 +1054,8 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
 #else
-static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst) {
  const int kBPS = BPS;
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
@ -1184,14 +1188,16 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
 #endif    // WEBP_USE_INTRINSICS
-static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne_NEON(in, dst);
  if (do_two) {
    TransformOne_NEON(in + 16, dst + 4);
  }
 }
-static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
                             uint8_t* WEBP_RESTRICT dst) {
  const int16x8_t DC = vdupq_n_s16(in[0]);
  Add4x4_NEON(DC, DC, dst);
 }
@ -1205,7 +1211,8 @@ static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
 } while (0)
-static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
+static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
                              int16_t* WEBP_RESTRICT out) {
  int32x4x4_t tmp;
  {
@ -1256,7 +1263,8 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
 //------------------------------------------------------------------------------
-static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst) {
  const int16x4_t A = vld1_dup_s16(in);
  const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
  const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@ -30,7 +30,8 @@
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
-static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform_SSE2(const int16_t* WEBP_RESTRICT in,
                           uint8_t* WEBP_RESTRICT dst, int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -197,7 +198,8 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
 #if (USE_TRANSFORM_AC3 == 1)
-static void TransformAC3_SSE2(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_SSE2(const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst) {
  const __m128i A = _mm_set1_epi16(in[0] + 4);
  const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
  const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
@ -792,8 +794,8 @@ static void HFilter16i_SSE2(uint8_t* p, int stride,
 }
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                          int thresh, int ithresh, int hev_thresh) {
+                          int stride, int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, p2, p1, p0, q0, q1, q2;
@ -817,8 +819,8 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
  STOREUV(q2, u, v, 2 * stride);
 }
-static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
-                          int thresh, int ithresh, int hev_thresh) {
+                          int stride, int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
@ -837,7 +839,8 @@ static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
  Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
 }
-static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                           int stride,
                           int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
@ -863,7 +866,8 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
  STOREUV(q1, u, v, 1 * stride);
 }
-static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                           int stride,
                           int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -60,53 +60,66 @@ extern "C" {
 // Transforms
 // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
 //          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
-typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+typedef void (*VP8Idct)(const uint8_t* WEBP_RESTRICT ref,
-                        int do_two);
+                        const int16_t* WEBP_RESTRICT in,
-typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
+                        uint8_t* WEBP_RESTRICT dst, int do_two);
-typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
+typedef void (*VP8Fdct)(const uint8_t* WEBP_RESTRICT src,
                        const uint8_t* WEBP_RESTRICT ref,
                        int16_t* WEBP_RESTRICT out);
 typedef void (*VP8WHT)(const int16_t* WEBP_RESTRICT in,
                       int16_t* WEBP_RESTRICT out);
 extern VP8Idct VP8ITransform;
 extern VP8Fdct VP8FTransform;
 extern VP8Fdct VP8FTransform2;   // performs two transforms at a time
 extern VP8WHT VP8FTransformWHT;
 // Predictions
 // *dst is the destination block. *top and *left can be NULL.
-typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
+typedef void (*VP8IntraPreds)(uint8_t* WEBP_RESTRICT dst,
-                              const uint8_t* top);
+                              const uint8_t* WEBP_RESTRICT left,
-typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top);
+                              const uint8_t* WEBP_RESTRICT top);
 typedef void (*VP8Intra4Preds)(uint8_t* WEBP_RESTRICT dst,
                               const uint8_t* WEBP_RESTRICT top);
 extern VP8Intra4Preds VP8EncPredLuma4;
 extern VP8IntraPreds VP8EncPredLuma16;
 extern VP8IntraPreds VP8EncPredChroma8;
-typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
+typedef int (*VP8Metric)(const uint8_t* WEBP_RESTRICT pix,
                         const uint8_t* WEBP_RESTRICT ref);
 extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
-typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
+typedef int (*VP8WMetric)(const uint8_t* WEBP_RESTRICT pix,
-                          const uint16_t* const weights);
+                          const uint8_t* WEBP_RESTRICT ref,
                          const uint16_t* WEBP_RESTRICT const weights);
 // The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major
 // 4 by 4 symmetric matrix.
 extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
 // Compute the average (DC) of four 4x4 blocks.
 // Each sub-4x4 block #i sum is stored in dc[i].
-typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]);
+typedef void (*VP8MeanMetric)(const uint8_t* WEBP_RESTRICT ref,
                              uint32_t dc[4]);
 extern VP8MeanMetric VP8Mean16x4;
-typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
+typedef void (*VP8BlockCopy)(const uint8_t* WEBP_RESTRICT src,
                             uint8_t* WEBP_RESTRICT dst);
 extern VP8BlockCopy VP8Copy4x4;
 extern VP8BlockCopy VP8Copy16x8;
 // Quantization
 struct VP8Matrix;   // forward declaration
-typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
+typedef int (*VP8QuantizeBlock)(
-                                const struct VP8Matrix* const mtx);
+    int16_t in[16], int16_t out[16],
    const struct VP8Matrix* WEBP_RESTRICT const mtx);
 // Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
-typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32],
+typedef int (*VP8Quantize2Blocks)(
-                                  const struct VP8Matrix* const mtx);
+    int16_t in[32], int16_t out[32],
    const struct VP8Matrix* WEBP_RESTRICT const mtx);
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
 extern VP8Quantize2Blocks VP8EncQuantize2Blocks;
 // specific to 2nd transform:
-typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
+typedef int (*VP8QuantizeBlockWHT)(
-                                   const struct VP8Matrix* const mtx);
+    int16_t in[16], int16_t out[16],
    const struct VP8Matrix* WEBP_RESTRICT const mtx);
 extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 extern const int VP8DspScan[16 + 4 + 4];
@ -118,9 +131,10 @@ typedef struct {
  int max_value;
  int last_non_zero;
 } VP8Histogram;
-typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+typedef void (*VP8CHisto)(const uint8_t* WEBP_RESTRICT ref,
                          const uint8_t* WEBP_RESTRICT pred,
                          int start_block, int end_block,
-                          VP8Histogram* const histo);
+                          VP8Histogram* WEBP_RESTRICT const histo);
 extern VP8CHisto VP8CollectHistogram;
 // General-purpose util function to help VP8CollectHistogram().
 void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
@ -138,8 +152,9 @@ extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1];
 extern const uint8_t VP8EncBands[16 + 1];
 struct VP8Residual;
-typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
+typedef void (*VP8SetResidualCoeffsFunc)(
-                                         struct VP8Residual* const res);
+    const int16_t* WEBP_RESTRICT const coeffs,
    struct VP8Residual* WEBP_RESTRICT const res);
 extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
 // Cost calculation function.
@ -193,9 +208,11 @@ void VP8SSIMDspInit(void);
 //------------------------------------------------------------------------------
 // Decoding
-typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
+typedef void (*VP8DecIdct)(const int16_t* WEBP_RESTRICT coeffs,
                           uint8_t* WEBP_RESTRICT dst);
 // when doing two transforms, coeffs is actually int16_t[2][16].
-typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
+typedef void (*VP8DecIdct2)(const int16_t* WEBP_RESTRICT coeffs,
                            uint8_t* WEBP_RESTRICT dst, int do_two);
 extern VP8DecIdct2 VP8Transform;
 extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformUV;
@ -233,7 +250,8 @@ extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
 // regular filter (on both macroblock edges and inner edges)
 typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
                                  int thresh, int ithresh, int hev_t);
-typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
+typedef void (*VP8ChromaFilterFunc)(uint8_t* WEBP_RESTRICT u,
                                    uint8_t* WEBP_RESTRICT v, int stride,
                                    int thresh, int ithresh, int hev_t);
 // on outer edge
 extern VP8LumaFilterFunc VP8VFilter16;
@ -253,8 +271,8 @@ extern VP8ChromaFilterFunc VP8HFilter8i;
 #define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1))
 #define VP8_DITHER_AMP_BITS 7
 #define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS)
-extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
+extern void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
-                                   int dst_stride);
+                                   uint8_t* WEBP_RESTRICT dst, int dst_stride);
 // must be called before anything using the above
 void VP8DspInit(void);
@ -267,10 +285,10 @@ void VP8DspInit(void);
 // Convert a pair of y/u/v lines together to the output rgb/a colorspace.
 // bottom_y can be NULL if only one line of output is needed (at top/bottom).
 typedef void (*WebPUpsampleLinePairFunc)(
-    const uint8_t* top_y, const uint8_t* bottom_y,
+    const uint8_t* WEBP_RESTRICT top_y, const uint8_t* WEBP_RESTRICT bottom_y,
-    const uint8_t* top_u, const uint8_t* top_v,
+    const uint8_t* WEBP_RESTRICT top_u, const uint8_t* WEBP_RESTRICT top_v,
-    const uint8_t* cur_u, const uint8_t* cur_v,
+    const uint8_t* WEBP_RESTRICT cur_u, const uint8_t* WEBP_RESTRICT cur_v,
-    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+    uint8_t* WEBP_RESTRICT top_dst, uint8_t* WEBP_RESTRICT bottom_dst, int len);
 #ifdef FANCY_UPSAMPLING
@ -280,13 +298,15 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 #endif    // FANCY_UPSAMPLING
 // Per-row point-sampling methods.
-typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
+typedef void (*WebPSamplerRowFunc)(const uint8_t* WEBP_RESTRICT y,
-                                   const uint8_t* u, const uint8_t* v,
+                                   const uint8_t* WEBP_RESTRICT u,
-                                   uint8_t* dst, int len);
+                                   const uint8_t* WEBP_RESTRICT v,
                                   uint8_t* WEBP_RESTRICT dst, int len);
 // Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
-void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
+void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
-                             const uint8_t* u, const uint8_t* v, int uv_stride,
+                             const uint8_t* WEBP_RESTRICT u,
-                             uint8_t* dst, int dst_stride,
+                             const uint8_t* WEBP_RESTRICT v, int uv_stride,
                             uint8_t* WEBP_RESTRICT dst, int dst_stride,
                             int width, int height, WebPSamplerRowFunc func);
 // Sampling functions to convert rows of YUV to RGB(A)
@ -298,9 +318,10 @@ extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
 WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
 // YUV444->RGB converters
-typedef void (*WebPYUV444Converter)(const uint8_t* y,
+typedef void (*WebPYUV444Converter)(const uint8_t* WEBP_RESTRICT y,
-                                    const uint8_t* u, const uint8_t* v,
+                                    const uint8_t* WEBP_RESTRICT u,
-                                    uint8_t* dst, int len);
+                                    const uint8_t* WEBP_RESTRICT v,
                                    uint8_t* WEBP_RESTRICT dst, int len);
 extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
@ -316,26 +337,35 @@ void WebPInitYUV444Converters(void);
 // ARGB -> YUV converters
 // Convert ARGB samples to luma Y.
-extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
+extern void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
                                  uint8_t* WEBP_RESTRICT y, int width);
 // Convert ARGB samples to U/V with downsampling. do_store should be '1' for
 // even lines and '0' for odd ones. 'src_width' is the original width, not
 // the U/V one.
-extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+extern void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
                                   uint8_t* WEBP_RESTRICT u,
                                   uint8_t* WEBP_RESTRICT v,
                                   int src_width, int do_store);
 // Convert a row of accumulated (four-values) of rgba32 toward U/V
-extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
+extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
-                                     uint8_t* u, uint8_t* v, int width);
+                                     uint8_t* WEBP_RESTRICT u,
                                     uint8_t* WEBP_RESTRICT v, int width);
 // Convert RGB or BGR to Y
-extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
+extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
-extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
+                                   uint8_t* WEBP_RESTRICT y, int width);
 extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
                                   uint8_t* WEBP_RESTRICT y, int width);
 // used for plain-C fallback.
-extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
                                  uint8_t* WEBP_RESTRICT u,
                                  uint8_t* WEBP_RESTRICT v,
                                  int src_width, int do_store);
-extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
+extern void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
-                                    uint8_t* u, uint8_t* v, int width);
+                                    uint8_t* WEBP_RESTRICT u,
                                    uint8_t* WEBP_RESTRICT v, int width);
 // Must be called before using the above.
 void WebPInitConvertARGBToYUV(void);
@ -348,8 +378,9 @@ struct WebPRescaler;
 // Import a row of data and save its contribution in the rescaler.
 // 'channel' denotes the channel number to be imported. 'Expand' corresponds to
 // the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
-typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk,
+typedef void (*WebPRescalerImportRowFunc)(
-                                          const uint8_t* src);
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
    const uint8_t* WEBP_RESTRICT src);
 extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
 extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
@ -362,16 +393,19 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
 extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
 // Plain-C implementation, as fall-back.
-extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
+extern void WebPRescalerImportRowExpand_C(
-                                          const uint8_t* src);
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
-extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
+    const uint8_t* WEBP_RESTRICT src);
-                                          const uint8_t* src);
+extern void WebPRescalerImportRowShrink_C(
    struct WebPRescaler* WEBP_RESTRICT const wrk,
    const uint8_t* WEBP_RESTRICT src);
 extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
 extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
 // Main entry calls:
-extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
+extern void WebPRescalerImportRow(
-                                  const uint8_t* src);
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
    const uint8_t* WEBP_RESTRICT src);
 // Export one row (starting at x_out position) from rescaler.
 extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);
@ -480,8 +514,9 @@ typedef enum {     // Filter types.
  WEBP_FILTER_FAST
 } WEBP_FILTER_TYPE;
-typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
+typedef void (*WebPFilterFunc)(const uint8_t* WEBP_RESTRICT in,
-                               int stride, uint8_t* out);
+                               int width, int height, int stride,
                               uint8_t* WEBP_RESTRICT out);
 // In-place un-filtering.
 // Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'.
 typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds,
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -59,9 +59,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
 }
 #if !WEBP_NEON_OMIT_C_CODE
-static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_C(const uint8_t* WEBP_RESTRICT ref,
                               const uint8_t* WEBP_RESTRICT pred,
                               int start_block, int end_block,
-                               VP8Histogram* const histo) {
+                               VP8Histogram* WEBP_RESTRICT const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
@ -109,8 +110,9 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
 #define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
-                                      uint8_t* dst) {
+                                      const int16_t* WEBP_RESTRICT in,
                                      uint8_t* WEBP_RESTRICT dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
@ -146,7 +148,9 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  }
 }
-static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+static void ITransform_C(const uint8_t* WEBP_RESTRICT ref,
                         const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst,
                         int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
@ -154,7 +158,9 @@ static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
  }
 }
-static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_C(const uint8_t* WEBP_RESTRICT src,
                         const uint8_t* WEBP_RESTRICT ref,
                         int16_t* WEBP_RESTRICT out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@ -184,14 +190,16 @@ static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
-static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
+static void FTransform2_C(const uint8_t* WEBP_RESTRICT src,
-                          int16_t* out) {
+                          const uint8_t* WEBP_RESTRICT ref,
                          int16_t* WEBP_RESTRICT out) {
  VP8FTransform(src, ref, out);
  VP8FTransform(src + 4, ref + 4, out + 16);
 }
 #if !WEBP_NEON_OMIT_C_CODE
-static void FTransformWHT_C(const int16_t* in, int16_t* out) {
+static void FTransformWHT_C(const int16_t* WEBP_RESTRICT in,
                            int16_t* WEBP_RESTRICT out) {
  // input is 12b signed
  int32_t tmp[16];
  int i;
@ -234,8 +242,9 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  }
 }
-static WEBP_INLINE void VerticalPred(uint8_t* dst,
+static WEBP_INLINE void VerticalPred(uint8_t* WEBP_RESTRICT dst,
-                                     const uint8_t* top, int size) {
+                                     const uint8_t* WEBP_RESTRICT top,
                                     int size) {
  int j;
  if (top != NULL) {
    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
@ -244,8 +253,9 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst,
  }
 }
-static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+static WEBP_INLINE void HorizontalPred(uint8_t* WEBP_RESTRICT dst,
-                                       const uint8_t* left, int size) {
+                                       const uint8_t* WEBP_RESTRICT left,
                                       int size) {
  if (left != NULL) {
    int j;
    for (j = 0; j < size; ++j) {
@ -256,8 +266,9 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst,
  }
 }
-static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void TrueMotion(uint8_t* WEBP_RESTRICT dst,
-                                   const uint8_t* top, int size) {
+                                   const uint8_t* WEBP_RESTRICT left,
                                   const uint8_t* WEBP_RESTRICT top, int size) {
  int y;
  if (left != NULL) {
    if (top != NULL) {
@ -286,8 +297,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
  }
 }
-static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void DCMode(uint8_t* WEBP_RESTRICT dst,
-                               const uint8_t* top,
+                               const uint8_t* WEBP_RESTRICT left,
                               const uint8_t* WEBP_RESTRICT top,
                               int size, int round, int shift) {
  int DC = 0;
  int j;
@ -312,8 +324,9 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
-static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
+static void IntraChromaPreds_C(uint8_t* WEBP_RESTRICT dst,
-                               const uint8_t* top) {
+                               const uint8_t* WEBP_RESTRICT left,
                               const uint8_t* WEBP_RESTRICT top) {
  // U block
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
@ -333,8 +346,9 @@ static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
 // luma 16x16 prediction (paragraph 12.3)
 #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
-static void Intra16Preds_C(uint8_t* dst,
+static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst,
-                           const uint8_t* left, const uint8_t* top) {
+                           const uint8_t* WEBP_RESTRICT left,
                           const uint8_t* WEBP_RESTRICT top) {
  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
  VerticalPred(I16VE16 + dst, top, 16);
  HorizontalPred(I16HE16 + dst, left, 16);
@ -352,7 +366,8 @@ static void Intra16Preds_C(uint8_t* dst,
 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
-static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+// vertical
 static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const uint8_t vals[4] = {
    AVG3(top[-1], top[0], top[1]),
    AVG3(top[ 0], top[1], top[2]),
@ -365,7 +380,8 @@ static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
  }
 }
-static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+// horizontal
 static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -377,14 +393,14 @@ static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }
-static void DC4(uint8_t* dst, const uint8_t* top) {
+static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
  Fill(dst, dc >> 3, 4);
 }
-static void RD4(uint8_t* dst, const uint8_t* top) {
+static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -403,7 +419,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
  DST(3, 0)                                     = AVG3(D, C, B);
 }
-static void LD4(uint8_t* dst, const uint8_t* top) {
+static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
@ -421,7 +437,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
  DST(3, 3)                                     = AVG3(G, H, H);
 }
-static void VR4(uint8_t* dst, const uint8_t* top) {
+static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -443,7 +459,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
  DST(3, 1) =             AVG3(B, C, D);
 }
-static void VL4(uint8_t* dst, const uint8_t* top) {
+static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
@ -465,7 +481,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
              DST(3, 3) = AVG3(F, G, H);
 }
-static void HU4(uint8_t* dst, const uint8_t* top) {
+static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
@ -480,7 +496,7 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
-static void HD4(uint8_t* dst, const uint8_t* top) {
+static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -503,7 +519,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
  DST(1, 3)             = AVG3(L, K, J);
 }
-static void TM4(uint8_t* dst, const uint8_t* top) {
+static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int x, y;
  const uint8_t* const clip = clip1 + 255 - top[-1];
  for (y = 0; y < 4; ++y) {
@ -521,7 +537,8 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst,
                          const uint8_t* WEBP_RESTRICT top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@ -540,7 +557,8 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
 // Metric
 #if !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
+static WEBP_INLINE int GetSSE(const uint8_t* WEBP_RESTRICT a,
                              const uint8_t* WEBP_RESTRICT b,
                              int w, int h) {
  int count = 0;
  int y, x;
@ -555,21 +573,25 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
  return count;
 }
-static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_C(const uint8_t* WEBP_RESTRICT a,
                      const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 16, 16);
 }
-static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_C(const uint8_t* WEBP_RESTRICT a,
                     const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 16, 8);
 }
-static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_C(const uint8_t* WEBP_RESTRICT a,
                    const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 8, 8);
 }
-static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_C(const uint8_t* WEBP_RESTRICT a,
                    const uint8_t* WEBP_RESTRICT b) {
  return GetSSE(a, b, 4, 4);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
-static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_C(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
  int k, x, y;
  for (k = 0; k < 4; ++k) {
    uint32_t avg = 0;
@ -593,7 +615,8 @@ static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* in, const uint16_t* w) {
+static int TTransform(const uint8_t* WEBP_RESTRICT in,
                      const uint16_t* WEBP_RESTRICT w) {
  int sum = 0;
  int tmp[16];
  int i;
@ -627,15 +650,17 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
  return sum;
 }
-static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
+static int Disto4x4_C(const uint8_t* WEBP_RESTRICT const a,
-                      const uint16_t* const w) {
+                      const uint8_t* WEBP_RESTRICT const b,
                      const uint16_t* WEBP_RESTRICT const w) {
  const int sum1 = TTransform(a, w);
  const int sum2 = TTransform(b, w);
  return abs(sum2 - sum1) >> 5;
 }
-static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
+static int Disto16x16_C(const uint8_t* WEBP_RESTRICT const a,
-                        const uint16_t* const w) {
+                        const uint8_t* WEBP_RESTRICT const b,
                        const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -658,7 +683,7 @@ static const uint8_t kZigzag[16] = {
 // Simple quantization
 static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
-                           const VP8Matrix* const mtx) {
+                           const VP8Matrix* WEBP_RESTRICT const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
@ -684,7 +709,7 @@ static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
 }
 static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
-                             const VP8Matrix* const mtx) {
+                             const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -695,7 +720,8 @@ static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
 //------------------------------------------------------------------------------
 // Block copy
-static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
+static WEBP_INLINE void Copy(const uint8_t* WEBP_RESTRICT src,
                             uint8_t* WEBP_RESTRICT dst, int w, int h) {
  int y;
  for (y = 0; y < h; ++y) {
    memcpy(dst, src, w);
@ -704,11 +730,13 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
  }
 }
-static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
+static void Copy4x4_C(const uint8_t* WEBP_RESTRICT src,
                      uint8_t* WEBP_RESTRICT dst) {
  Copy(src, dst, 4, 4);
 }
-static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
+static void Copy16x8_C(const uint8_t* WEBP_RESTRICT src,
                       uint8_t* WEBP_RESTRICT dst) {
  Copy(src, dst, 16, 8);
 }
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@ -109,9 +109,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2;
  "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
 // Does one or two inverse transforms.
-static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
+static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* WEBP_RESTRICT ref,
-                                             const int16_t* in,
+                                             const int16_t* WEBP_RESTRICT in,
-                                             uint8_t* dst) {
+                                             uint8_t* WEBP_RESTRICT dst) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
  int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@ -141,8 +141,9 @@ static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
  );
 }
-static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
+static void ITransform_MIPS32(const uint8_t* WEBP_RESTRICT ref,
-                              uint8_t* dst, int do_two) {
+                              const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne_MIPS32(ref, in, dst);
  if (do_two) {
    ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
@ -236,7 +237,7 @@ static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
 }
 static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
-                                  const VP8Matrix* const mtx) {
+                                  const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -358,8 +359,9 @@ static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
  "msub   %[temp6],  %[temp0]                \n\t"                \
  "msub   %[temp7],  %[temp1]                \n\t"
-static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
+static int Disto4x4_MIPS32(const uint8_t* WEBP_RESTRICT const a,
-                           const uint16_t* const w) {
+                           const uint8_t* WEBP_RESTRICT const b,
                           const uint16_t* WEBP_RESTRICT const w) {
  int tmp[32];
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -393,8 +395,9 @@ static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS
-static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
+static int Disto16x16_MIPS32(const uint8_t* WEBP_RESTRICT const a,
-                             const uint16_t* const w) {
+                             const uint8_t* WEBP_RESTRICT const b,
                             const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -475,8 +478,9 @@ static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"
-static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
+static void FTransform_MIPS32(const uint8_t* WEBP_RESTRICT src,
-                              int16_t* out) {
+                              const uint8_t* WEBP_RESTRICT ref,
                              int16_t* WEBP_RESTRICT out) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
  int temp17, temp18, temp19, temp20;
@ -537,7 +541,8 @@ static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
  GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
  GET_SSE_INNER(D, D + 1, D + 2, D + 3)
-static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPS32(const uint8_t* WEBP_RESTRICT a,
                           const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -571,7 +576,8 @@ static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
 }
-static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPS32(const uint8_t* WEBP_RESTRICT a,
                          const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -597,7 +603,8 @@ static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
 }
-static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPS32(const uint8_t* WEBP_RESTRICT a,
                         const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -619,7 +626,8 @@ static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
  return count;
 }
-static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPS32(const uint8_t* WEBP_RESTRICT a,
                         const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
@ -141,8 +141,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2;
  "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
-static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
+static void FTransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
-                                 int16_t* out) {
+                                 const uint8_t* WEBP_RESTRICT ref,
                                 int16_t* WEBP_RESTRICT out) {
  const int c2217 = 2217;
  const int c5352 = 5352;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -171,8 +172,9 @@ static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
-                                      uint8_t* dst) {
+                                      const int16_t* WEBP_RESTRICT in,
                                      uint8_t* WEBP_RESTRICT dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
@ -239,16 +241,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  );
 }
-static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
+static void ITransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT ref,
-                                 uint8_t* dst, int do_two) {
+                                 const int16_t* WEBP_RESTRICT in,
                                 uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }
-static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
+static int Disto4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
-                              const uint16_t* const w) {
+                              const uint8_t* WEBP_RESTRICT const b,
                              const uint16_t* WEBP_RESTRICT const w) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
@ -314,9 +318,9 @@ static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
  return abs(temp3 - temp17) >> 5;
 }
-static int Disto16x16_MIPSdspR2(const uint8_t* const a,
+static int Disto16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
-                                const uint8_t* const b,
+                                const uint8_t* WEBP_RESTRICT const b,
-                                const uint16_t* const w) {
+                                const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -367,8 +371,8 @@ static int Disto16x16_MIPSdspR2(const uint8_t* const a,
 } while (0)
 #define VERTICAL_PRED(DST, TOP, SIZE)                                          \
-static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST),                     \
+static WEBP_INLINE void VerticalPred##SIZE(                                    \
-                                           const uint8_t* (TOP)) {             \
+    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (TOP)) {        \
  int j;                                                                       \
  if ((TOP)) {                                                                 \
    for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE));       \
@ -383,8 +387,8 @@ VERTICAL_PRED(dst, top, 16)
 #undef VERTICAL_PRED
 #define HORIZONTAL_PRED(DST, LEFT, SIZE)                                       \
-static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST),                   \
+static WEBP_INLINE void HorizontalPred##SIZE(                                  \
-                                             const uint8_t* (LEFT)) {          \
+    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (LEFT)) {       \
  if (LEFT) {                                                                  \
    int j;                                                                     \
    for (j = 0; j < (SIZE); ++j) {                                             \
@ -451,8 +455,9 @@ HORIZONTAL_PRED(dst, left, 16)
 } while (0)
 #define TRUE_MOTION(DST, LEFT, TOP, SIZE)                                      \
-static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\
+static WEBP_INLINE void TrueMotion##SIZE(uint8_t* WEBP_RESTRICT (DST),         \
-                                         const uint8_t* (TOP)) {               \
+                                         const uint8_t* WEBP_RESTRICT (LEFT),  \
                                         const uint8_t* WEBP_RESTRICT (TOP)) { \
  if ((LEFT) != NULL) {                                                        \
    if ((TOP) != NULL) {                                                       \
      CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE));                               \
@ -480,8 +485,9 @@ TRUE_MOTION(dst, left, top, 16)
 #undef CLIP_8B_TO_DST
 #undef CLIPPING
-static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void DCMode16(uint8_t* WEBP_RESTRICT dst,
-                                 const uint8_t* top) {
+                                 const uint8_t* WEBP_RESTRICT left,
                                 const uint8_t* WEBP_RESTRICT top) {
  int DC, DC1;
  int temp0, temp1, temp2, temp3;
@ -543,8 +549,9 @@ static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
  FILL_8_OR_16(dst, DC, 16);
 }
-static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void DCMode8(uint8_t* WEBP_RESTRICT dst,
-                                const uint8_t* top) {
+                                const uint8_t* WEBP_RESTRICT left,
                                const uint8_t* WEBP_RESTRICT top) {
  int DC, DC1;
  int temp0, temp1, temp2, temp3;
@ -588,7 +595,7 @@ static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
  FILL_8_OR_16(dst, DC, 8);
 }
-static void DC4(uint8_t* dst, const uint8_t* top) {
+static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1;
  __asm__ volatile(
    "ulw          %[temp0],   0(%[top])               \n\t"
@ -609,7 +616,7 @@ static void DC4(uint8_t* dst, const uint8_t* top) {
  );
 }
-static void TM4(uint8_t* dst, const uint8_t* top) {
+static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
  const int c35 = 0xff00ff;
  __asm__ volatile (
@ -664,7 +671,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
  );
 }
-static void VE4(uint8_t* dst, const uint8_t* top) {
+static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile(
    "ulw             %[temp0],   -1(%[top])              \n\t"
@ -695,7 +702,7 @@ static void VE4(uint8_t* dst, const uint8_t* top) {
  );
 }
-static void HE4(uint8_t* dst, const uint8_t* top) {
+static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile(
    "ulw             %[temp0],   -4(%[top])              \n\t"
@ -731,7 +738,7 @@ static void HE4(uint8_t* dst, const uint8_t* top) {
  );
 }
-static void RD4(uint8_t* dst, const uint8_t* top) {
+static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int temp6, temp7, temp8, temp9, temp10, temp11;
  __asm__ volatile(
@ -780,7 +787,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
  );
 }
-static void VR4(uint8_t* dst, const uint8_t* top) {
+static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
@ -830,7 +837,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
  );
 }
-static void LD4(uint8_t* dst, const uint8_t* top) {
+static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int temp6, temp7, temp8, temp9, temp10, temp11;
  __asm__ volatile(
@ -877,7 +884,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
  );
 }
-static void VL4(uint8_t* dst, const uint8_t* top) {
+static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
@ -926,7 +933,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
  );
 }
-static void HD4(uint8_t* dst, const uint8_t* top) {
+static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
@ -974,7 +981,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
  );
 }
-static void HU4(uint8_t* dst, const uint8_t* top) {
+static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  __asm__ volatile (
    "ulw             %[temp0],   -5(%[top])              \n\t"
@ -1013,8 +1020,9 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
-static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
+static void IntraChromaPreds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
-                                       const uint8_t* top) {
+                                       const uint8_t* WEBP_RESTRICT left,
                                       const uint8_t* WEBP_RESTRICT top) {
  // U block
  DCMode8(C8DC8 + dst, left, top);
  VerticalPred8(C8VE8 + dst, top);
@ -1033,8 +1041,9 @@ static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
-static void Intra16Preds_MIPSdspR2(uint8_t* dst,
+static void Intra16Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
-                                   const uint8_t* left, const uint8_t* top) {
+                                   const uint8_t* WEBP_RESTRICT left,
                                   const uint8_t* WEBP_RESTRICT top) {
  DCMode16(I16DC16 + dst, left, top);
  VerticalPred16(I16VE16 + dst, top);
  HorizontalPred16(I16HE16 + dst, left);
@ -1043,7 +1052,8 @@ static void Intra16Preds_MIPSdspR2(uint8_t* dst,
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
                                  const uint8_t* WEBP_RESTRICT top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@ -1079,7 +1089,8 @@ static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
  GET_SSE_INNER(C)                        \
  GET_SSE_INNER(D)
-static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                              const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1109,7 +1120,8 @@ static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
 }
-static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                             const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1131,7 +1143,8 @@ static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
 }
-static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                            const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1149,7 +1162,8 @@ static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
  return count;
 }
-static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                            const uint8_t* WEBP_RESTRICT b) {
  int count;
  int temp0, temp1, temp2, temp3;
  __asm__ volatile (
@ -1273,7 +1287,7 @@ static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
 "3:                                                          \n\t"
 static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
-                                   const VP8Matrix* const mtx) {
+                                   const VP8Matrix* WEBP_RESTRICT const mtx) {
  int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
  int sign, coeff, level;
  int max_level = MAX_LEVEL;
@ -1314,7 +1328,7 @@ static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
 }
 static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
-                                     const VP8Matrix* const mtx) {
+                                     const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -1360,7 +1374,8 @@ static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
  "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
  "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"
-static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MIPSdspR2(const int16_t* WEBP_RESTRICT in,
                                    int16_t* WEBP_RESTRICT out) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
--- a/src/dsp/enc_msa.c
+++ b/src/dsp/enc_msa.c
@ -41,8 +41,9 @@
  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);      \
 } while (0)
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
-                                      uint8_t* dst) {
+                                      const int16_t* WEBP_RESTRICT in,
                                      uint8_t* WEBP_RESTRICT dst) {
  v8i16 input0, input1;
  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
  v4i32 res0, res1, res2, res3;
@ -69,16 +70,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }
-static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+static void ITransform_MSA(const uint8_t* WEBP_RESTRICT ref,
-                           int do_two) {
+                           const int16_t* WEBP_RESTRICT in,
                           uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
 }
-static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
+static void FTransform_MSA(const uint8_t* WEBP_RESTRICT src,
-                           int16_t* out) {
+                           const uint8_t* WEBP_RESTRICT ref,
                           int16_t* WEBP_RESTRICT out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
@ -131,7 +134,8 @@ static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
  SD4(out0, out1, out2, out3, out, 8);
 }
-static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MSA(const int16_t* WEBP_RESTRICT in,
                              int16_t* WEBP_RESTRICT out) {
  v8i16 in0 = { 0 };
  v8i16 in1 = { 0 };
  v8i16 tmp0, tmp1, tmp2, tmp3;
@ -168,7 +172,8 @@ static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
  ST_SH2(out0, out1, out, 8);
 }
-static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
+static int TTransform_MSA(const uint8_t* WEBP_RESTRICT in,
                          const uint16_t* WEBP_RESTRICT w) {
  int sum;
  uint32_t in0_m, in1_m, in2_m, in3_m;
  v16i8 src0 = { 0 };
@ -200,15 +205,17 @@ static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
  return sum;
 }
-static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
+static int Disto4x4_MSA(const uint8_t* WEBP_RESTRICT const a,
-                        const uint16_t* const w) {
+                        const uint8_t* WEBP_RESTRICT const b,
                        const uint16_t* WEBP_RESTRICT const w) {
  const int sum1 = TTransform_MSA(a, w);
  const int sum2 = TTransform_MSA(b, w);
  return abs(sum2 - sum1) >> 5;
 }
-static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
+static int Disto16x16_MSA(const uint8_t* WEBP_RESTRICT const a,
-                          const uint16_t* const w) {
+                          const uint8_t* WEBP_RESTRICT const b,
                          const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -259,7 +266,9 @@ static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
-static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+// vertical
 static WEBP_INLINE void VE4(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  const v16u8 A1 = { 0 };
  const uint64_t val_m = LD(top - 1);
  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
@ -272,7 +281,9 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
  SW4(out, out, out, out, dst, BPS);
 }
-static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+// horizontal
 static WEBP_INLINE void HE4(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -284,7 +295,8 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }
-static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
@ -293,7 +305,8 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
  SW4(dc, dc, dc, dc, dst, BPS);
 }
-static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void RD4(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  const v16u8 A2 = { 0 };
  const uint64_t val_m = LD(top - 5);
  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
@ -313,7 +326,8 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
  SW4(val3, val2, val1, val0, dst, BPS);
 }
-static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void LD4(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  const v16u8 A1 = { 0 };
  const uint64_t val_m = LD(top);
  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
@ -333,7 +347,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
  SW4(val0, val1, val2, val3, dst, BPS);
 }
-static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VR4(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -354,7 +369,8 @@ static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
  DST(3, 1) =             AVG3(B, C, D);
 }
-static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VL4(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
@ -375,7 +391,8 @@ static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
              DST(3, 3) = AVG3(F, G, H);
 }
-static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
@ -390,7 +407,8 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
-static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -411,7 +429,8 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
  DST(1, 3)             = AVG3(L, K, J);
 }
-static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  const v16i8 zero = { 0 };
  const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
  const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
@ -431,7 +450,8 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
 #undef AVG3
 #undef AVG2
-static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MSA(uint8_t* WEBP_RESTRICT dst,
                            const uint8_t* WEBP_RESTRICT top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
@ -451,7 +471,8 @@ static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
    ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);  \
 } while (0)
-static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VerticalPred16x16(uint8_t* WEBP_RESTRICT dst,
                                          const uint8_t* WEBP_RESTRICT top) {
  if (top != NULL) {
    const v16u8 out = LD_UB(top);
    STORE16x16(out, dst);
@ -461,8 +482,8 @@ static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
  }
 }
-static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
+static WEBP_INLINE void HorizontalPred16x16(uint8_t* WEBP_RESTRICT dst,
-                                            const uint8_t* left) {
+                                            const uint8_t* WEBP_RESTRICT left) {
  if (left != NULL) {
    int j;
    for (j = 0; j < 16; j += 4) {
@ -480,8 +501,9 @@ static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
  }
 }
-static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void TrueMotion16x16(uint8_t* WEBP_RESTRICT dst,
-                                        const uint8_t* top) {
+                                        const uint8_t* WEBP_RESTRICT left,
                                        const uint8_t* WEBP_RESTRICT top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
@ -519,8 +541,9 @@ static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
  }
 }
-static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void DCMode16x16(uint8_t* WEBP_RESTRICT dst,
-                                    const uint8_t* top) {
+                                    const uint8_t* WEBP_RESTRICT left,
                                    const uint8_t* WEBP_RESTRICT top) {
  int DC;
  v16u8 out;
  if (top != NULL && left != NULL) {
@ -548,8 +571,9 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
  STORE16x16(out, dst);
 }
-static void Intra16Preds_MSA(uint8_t* dst,
+static void Intra16Preds_MSA(uint8_t* WEBP_RESTRICT dst,
-                             const uint8_t* left, const uint8_t* top) {
+                             const uint8_t* WEBP_RESTRICT left,
                             const uint8_t* WEBP_RESTRICT top) {
  DCMode16x16(I16DC16 + dst, left, top);
  VerticalPred16x16(I16VE16 + dst, top);
  HorizontalPred16x16(I16HE16 + dst, left);
@ -574,7 +598,8 @@ static void Intra16Preds_MSA(uint8_t* dst,
  SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
 } while (0)
-static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VerticalPred8x8(uint8_t* WEBP_RESTRICT dst,
                                        const uint8_t* WEBP_RESTRICT top) {
  if (top != NULL) {
    const uint64_t out = LD(top);
    STORE8x8(out, dst);
@ -584,7 +609,8 @@ static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
  }
 }
-static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HorizontalPred8x8(uint8_t* WEBP_RESTRICT dst,
                                          const uint8_t* WEBP_RESTRICT left) {
  if (left != NULL) {
    int j;
    for (j = 0; j < 8; j += 4) {
@ -606,8 +632,9 @@ static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
  }
 }
-static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void TrueMotion8x8(uint8_t* WEBP_RESTRICT dst,
-                                      const uint8_t* top) {
+                                      const uint8_t* WEBP_RESTRICT left,
                                      const uint8_t* WEBP_RESTRICT top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
@ -646,8 +673,9 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
  }
 }
-static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void DCMode8x8(uint8_t* WEBP_RESTRICT dst,
-                                  const uint8_t* top) {
+                                  const uint8_t* WEBP_RESTRICT left,
                                  const uint8_t* WEBP_RESTRICT top) {
  uint64_t out;
  v16u8 src = { 0 };
  if (top != NULL && left != NULL) {
@ -670,8 +698,9 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
  STORE8x8(out, dst);
 }
-static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
+static void IntraChromaPreds_MSA(uint8_t* WEBP_RESTRICT dst,
-                                 const uint8_t* top) {
+                                 const uint8_t* WEBP_RESTRICT left,
                                 const uint8_t* WEBP_RESTRICT top) {
  // U block
  DCMode8x8(C8DC8 + dst, left, top);
  VerticalPred8x8(C8VE8 + dst, top);
@ -712,7 +741,8 @@ static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
 } while (0)
-static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MSA(const uint8_t* WEBP_RESTRICT a,
                        const uint8_t* WEBP_RESTRICT b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -739,7 +769,8 @@ static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
  return sum;
 }
-static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MSA(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -758,7 +789,8 @@ static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
  return sum;
 }
-static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MSA(const uint8_t* WEBP_RESTRICT a,
                      const uint8_t* WEBP_RESTRICT b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -778,7 +810,8 @@ static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
  return sum;
 }
-static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MSA(const uint8_t* WEBP_RESTRICT a,
                      const uint8_t* WEBP_RESTRICT b) {
  uint32_t sum = 0;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
@ -801,7 +834,7 @@ static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
 // Quantization
 static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
-                             const VP8Matrix* const mtx) {
+                             const VP8Matrix* WEBP_RESTRICT const mtx) {
  int sum;
  v8i16 in0, in1, sh0, sh1, out0, out1;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@ -854,7 +887,7 @@ static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
 }
 static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
-                               const VP8Matrix* const mtx) {
+                               const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@ -60,8 +60,8 @@ static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
-                                    const uint8_t* const ref,
+                                    const uint8_t* WEBP_RESTRICT const ref,
-                                    uint8_t* const dst) {
+                                    uint8_t* WEBP_RESTRICT const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);
@ -120,8 +120,9 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  Transpose8x2_NEON(E0, E1, rows);
 }
-static void ITransformOne_NEON(const uint8_t* ref,
+static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
-                               const int16_t* in, uint8_t* dst) {
+                               const int16_t* WEBP_RESTRICT in,
                               uint8_t* WEBP_RESTRICT dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
@ -131,8 +132,9 @@ static void ITransformOne_NEON(const uint8_t* ref,
 #else
-static void ITransformOne_NEON(const uint8_t* ref,
+static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
-                               const int16_t* in, uint8_t* dst) {
+                               const int16_t* WEBP_RESTRICT in,
                               uint8_t* WEBP_RESTRICT dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
@ -247,8 +249,9 @@ static void ITransformOne_NEON(const uint8_t* ref,
 #endif    // WEBP_USE_INTRINSICS
-static void ITransform_NEON(const uint8_t* ref,
+static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
-                            const int16_t* in, uint8_t* dst, int do_two) {
+                            const int16_t* WEBP_RESTRICT in,
                            uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
@ -294,8 +297,9 @@ static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
 }
-static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
-                            int16_t* out) {
+                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4_NEON(src);
@ -364,8 +368,9 @@ static const int32_t kCoeff32[] = {
  51000, 51000, 51000, 51000
 };
-static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
-                            int16_t* out) {
+                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
@ -484,7 +489,8 @@ static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
  src += stride;                                    \
 } while (0)
-static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
+static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
                               int16_t* WEBP_RESTRICT out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
@ -659,8 +665,9 @@ static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
+static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
-                         const uint16_t* const w) {
+                         const uint8_t* WEBP_RESTRICT const b,
                         const uint16_t* WEBP_RESTRICT const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@ -701,8 +708,9 @@ static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
 }
 #undef LOAD_LANE_32b
-static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
+static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
-                           const uint16_t* const w) {
+                           const uint8_t* WEBP_RESTRICT const b,
                           const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -715,9 +723,10 @@ static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
 //------------------------------------------------------------------------------
-static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
                                  const uint8_t* WEBP_RESTRICT pred,
                                  int start_block, int end_block,
-                                  VP8Histogram* const histo) {
+                                  VP8Histogram* WEBP_RESTRICT const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@ -747,9 +756,9 @@ static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
 //------------------------------------------------------------------------------
-static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
+static WEBP_INLINE void AccumulateSSE16_NEON(
-                                             const uint8_t* const b,
+    const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
-                                             uint32x4_t* const sum) {
+    uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@ -775,7 +784,8 @@ static int SumToInt_NEON(uint32x4_t sum) {
 #endif
 }
-static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
                         const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
@ -784,7 +794,8 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  return SumToInt_NEON(sum);
 }
-static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
                        const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
@ -793,7 +804,8 @@ static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  return SumToInt_NEON(sum);
 }
-static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
@ -806,7 +818,8 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
  return SumToInt_NEON(sum);
 }
-static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  const uint8x16_t a0 = Load4x4_NEON(a);
  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@ -825,8 +838,9 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
 // Compilation with gcc-4.6.x is problematic for now.
 #if !defined(WORK_AROUND_GCC)
-static int16x8_t Quantize_NEON(int16_t* const in,
+static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
-                               const VP8Matrix* const mtx, int offset) {
+                               const VP8Matrix* WEBP_RESTRICT const mtx,
                               int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@ -860,7 +874,7 @@ static const uint8_t kShuffles[4][8] = {
 };
 static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
-                              const VP8Matrix* const mtx) {
+                              const VP8Matrix* WEBP_RESTRICT const mtx) {
  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
@ -902,7 +916,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
 }
 static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
-                                const VP8Matrix* const mtx) {
+                                const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -932,7 +946,8 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
    vst1q_u8(dst, r);                                                          \
  } while (0)
-static void Intra4Preds_NEON(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
                             const uint8_t* WEBP_RESTRICT top) {
  // 0   1   2   3   4   5   6   7   8   9  10  11  12  13
  //     L   K   J   I   X   A   B   C   D   E   F   G   H
  //    -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
@ -1165,8 +1180,9 @@ static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
  }
 }
-static void Intra16Preds_NEON(uint8_t* dst, const uint8_t* left,
+static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
-                              const uint8_t* top) {
+                              const uint8_t* WEBP_RESTRICT left,
                              const uint8_t* WEBP_RESTRICT top) {
  DCMode_NEON(I16DC16 + dst, left, top);
  VerticalPred16_NEON(I16VE16 + dst, top);
  HorizontalPred16_NEON(I16HE16 + dst, left);
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@ -26,8 +26,9 @@
 // Transforms (Paragraph 14.4)
 // Does one inverse transform.
-static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in,
+static void ITransform_One_SSE2(const uint8_t* WEBP_RESTRICT ref,
-                                uint8_t* dst) {
+                                const int16_t* WEBP_RESTRICT in,
                                uint8_t* WEBP_RESTRICT dst) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -177,8 +178,9 @@ static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in,
 }
 // Does two inverse transforms.
-static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in,
+static void ITransform_Two_SSE2(const uint8_t* WEBP_RESTRICT ref,
-                                uint8_t* dst) {
+                                const int16_t* WEBP_RESTRICT in,
                                uint8_t* WEBP_RESTRICT dst) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -316,7 +318,9 @@ static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in,
 }
 // Does one or two inverse transforms.
-static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+static void ITransform_SSE2(const uint8_t* WEBP_RESTRICT ref,
                            const int16_t* WEBP_RESTRICT in,
                            uint8_t* WEBP_RESTRICT dst,
                            int do_two) {
  if (do_two) {
    ITransform_Two_SSE2(ref, in, dst);
@ -373,7 +377,7 @@ static void FTransformPass1_SSE2(const __m128i* const in01,
 static void FTransformPass2_SSE2(const __m128i* const v01,
                                 const __m128i* const v32,
-                                 int16_t* out) {
+                                 int16_t* WEBP_RESTRICT out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
@ -424,8 +428,9 @@ static void FTransformPass2_SSE2(const __m128i* const v01,
  _mm_storeu_si128((__m128i*)&out[8], d2_f3);
 }
-static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
+static void FTransform_SSE2(const uint8_t* WEBP_RESTRICT src,
-                            int16_t* out) {
+                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  const __m128i zero = _mm_setzero_si128();
  // Load src.
  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
@ -468,8 +473,9 @@ static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
  FTransformPass2_SSE2(&v01, &v32, out);
 }
-static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
+static void FTransform2_SSE2(const uint8_t* WEBP_RESTRICT src,
-                             int16_t* out) {
+                             const uint8_t* WEBP_RESTRICT ref,
                             int16_t* WEBP_RESTRICT out) {
  const __m128i zero = _mm_setzero_si128();
  // Load src and convert to 16b.
@ -517,7 +523,8 @@ static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
  FTransformPass2_SSE2(&v01h, &v32h, out + 16);
 }
-static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
+static void FTransformWHTRow_SSE2(const int16_t* WEBP_RESTRICT const in,
                                  __m128i* const out) {
  const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
  const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
  const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
@ -533,7 +540,8 @@ static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
  *out = _mm_madd_epi16(D, kMult);
 }
-static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
+static void FTransformWHT_SSE2(const int16_t* WEBP_RESTRICT in,
                               int16_t* WEBP_RESTRICT out) {
  // Input is 12b signed.
  __m128i row0, row1, row2, row3;
  // Rows are 14b signed.
@ -566,9 +574,10 @@ static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
-static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_SSE2(const uint8_t* WEBP_RESTRICT ref,
                                  const uint8_t* WEBP_RESTRICT pred,
                                  int start_block, int end_block,
-                                  VP8Histogram* const histo) {
+                                  VP8Histogram* WEBP_RESTRICT const histo) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
@ -640,7 +649,8 @@ static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
  }
 }
-static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
                                   const uint8_t* WEBP_RESTRICT top) {
  int j;
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  for (j = 0; j < 8; ++j) {
@ -648,7 +658,8 @@ static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
  }
 }
-static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE16_SSE2(uint8_t* WEBP_RESTRICT dst,
                                  const uint8_t* WEBP_RESTRICT top) {
  const __m128i top_values = _mm_load_si128((const __m128i*)top);
  int j;
  for (j = 0; j < 16; ++j) {
@ -656,8 +667,9 @@ static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
  }
 }
-static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
+static WEBP_INLINE void VerticalPred_SSE2(uint8_t* WEBP_RESTRICT dst,
-                                          const uint8_t* top, int size) {
+                                          const uint8_t* WEBP_RESTRICT top,
                                          int size) {
  if (top != NULL) {
    if (size == 8) {
      VE8uv_SSE2(dst, top);
@ -669,7 +681,8 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
  }
 }
-static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
                                   const uint8_t* WEBP_RESTRICT left) {
  int j;
  for (j = 0; j < 8; ++j) {
    const __m128i values = _mm_set1_epi8((char)left[j]);
@ -678,7 +691,8 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
  }
 }
-static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE16_SSE2(uint8_t* WEBP_RESTRICT dst,
                                  const uint8_t* WEBP_RESTRICT left) {
  int j;
  for (j = 0; j < 16; ++j) {
    const __m128i values = _mm_set1_epi8((char)left[j]);
@ -687,8 +701,9 @@ static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
  }
 }
-static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
+static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* WEBP_RESTRICT dst,
-                                            const uint8_t* left, int size) {
+                                            const uint8_t* WEBP_RESTRICT left,
                                            int size) {
  if (left != NULL) {
    if (size == 8) {
      HE8uv_SSE2(dst, left);
@ -700,8 +715,9 @@ static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
  }
 }
-static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void TM_SSE2(uint8_t* WEBP_RESTRICT dst,
-                                const uint8_t* top, int size) {
+                                const uint8_t* WEBP_RESTRICT left,
                                const uint8_t* WEBP_RESTRICT top, int size) {
  const __m128i zero = _mm_setzero_si128();
  int y;
  if (size == 8) {
@ -728,8 +744,10 @@ static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
  }
 }
-static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* WEBP_RESTRICT dst,
-                                        const uint8_t* top, int size) {
+                                        const uint8_t* WEBP_RESTRICT left,
                                        const uint8_t* WEBP_RESTRICT top,
                                        int size) {
  if (left != NULL) {
    if (top != NULL) {
      TM_SSE2(dst, left, top, size);
@ -749,8 +767,9 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
  }
 }
-static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void DC8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
-                                   const uint8_t* top) {
+                                   const uint8_t* WEBP_RESTRICT left,
                                   const uint8_t* WEBP_RESTRICT top) {
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
  const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
@ -758,7 +777,8 @@ static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
  Put8x8uv_SSE2(DC >> 4, dst);
 }
-static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* WEBP_RESTRICT dst,
                                         const uint8_t* WEBP_RESTRICT top) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
  const __m128i sum = _mm_sad_epu8(top_values, zero);
@ -766,7 +786,8 @@ static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
  Put8x8uv_SSE2(DC >> 3, dst);
 }
-static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* WEBP_RESTRICT dst,
                                        const uint8_t* WEBP_RESTRICT left) {
  // 'left' is contiguous so we can reuse the top summation.
  DC8uvNoLeft_SSE2(dst, left);
 }
@ -775,8 +796,9 @@ static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
  Put8x8uv_SSE2(0x80, dst);
 }
-static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* WEBP_RESTRICT dst,
-                                       const uint8_t* top) {
+                                       const uint8_t* WEBP_RESTRICT left,
                                       const uint8_t* WEBP_RESTRICT top) {
  if (top != NULL) {
    if (left != NULL) {  // top and left present
      DC8uv_SSE2(dst, left, top);
@ -790,8 +812,9 @@ static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
  }
 }
-static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void DC16_SSE2(uint8_t* WEBP_RESTRICT dst,
-                                  const uint8_t* top) {
+                                  const uint8_t* WEBP_RESTRICT left,
                                  const uint8_t* WEBP_RESTRICT top) {
  const __m128i top_row = _mm_load_si128((const __m128i*)top);
  const __m128i left_row = _mm_load_si128((const __m128i*)left);
  const int DC =
@ -799,13 +822,15 @@ static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
  Put16_SSE2(DC >> 5, dst);
 }
-static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* WEBP_RESTRICT dst,
                                        const uint8_t* WEBP_RESTRICT top) {
  const __m128i top_row = _mm_load_si128((const __m128i*)top);
  const int DC = VP8HorizontalAdd8b(&top_row) + 8;
  Put16_SSE2(DC >> 4, dst);
 }
-static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* WEBP_RESTRICT dst,
                                       const uint8_t* WEBP_RESTRICT left) {
  // 'left' is contiguous so we can reuse the top summation.
  DC16NoLeft_SSE2(dst, left);
 }
@ -814,8 +839,9 @@ static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
  Put16_SSE2(0x80, dst);
 }
-static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
+static WEBP_INLINE void DC16Mode_SSE2(uint8_t* WEBP_RESTRICT dst,
-                                      const uint8_t* top) {
+                                      const uint8_t* WEBP_RESTRICT left,
                                      const uint8_t* WEBP_RESTRICT top) {
  if (top != NULL) {
    if (left != NULL) {  // top and left present
      DC16_SSE2(dst, left, top);
@ -844,8 +870,9 @@ static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
-static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
+// vertical
-                                 const uint8_t* top) {  // vertical
+static WEBP_INLINE void VE4_SSE2(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT top) {
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -861,8 +888,9 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
  }
 }
-static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
+// horizontal
-                                 const uint8_t* top) {  // horizontal
+static WEBP_INLINE void HE4_SSE2(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -874,15 +902,17 @@ static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }
-static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4_SSE2(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT top) {
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
  Fill_SSE2(dst, dc >> 3, 4);
 }
-static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
+// Down-Left
-                                 const uint8_t* top) {  // Down-Left
+static WEBP_INLINE void LD4_SSE2(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT top) {
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -898,8 +928,9 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
-static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
+// Vertical-Right
-                                 const uint8_t* top) {  // Vertical-Right
+static WEBP_INLINE void VR4_SSE2(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT top) {
  const __m128i one = _mm_set1_epi8(1);
  const int I = top[-2];
  const int J = top[-3];
@ -924,8 +955,9 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
  DST(0, 3) = AVG3(K, J, I);
 }
-static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
+// Vertical-Left
-                                 const uint8_t* top) {  // Vertical-Left
+static WEBP_INLINE void VL4_SSE2(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT top) {
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@ -951,8 +983,9 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
  DST(3, 3) = (extra_out >> 8) & 0xff;
 }
-static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
+// Down-right
-                                 const uint8_t* top) {  // Down-right
+static WEBP_INLINE void RD4_SSE2(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT top) {
  const __m128i one = _mm_set1_epi8(1);
  const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
  const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
@ -968,7 +1001,8 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
-static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4_SSE2(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
@ -983,7 +1017,8 @@ static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
-static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4_SSE2(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
@ -1006,7 +1041,8 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
  DST(1, 3)             = AVG3(L, K, J);
 }
-static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4_SSE2(uint8_t* WEBP_RESTRICT dst,
                                 const uint8_t* WEBP_RESTRICT top) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top));
  const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
@ -1028,7 +1064,8 @@ static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_SSE2(uint8_t* WEBP_RESTRICT dst,
                             const uint8_t* WEBP_RESTRICT top) {
  DC4_SSE2(I4DC4 + dst, top);
  TM4_SSE2(I4TM4 + dst, top);
  VE4_SSE2(I4VE4 + dst, top);
@ -1044,8 +1081,9 @@ static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
-static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
+static void IntraChromaPreds_SSE2(uint8_t* WEBP_RESTRICT dst,
-                                  const uint8_t* top) {
+                                  const uint8_t* WEBP_RESTRICT left,
                                  const uint8_t* WEBP_RESTRICT top) {
  // U block
  DC8uvMode_SSE2(C8DC8 + dst, left, top);
  VerticalPred_SSE2(C8VE8 + dst, top, 8);
@ -1064,8 +1102,9 @@ static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
-static void Intra16Preds_SSE2(uint8_t* dst,
+static void Intra16Preds_SSE2(uint8_t* WEBP_RESTRICT dst,
-                              const uint8_t* left, const uint8_t* top) {
+                              const uint8_t* WEBP_RESTRICT left,
                              const uint8_t* WEBP_RESTRICT top) {
  DC16Mode_SSE2(I16DC16 + dst, left, top);
  VerticalPred_SSE2(I16VE16 + dst, top, 16);
  HorizontalPred_SSE2(I16HE16 + dst, left, 16);
@ -1092,7 +1131,8 @@ static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
  *sum = _mm_add_epi32(sum1, sum2);
 }
-static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
+static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* WEBP_RESTRICT a,
                                     const uint8_t* WEBP_RESTRICT b,
                                     int num_pairs) {
  __m128i sum = _mm_setzero_si128();
  int32_t tmp[4];
@ -1114,18 +1154,21 @@ static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }
-static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_SSE2(const uint8_t* WEBP_RESTRICT a,
                         const uint8_t* WEBP_RESTRICT b) {
  return SSE_16xN_SSE2(a, b, 8);
 }
-static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_SSE2(const uint8_t* WEBP_RESTRICT a,
                        const uint8_t* WEBP_RESTRICT b) {
  return SSE_16xN_SSE2(a, b, 4);
 }
 #define LOAD_8x16b(ptr) \
  _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
-static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_SSE2(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  const __m128i zero = _mm_setzero_si128();
  int num_pairs = 4;
  __m128i sum = zero;
@ -1152,7 +1195,8 @@ static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
 }
 #undef LOAD_8x16b
-static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_SSE2(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  const __m128i zero = _mm_setzero_si128();
  // Load values. Note that we read 8 pixels instead of 4,
@ -1189,7 +1233,7 @@ static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
 //------------------------------------------------------------------------------
-static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_SSE2(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
  const __m128i mask = _mm_set1_epi16(0x00ff);
  const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
  const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
@ -1227,8 +1271,9 @@ static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
+static int TTransform_SSE2(const uint8_t* WEBP_RESTRICT inA,
-                           const uint16_t* const w) {
+                           const uint8_t* WEBP_RESTRICT inB,
                           const uint16_t* WEBP_RESTRICT const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
@ -1328,14 +1373,16 @@ static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
  return sum[0] + sum[1] + sum[2] + sum[3];
 }
-static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
+static int Disto4x4_SSE2(const uint8_t* WEBP_RESTRICT const a,
-                         const uint16_t* const w) {
+                         const uint8_t* WEBP_RESTRICT const b,
                         const uint16_t* WEBP_RESTRICT const w) {
  const int diff_sum = TTransform_SSE2(a, b, w);
  return abs(diff_sum) >> 5;
 }
-static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
+static int Disto16x16_SSE2(const uint8_t* WEBP_RESTRICT const a,
-                           const uint16_t* const w) {
+                           const uint8_t* WEBP_RESTRICT const b,
                           const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -1350,9 +1397,10 @@ static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
 // Quantization
 //
-static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+static WEBP_INLINE int DoQuantizeBlock_SSE2(
-                                            const uint16_t* const sharpen,
+    int16_t in[16], int16_t out[16],
-                                            const VP8Matrix* const mtx) {
+    const uint16_t* WEBP_RESTRICT const sharpen,
    const VP8Matrix* WEBP_RESTRICT const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeff0, coeff8;
@ -1463,17 +1511,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
 }
 static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
-                              const VP8Matrix* const mtx) {
+                              const VP8Matrix* WEBP_RESTRICT const mtx) {
  return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
 }
 static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
-                                 const VP8Matrix* const mtx) {
+                                 const VP8Matrix* WEBP_RESTRICT const mtx) {
  return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
 }
 static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
-                                const VP8Matrix* const mtx) {
+                                const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  const uint16_t* const sharpen = &mtx->sharpen_[0];
  nz  = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
--- a/src/dsp/enc_sse41.c
+++ b/src/dsp/enc_sse41.c
@ -23,9 +23,10 @@
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms.
-static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_SSE41(const uint8_t* WEBP_RESTRICT ref,
                                   const uint8_t* WEBP_RESTRICT pred,
                                   int start_block, int end_block,
-                                   VP8Histogram* const histo) {
+                                   VP8Histogram* WEBP_RESTRICT const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@ -168,14 +169,16 @@ static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
  return sum[0] + sum[1] + sum[2] + sum[3];
 }
-static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
+static int Disto4x4_SSE41(const uint8_t* WEBP_RESTRICT const a,
-                          const uint16_t* const w) {
+                          const uint8_t* WEBP_RESTRICT const b,
                          const uint16_t* WEBP_RESTRICT const w) {
  const int diff_sum = TTransform_SSE41(a, b, w);
  return abs(diff_sum) >> 5;
 }
-static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
+static int Disto16x16_SSE41(const uint8_t* WEBP_RESTRICT const a,
-                            const uint16_t* const w) {
+                            const uint8_t* WEBP_RESTRICT const b,
                            const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@ -301,17 +304,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
 #undef PSHUFB_CST
 static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
-                               const VP8Matrix* const mtx) {
+                               const VP8Matrix* WEBP_RESTRICT const mtx) {
  return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
 }
 static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
-                                  const VP8Matrix* const mtx) {
+                                  const VP8Matrix* WEBP_RESTRICT const mtx) {
  return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
 }
 static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
-                                 const VP8Matrix* const mtx) {
+                                 const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  const uint16_t* const sharpen = &mtx->sharpen_[0];
  nz  = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
--- a/src/dsp/filters.c
+++ b/src/dsp/filters.c
@ -23,14 +23,16 @@
  do {                                                                         \
    assert((in) != NULL);                                                      \
    assert((out) != NULL);                                                     \
    assert((in) != (out));                                                     \
    assert(width > 0);                                                         \
    assert(height > 0);                                                        \
    assert(stride >= width);                                                   \
  } while (0)
 #if !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
+static WEBP_INLINE void PredictLine_C(const uint8_t* WEBP_RESTRICT src,
-                                      uint8_t* dst, int length) {
+                                      const uint8_t* WEBP_RESTRICT pred,
                                      uint8_t* WEBP_RESTRICT dst, int length) {
  int i;
  for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
 }
@ -38,9 +40,9 @@ static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
 //------------------------------------------------------------------------------
 // Horizontal filter.
-static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
+static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* WEBP_RESTRICT in,
                                             int width, int height, int stride,
-                                             uint8_t* out) {
+                                             uint8_t* WEBP_RESTRICT out) {
  const uint8_t* preds = in;
  int row;
  DCHECK(in, out);
@ -66,9 +68,9 @@ static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Vertical filter.
-static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
+static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* WEBP_RESTRICT in,
                                           int width, int height, int stride,
-                                           uint8_t* out) {
+                                           uint8_t* WEBP_RESTRICT out) {
  const uint8_t* preds = in;
  int row;
  DCHECK(in, out);
@ -99,9 +101,9 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
 }
 #if !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
+static WEBP_INLINE void DoGradientFilter_C(const uint8_t* WEBP_RESTRICT in,
                                           int width, int height, int stride,
-                                           uint8_t* out) {
+                                           uint8_t* WEBP_RESTRICT out) {
  const uint8_t* preds = in;
  int row;
  DCHECK(in, out);
@ -136,18 +138,21 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
 //------------------------------------------------------------------------------
 #if !WEBP_NEON_OMIT_C_CODE
-static void HorizontalFilter_C(const uint8_t* data, int width, int height,
+static void HorizontalFilter_C(const uint8_t* WEBP_RESTRICT data,
-                               int stride, uint8_t* filtered_data) {
+                               int width, int height, int stride,
                               uint8_t* WEBP_RESTRICT filtered_data) {
  DoHorizontalFilter_C(data, width, height, stride, filtered_data);
 }
-static void VerticalFilter_C(const uint8_t* data, int width, int height,
+static void VerticalFilter_C(const uint8_t* WEBP_RESTRICT data,
-                             int stride, uint8_t* filtered_data) {
+                             int width, int height, int stride,
                             uint8_t* WEBP_RESTRICT filtered_data) {
  DoVerticalFilter_C(data, width, height, stride, filtered_data);
 }
-static void GradientFilter_C(const uint8_t* data, int width, int height,
+static void GradientFilter_C(const uint8_t* WEBP_RESTRICT data,
-                             int stride, uint8_t* filtered_data) {
+                             int width, int height, int stride,
                             uint8_t* WEBP_RESTRICT filtered_data) {
  DoGradientFilter_C(data, width, height, stride, filtered_data);
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
--- a/src/dsp/filters_mips_dsp_r2.c
+++ b/src/dsp/filters_mips_dsp_r2.c
@ -26,8 +26,9 @@
 #define DCHECK(in, out)                                                        \
  do {                                                                         \
-    assert(in != NULL);                                                        \
+    assert((in) != NULL);                                                      \
-    assert(out != NULL);                                                       \
+    assert((out) != NULL);                                                     \
    assert((in) != (out));                                                     \
    assert(width > 0);                                                         \
    assert(height > 0);                                                        \
    assert(stride >= width);                                                   \
@ -101,7 +102,8 @@
    );                                                                         \
  } while (0)
-static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
+static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
                                              uint8_t* WEBP_RESTRICT dst,
                                              int length) {
  DO_PREDICT_LINE(src, dst, length, 0);
 }
@ -191,9 +193,9 @@ static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
    }                                                                          \
  } while (0)
-static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
+static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(
-                                                     int width, int height,
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
-                                                     int stride, uint8_t* out) {
+    uint8_t* WEBP_RESTRICT out) {
  const uint8_t* preds = in;
  int row;
  DCHECK(in, out);
@ -210,9 +212,9 @@ static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE
-static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
+static void HorizontalFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
-                                       int width, int height,
+                                       int width, int height, int stride,
-                                       int stride, uint8_t* filtered_data) {
+                                       uint8_t* WEBP_RESTRICT filtered_data) {
  DoHorizontalFilter_MIPSdspR2(data, width, height, stride, filtered_data);
 }
@ -228,9 +230,9 @@ static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
    }                                                                          \
  } while (0)
-static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
+static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(
-                                                   int width, int height,
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
-                                                   int stride, uint8_t* out) {
+    uint8_t* WEBP_RESTRICT out) {
  const uint8_t* preds = in;
  int row;
  DCHECK(in, out);
@ -247,8 +249,9 @@ static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE
-static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+static void VerticalFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
-                                     int stride, uint8_t* filtered_data) {
+                                     int width, int height, int stride,
                                     uint8_t* WEBP_RESTRICT filtered_data) {
  DoVerticalFilter_MIPSdspR2(data, width, height, stride, filtered_data);
 }
@ -284,9 +287,9 @@ static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
    }                                                                          \
  } while (0)
-static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
+static void DoGradientFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT in,
                                       int width, int height, int stride,
-                                       uint8_t* out) {
+                                       uint8_t* WEBP_RESTRICT out) {
  const uint8_t* preds = in;
  int row;
  DCHECK(in, out);
@ -303,8 +306,9 @@ static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE
-static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+static void GradientFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
-                                     int stride, uint8_t* filtered_data) {
+                                     int width, int height, int stride,
                                     uint8_t* WEBP_RESTRICT filtered_data) {
  DoGradientFilter_MIPSdspR2(data, width, height, stride, filtered_data);
 }
--- a/src/dsp/filters_msa.c
+++ b/src/dsp/filters_msa.c
@ -21,7 +21,8 @@
 static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
                                            const uint8_t* pred,
-                                            uint8_t* dst, int length) {
+                                            uint8_t* WEBP_RESTRICT dst,
                                            int length) {
  v16u8 src0, pred0, dst0;
  assert(length >= 0);
  while (length >= 32) {
@ -58,8 +59,9 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 #define DCHECK(in, out)        \
  do {                         \
-    assert(in != NULL);        \
+    assert((in) != NULL);      \
-    assert(out != NULL);       \
+    assert((out) != NULL);     \
    assert((in) != (out));     \
    assert(width > 0);         \
    assert(height > 0);        \
    assert(stride >= width);   \
@ -68,8 +70,9 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 //------------------------------------------------------------------------------
 // Horrizontal filter
-static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
+static void HorizontalFilter_MSA(const uint8_t* WEBP_RESTRICT data,
-                                 int stride, uint8_t* filtered_data) {
+                                 int width, int height, int stride,
                                 uint8_t* WEBP_RESTRICT filtered_data) {
  const uint8_t* preds = data;
  const uint8_t* in = data;
  uint8_t* out = filtered_data;
@ -99,8 +102,8 @@ static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
 static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
                                            const uint8_t* ppred,
-                                            uint8_t* poutput, int stride,
+                                            uint8_t* WEBP_RESTRICT poutput,
-                                            int size) {
+                                            int stride, int size) {
  int w;
  const v16i8 zero = { 0 };
  while (size >= 16) {
@ -131,8 +134,9 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
 }
-static void GradientFilter_MSA(const uint8_t* data, int width, int height,
+static void GradientFilter_MSA(const uint8_t* WEBP_RESTRICT data,
-                               int stride, uint8_t* filtered_data) {
+                               int width, int height, int stride,
                               uint8_t* WEBP_RESTRICT filtered_data) {
  const uint8_t* in = data;
  const uint8_t* preds = data;
  uint8_t* out = filtered_data;
@ -159,8 +163,9 @@ static void GradientFilter_MSA(const uint8_t* data, int width, int height,
 //------------------------------------------------------------------------------
 // Vertical filter
-static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
+static void VerticalFilter_MSA(const uint8_t* WEBP_RESTRICT data,
-                               int stride, uint8_t* filtered_data) {
+                               int width, int height, int stride,
                               uint8_t* WEBP_RESTRICT filtered_data) {
  const uint8_t* in = data;
  const uint8_t* preds = data;
  uint8_t* out = filtered_data;
--- a/src/dsp/filters_neon.c
+++ b/src/dsp/filters_neon.c
@ -23,8 +23,9 @@
 #define DCHECK(in, out)                                                        \
  do {                                                                         \
-    assert(in != NULL);                                                        \
+    assert((in) != NULL);                                                      \
-    assert(out != NULL);                                                       \
+    assert((out) != NULL);                                                     \
    assert((in) != (out));                                                     \
    assert(width > 0);                                                         \
    assert(height > 0);                                                        \
    assert(stride >= width);                                                   \
@ -44,7 +45,7 @@
 #define ROTATE_RIGHT_N(A, N)   vext_u8((A), (A), (8 - (N)) % 8)
 static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
-                             uint8_t* dst, int length) {
+                             uint8_t* WEBP_RESTRICT dst, int length) {
  int i;
  assert(length >= 0);
  for (i = 0; i + 16 <= length; i += 16) {
@ -57,16 +58,17 @@ static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
 }
 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft_NEON(const uint8_t* WEBP_RESTRICT src,
                                 uint8_t* WEBP_RESTRICT dst, int length) {
  PredictLine_NEON(src, src - 1, dst, length);
 }
 //------------------------------------------------------------------------------
 // Horizontal filter.
-static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
+static WEBP_INLINE void DoHorizontalFilter_NEON(
-                                                int width, int height,
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
-                                                int stride, uint8_t* out) {
+    uint8_t* WEBP_RESTRICT out) {
  int row;
  DCHECK(in, out);
@ -86,17 +88,18 @@ static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
  }
 }
-static void HorizontalFilter_NEON(const uint8_t* data, int width, int height,
+static void HorizontalFilter_NEON(const uint8_t* WEBP_RESTRICT data,
-                                  int stride, uint8_t* filtered_data) {
+                                  int width, int height, int stride,
                                  uint8_t* WEBP_RESTRICT filtered_data) {
  DoHorizontalFilter_NEON(data, width, height, stride, filtered_data);
 }
 //------------------------------------------------------------------------------
 // Vertical filter.
-static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
+static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* WEBP_RESTRICT in,
                                              int width, int height, int stride,
-                                              uint8_t* out) {
+                                              uint8_t* WEBP_RESTRICT out) {
  int row;
  DCHECK(in, out);
@ -115,8 +118,9 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
  }
 }
-static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
+static void VerticalFilter_NEON(const uint8_t* WEBP_RESTRICT data,
-                                int stride, uint8_t* filtered_data) {
+                                int width, int height, int stride,
                                uint8_t* WEBP_RESTRICT filtered_data) {
  DoVerticalFilter_NEON(data, width, height, stride, filtered_data);
 }
@ -130,7 +134,8 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
 static void GradientPredictDirect_NEON(const uint8_t* const row,
                                       const uint8_t* const top,
-                                       uint8_t* const out, int length) {
+                                       uint8_t* WEBP_RESTRICT const out,
                                       int length) {
  int i;
  for (i = 0; i + 8 <= length; i += 8) {
    const uint8x8_t A = vld1_u8(&row[i - 1]);
@ -146,9 +151,9 @@ static void GradientPredictDirect_NEON(const uint8_t* const row,
  }
 }
-static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
+static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* WEBP_RESTRICT in,
                                              int width, int height, int stride,
-                                              uint8_t* out) {
+                                              uint8_t* WEBP_RESTRICT out) {
  int row;
  DCHECK(in, out);
@ -167,8 +172,9 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
  }
 }
-static void GradientFilter_NEON(const uint8_t* data, int width, int height,
+static void GradientFilter_NEON(const uint8_t* WEBP_RESTRICT data,
-                                int stride, uint8_t* filtered_data) {
+                                int width, int height, int stride,
                                uint8_t* WEBP_RESTRICT filtered_data) {
  DoGradientFilter_NEON(data, width, height, stride, filtered_data);
 }
--- a/src/dsp/filters_sse2.c
+++ b/src/dsp/filters_sse2.c
@ -27,13 +27,15 @@
  do {                                                                         \
    assert((in) != NULL);                                                      \
    assert((out) != NULL);                                                     \
    assert((in) != (out));                                                     \
    assert(width > 0);                                                         \
    assert(height > 0);                                                        \
    assert(stride >= width);                                                   \
  } while (0)
-static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
+static void PredictLineTop_SSE2(const uint8_t* WEBP_RESTRICT src,
-                                uint8_t* dst, int length) {
+                                const uint8_t* WEBP_RESTRICT pred,
                                uint8_t* WEBP_RESTRICT dst, int length) {
  int i;
  const int max_pos = length & ~31;
  assert(length >= 0);
@ -51,7 +53,8 @@ static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
 }
 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft_SSE2(const uint8_t* WEBP_RESTRICT src,
                                 uint8_t* WEBP_RESTRICT dst, int length) {
  int i;
  const int max_pos = length & ~31;
  assert(length >= 0);
@ -71,9 +74,9 @@ static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
 //------------------------------------------------------------------------------
 // Horizontal filter.
-static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
+static WEBP_INLINE void DoHorizontalFilter_SSE2(
-                                                int width, int height,
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
-                                                int stride, uint8_t* out) {
+    uint8_t* WEBP_RESTRICT out) {
  int row;
  DCHECK(in, out);
@ -96,9 +99,9 @@ static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Vertical filter.
-static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
+static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* WEBP_RESTRICT in,
                                              int width, int height, int stride,
-                                              uint8_t* out) {
+                                              uint8_t* WEBP_RESTRICT out) {
  int row;
  DCHECK(in, out);
@ -127,7 +130,8 @@ static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
 static void GradientPredictDirect_SSE2(const uint8_t* const row,
                                       const uint8_t* const top,
-                                       uint8_t* const out, int length) {
+                                       uint8_t* WEBP_RESTRICT const out,
                                       int length) {
  const int max_pos = length & ~7;
  int i;
  const __m128i zero = _mm_setzero_si128();
@ -151,9 +155,9 @@ static void GradientPredictDirect_SSE2(const uint8_t* const row,
  }
 }
-static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
+static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* WEBP_RESTRICT in,
                                              int width, int height, int stride,
-                                              uint8_t* out) {
+                                              uint8_t* WEBP_RESTRICT out) {
  int row;
  DCHECK(in, out);
@ -176,18 +180,21 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
 //------------------------------------------------------------------------------
-static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
+static void HorizontalFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
-                                  int stride, uint8_t* filtered_data) {
+                                  int width, int height, int stride,
                                  uint8_t* WEBP_RESTRICT filtered_data) {
  DoHorizontalFilter_SSE2(data, width, height, stride, filtered_data);
 }
-static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
+static void VerticalFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
-                                int stride, uint8_t* filtered_data) {
+                                int width, int height, int stride,
                                uint8_t* WEBP_RESTRICT filtered_data) {
  DoVerticalFilter_SSE2(data, width, height, stride, filtered_data);
 }
-static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
+static void GradientFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
-                                int stride, uint8_t* filtered_data) {
+                                int width, int height, int stride,
                                uint8_t* WEBP_RESTRICT filtered_data) {
  DoGradientFilter_SSE2(data, width, height, stride, filtered_data);
 }
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@ -182,13 +182,13 @@ uint32_t VP8LPredictor13_C(const uint32_t* const left,
 }
 static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper,
-                            int num_pixels, uint32_t* out) {
+                            int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int x;
  (void)upper;
  for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK);
 }
 static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
-                            int num_pixels, uint32_t* out) {
+                            int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint32_t left = out[-1];
  (void)upper;
@ -441,8 +441,8 @@ static int is_big_endian(void) {
  return (tmp.b[0] != 1);
 }
-void VP8LConvertBGRAToRGB_C(const uint32_t* src,
+void VP8LConvertBGRAToRGB_C(const uint32_t* WEBP_RESTRICT src,
-                            int num_pixels, uint8_t* dst) {
+                            int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const src_end = src + num_pixels;
  while (src < src_end) {
    const uint32_t argb = *src++;
@ -452,8 +452,8 @@ void VP8LConvertBGRAToRGB_C(const uint32_t* src,
  }
 }
-void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
+void VP8LConvertBGRAToRGBA_C(const uint32_t* WEBP_RESTRICT src,
-                             int num_pixels, uint8_t* dst) {
+                             int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const src_end = src + num_pixels;
  while (src < src_end) {
    const uint32_t argb = *src++;
@ -464,8 +464,8 @@ void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
  }
 }
-void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* WEBP_RESTRICT src,
-                                 int num_pixels, uint8_t* dst) {
+                                 int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const src_end = src + num_pixels;
  while (src < src_end) {
    const uint32_t argb = *src++;
@ -481,8 +481,8 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
  }
 }
-void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+void VP8LConvertBGRAToRGB565_C(const uint32_t* WEBP_RESTRICT src,
-                               int num_pixels, uint8_t* dst) {
+                               int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const src_end = src + num_pixels;
  while (src < src_end) {
    const uint32_t argb = *src++;
@ -498,8 +498,8 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
  }
 }
-void VP8LConvertBGRAToBGR_C(const uint32_t* src,
+void VP8LConvertBGRAToBGR_C(const uint32_t* WEBP_RESTRICT src,
-                            int num_pixels, uint8_t* dst) {
+                            int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const src_end = src + num_pixels;
  while (src < src_end) {
    const uint32_t argb = *src++;
@ -509,8 +509,8 @@ void VP8LConvertBGRAToBGR_C(const uint32_t* src,
  }
 }
-static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
+static void CopyOrSwap(const uint32_t* WEBP_RESTRICT src, int num_pixels,
-                       int swap_on_big_endian) {
+                       uint8_t* WEBP_RESTRICT dst, int swap_on_big_endian) {
  if (is_big_endian() == swap_on_big_endian) {
    const uint32_t* const src_end = src + num_pixels;
    while (src < src_end) {
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -18,6 +18,7 @@
 #include "src/webp/types.h"
 #include "src/webp/decode.h"
 #include "src/dsp/dsp.h"
 #include "src/enc/histogram_enc.h"
 #include "src/utils/utils.h"
@ -60,7 +61,7 @@ uint32_t VP8LPredictor13_C(const uint32_t* const left,
 // These Add/Sub function expects upper[-1] and out[-1] to be readable.
 typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
                                        const uint32_t* upper, int num_pixels,
-                                        uint32_t* out);
+                                        uint32_t* WEBP_RESTRICT out);
 extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
@ -91,8 +92,8 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform,
                          const uint32_t* const in, uint32_t* const out);
 // Color space conversion.
-typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
+typedef void (*VP8LConvertFunc)(const uint32_t* WEBP_RESTRICT src,
-                                uint8_t* dst);
+                                int num_pixels, uint8_t* WEBP_RESTRICT dst);
 extern VP8LConvertFunc VP8LConvertBGRAToRGB;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
@ -145,29 +146,33 @@ void VP8LDspInit(void);
 typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
 extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
-typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
+typedef void (*VP8LTransformColorFunc)(
-                                       uint32_t* dst, int num_pixels);
+    const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT dst,
    int num_pixels);
 extern VP8LTransformColorFunc VP8LTransformColor;
 typedef void (*VP8LCollectColorBlueTransformsFunc)(
-    const uint32_t* argb, int stride,
+    const uint32_t* WEBP_RESTRICT argb, int stride,
    int tile_width, int tile_height,
    int green_to_blue, int red_to_blue, uint32_t histo[]);
 extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
 typedef void (*VP8LCollectColorRedTransformsFunc)(
-    const uint32_t* argb, int stride,
+    const uint32_t* WEBP_RESTRICT argb, int stride,
    int tile_width, int tile_height,
    int green_to_red, uint32_t histo[]);
 extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
 // Expose some C-only fallback functions
-void VP8LTransformColor_C(const VP8LMultipliers* const m,
+void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m,
-                          uint32_t* data, int num_pixels);
+                          uint32_t* WEBP_RESTRICT data, int num_pixels);
-void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* WEBP_RESTRICT argb_data,
-void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+                                       int num_pixels);
 void VP8LCollectColorRedTransforms_C(const uint32_t* WEBP_RESTRICT argb,
                                     int stride,
                                     int tile_width, int tile_height,
                                     int green_to_red, uint32_t histo[]);
-void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb,
                                      int stride,
                                      int tile_width, int tile_height,
                                      int green_to_blue, int red_to_blue,
                                      uint32_t histo[]);
@ -179,7 +184,8 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
 // Huffman-cost related functions.
 typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length);
-typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
+typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* WEBP_RESTRICT X,
                                         const uint32_t* WEBP_RESTRICT Y,
                                         int length);
 typedef uint64_t (*VP8LCombinedShannonEntropyFunc)(const uint32_t X[256],
                                                   const uint32_t Y[256]);
@ -210,26 +216,30 @@ void VP8LBitEntropyInit(VP8LBitEntropy* const entropy);
 // codec specific heuristics.
 typedef void (*VP8LGetCombinedEntropyUnrefinedFunc)(
    const uint32_t X[], const uint32_t Y[], int length,
-    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats);
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
    VP8LStreaks* WEBP_RESTRICT const stats);
 extern VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
 // Get the entropy for the distribution 'X'.
-typedef void (*VP8LGetEntropyUnrefinedFunc)(const uint32_t X[], int length,
+typedef void (*VP8LGetEntropyUnrefinedFunc)(
-                                            VP8LBitEntropy* const bit_entropy,
+    const uint32_t X[], int length,
-                                            VP8LStreaks* const stats);
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
    VP8LStreaks* WEBP_RESTRICT const stats);
 extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
 void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
                              VP8LBitEntropy* const entropy);
-typedef void (*VP8LAddVectorFunc)(const uint32_t* a, const uint32_t* b,
+typedef void (*VP8LAddVectorFunc)(const uint32_t* WEBP_RESTRICT a,
-                                  uint32_t* out, int size);
+                                  const uint32_t* WEBP_RESTRICT b,
                                  uint32_t* WEBP_RESTRICT out, int size);
 extern VP8LAddVectorFunc VP8LAddVector;
-typedef void (*VP8LAddVectorEqFunc)(const uint32_t* a, uint32_t* out, int size);
+typedef void (*VP8LAddVectorEqFunc)(const uint32_t* WEBP_RESTRICT a,
                                    uint32_t* WEBP_RESTRICT out, int size);
 extern VP8LAddVectorEqFunc VP8LAddVectorEq;
-void VP8LHistogramAdd(const VP8LHistogram* const a,
+void VP8LHistogramAdd(const VP8LHistogram* WEBP_RESTRICT const a,
-                      const VP8LHistogram* const b,
+                      const VP8LHistogram* WEBP_RESTRICT const b,
-                      VP8LHistogram* const out);
+                      VP8LHistogram* WEBP_RESTRICT const out);
 // -----------------------------------------------------------------------------
 // PrefixEncode()
@ -239,11 +249,12 @@ typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1,
 // Returns the first index where array1 and array2 are different.
 extern VP8LVectorMismatchFunc VP8LVectorMismatch;
-typedef void (*VP8LBundleColorMapFunc)(const uint8_t* const row, int width,
+typedef void (*VP8LBundleColorMapFunc)(const uint8_t* WEBP_RESTRICT const row,
-                                       int xbits, uint32_t* dst);
+                                       int width, int xbits,
                                       uint32_t* WEBP_RESTRICT dst);
 extern VP8LBundleColorMapFunc VP8LBundleColorMap;
-void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
+void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row,
-                          uint32_t* dst);
+                          int width, int xbits, uint32_t* WEBP_RESTRICT dst);
 // Must be called before calling any of the above methods.
 void VP8LEncDspInit(void);
--- a/src/dsp/lossless_common.h
+++ b/src/dsp/lossless_common.h
@ -194,15 +194,15 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
 // The predictor is added to the output pixel (which
 // is therefore considered as a residual) to get the final prediction.
-#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD)             \
+#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD)                 \
-static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
+static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper,     \
-                          int num_pixels, uint32_t* out) {           \
+                          int num_pixels, uint32_t* WEBP_RESTRICT out) { \
-  int x;                                                             \
+  int x;                                                                 \
-  assert(upper != NULL);                                             \
+  assert(upper != NULL);                                                 \
-  for (x = 0; x < num_pixels; ++x) {                                 \
+  for (x = 0; x < num_pixels; ++x) {                                     \
-    const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x);       \
+    const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x);           \
-    out[x] = VP8LAddPixels(in[x], pred);                             \
+    out[x] = VP8LAddPixels(in[x], pred);                                 \
-  }                                                                  \
+  }                                                                      \
 }
 #ifdef __cplusplus
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
@ -359,8 +359,8 @@ void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
  entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
 }
-void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
+void VP8LBitsEntropyUnrefined(const uint32_t* WEBP_RESTRICT const array, int n,
-                              VP8LBitEntropy* const entropy) {
+                              VP8LBitEntropy* WEBP_RESTRICT const entropy) {
  int i;
  VP8LBitEntropyInit(entropy);
@ -380,8 +380,10 @@ void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
 }
 static WEBP_INLINE void GetEntropyUnrefinedHelper(
-    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+    uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev,
-    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+    int* WEBP_RESTRICT const i_prev,
    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
    VP8LStreaks* WEBP_RESTRICT const stats) {
  const int streak = i - *i_prev;
  // Gather info for the bit entropy.
@ -403,9 +405,10 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
  *i_prev = i;
 }
-static void GetEntropyUnrefined_C(const uint32_t X[], int length,
+static void GetEntropyUnrefined_C(
-                                  VP8LBitEntropy* const bit_entropy,
+    const uint32_t X[], int length,
-                                  VP8LStreaks* const stats) {
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
    VP8LStreaks* WEBP_RESTRICT const stats) {
  int i;
  int i_prev = 0;
  uint32_t x_prev = X[0];
@ -424,11 +427,10 @@ static void GetEntropyUnrefined_C(const uint32_t X[], int length,
  bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
 }
-static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
+static void GetCombinedEntropyUnrefined_C(
-                                          const uint32_t Y[],
+    const uint32_t X[], const uint32_t Y[], int length,
-                                          int length,
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
-                                          VP8LBitEntropy* const bit_entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats) {
-                                          VP8LStreaks* const stats) {
  int i = 1;
  int i_prev = 0;
  uint32_t xy_prev = X[0] + Y[0];
@ -468,8 +470,8 @@ static WEBP_INLINE int8_t U32ToS8(uint32_t v) {
  return (int8_t)(v & 0xff);
 }
-void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
+void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m,
-                          int num_pixels) {
+                          uint32_t* WEBP_RESTRICT data, int num_pixels) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t argb = data[i];
@ -505,7 +507,8 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
  return (new_blue & 0xff);
 }
-void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+void VP8LCollectColorRedTransforms_C(const uint32_t* WEBP_RESTRICT argb,
                                     int stride,
                                     int tile_width, int tile_height,
                                     int green_to_red, uint32_t histo[]) {
  while (tile_height-- > 0) {
@ -517,7 +520,8 @@ void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
  }
 }
-void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb,
                                      int stride,
                                      int tile_width, int tile_height,
                                      int green_to_blue, int red_to_blue,
                                      uint32_t histo[]) {
@ -544,8 +548,8 @@ static int VectorMismatch_C(const uint32_t* const array1,
 }
 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
-void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
+void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row,
-                          uint32_t* dst) {
+                          int width, int xbits, uint32_t* WEBP_RESTRICT dst) {
  int x;
  if (xbits > 0) {
    const int bit_depth = 1 << (3 - xbits);
@ -576,7 +580,8 @@ static uint32_t ExtraCost_C(const uint32_t* population, int length) {
  return cost;
 }
-static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+static uint32_t ExtraCostCombined_C(const uint32_t* WEBP_RESTRICT X,
                                    const uint32_t* WEBP_RESTRICT Y,
                                    int length) {
  int i;
  uint32_t cost = X[4] + Y[4] + X[5] + Y[5];
@ -591,13 +596,15 @@ static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
 //------------------------------------------------------------------------------
-static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out,
+static void AddVector_C(const uint32_t* WEBP_RESTRICT a,
-                        int size) {
+                        const uint32_t* WEBP_RESTRICT b,
                        uint32_t* WEBP_RESTRICT out, int size) {
  int i;
  for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
 }
-static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
+static void AddVectorEq_C(const uint32_t* WEBP_RESTRICT a,
                          uint32_t* WEBP_RESTRICT out, int size) {
  int i;
  for (i = 0; i < size; ++i) out[i] += a[i];
 }
@ -626,8 +633,9 @@ static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
  }                                                                            \
 } while (0)
-void VP8LHistogramAdd(const VP8LHistogram* const a,
+void VP8LHistogramAdd(const VP8LHistogram* WEBP_RESTRICT const a,
-                      const VP8LHistogram* const b, VP8LHistogram* const out) {
+                      const VP8LHistogram* WEBP_RESTRICT const b,
                      VP8LHistogram* WEBP_RESTRICT const out) {
  int i;
  const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
  assert(a->palette_code_bits_ == b->palette_code_bits_);
@ -657,14 +665,14 @@ void VP8LHistogramAdd(const VP8LHistogram* const a,
 // Image transforms.
 static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
-                            int num_pixels, uint32_t* out) {
+                            int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
  (void)upper;
 }
 static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
-                            int num_pixels, uint32_t* out) {
+                            int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
  (void)upper;
@ -675,7 +683,8 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_SUB(PREDICTOR_I)                                \
 static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in,              \
                                          const uint32_t* upper,           \
-                                          int num_pixels, uint32_t* out) { \
+                                          int num_pixels,                  \
                                          uint32_t* WEBP_RESTRICT out) {   \
  int x;                                                                   \
  assert(upper != NULL);                                                   \
  for (x = 0; x < num_pixels; ++x) {                                       \
--- a/src/dsp/lossless_enc_mips32.c
+++ b/src/dsp/lossless_enc_mips32.c
@ -149,8 +149,9 @@ static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) {
 //     pY += 2;
 //   }
 //   return cost;
-static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X,
+static uint32_t ExtraCostCombined_MIPS32(const uint32_t* WEBP_RESTRICT const X,
-                                         const uint32_t* const Y, int length) {
+                                         const uint32_t* WEBP_RESTRICT const Y,
                                         int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
@ -215,8 +216,10 @@ static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X,
 // Returns the various RLE counts
 static WEBP_INLINE void GetEntropyUnrefinedHelper(
-    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+    uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev,
-    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+    int* WEBP_RESTRICT const i_prev,
    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
    VP8LStreaks* WEBP_RESTRICT const stats) {
  int* const pstreaks = &stats->streaks[0][0];
  int* const pcnts = &stats->counts[0];
  int temp0, temp1, temp2, temp3;
@ -241,9 +244,10 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
  *i_prev = i;
 }
-static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
+static void GetEntropyUnrefined_MIPS32(
-                                       VP8LBitEntropy* const bit_entropy,
+    const uint32_t X[], int length,
-                                       VP8LStreaks* const stats) {
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
    VP8LStreaks* WEBP_RESTRICT const stats) {
  int i;
  int i_prev = 0;
  uint32_t x_prev = X[0];
@ -262,11 +266,10 @@ static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
  bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
 }
-static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
+static void GetCombinedEntropyUnrefined_MIPS32(
-                                               const uint32_t Y[],
+    const uint32_t X[], const uint32_t Y[], int length,
-                                               int length,
+    VP8LBitEntropy* WEBP_RESTRICT const entropy,
-                                               VP8LBitEntropy* const entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats) {
-                                               VP8LStreaks* const stats) {
  int i = 1;
  int i_prev = 0;
  uint32_t xy_prev = X[0] + Y[0];
@ -344,8 +347,9 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
    ASM_END_COMMON_0                                    \
    ASM_END_COMMON_1
-static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
+static void AddVector_MIPS32(const uint32_t* WEBP_RESTRICT pa,
-                             uint32_t* pout, int size) {
+                             const uint32_t* WEBP_RESTRICT pb,
                             uint32_t* WEBP_RESTRICT pout, int size) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const int end = ((size) / 4) * 4;
  const uint32_t* const LoopEnd = pa + end;
@ -356,7 +360,8 @@ static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
  for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i];
 }
-static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
+static void AddVectorEq_MIPS32(const uint32_t* WEBP_RESTRICT pa,
                               uint32_t* WEBP_RESTRICT pout, int size) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const int end = ((size) / 4) * 4;
  const uint32_t* const LoopEnd = pa + end;
--- a/src/dsp/lossless_enc_mips_dsp_r2.c
+++ b/src/dsp/lossless_enc_mips_dsp_r2.c
@ -78,8 +78,9 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
  return (uint32_t)((int)(color_pred) * color) >> 5;
 }
-static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
+static void TransformColor_MIPSdspR2(
-                                     uint32_t* data, int num_pixels) {
+    const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT data,
    int num_pixels) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  uint32_t argb, argb1, new_red, new_red1;
  const uint32_t G_to_R = m->green_to_red_;
@ -172,7 +173,8 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
 }
 static void CollectColorBlueTransforms_MIPSdspR2(
-    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    const uint32_t* WEBP_RESTRICT argb, int stride,
    int tile_width, int tile_height,
    int green_to_blue, int red_to_blue, uint32_t histo[]) {
  const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
  const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
@ -221,11 +223,9 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
  return (new_red & 0xff);
 }
-static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
+static void CollectColorRedTransforms_MIPSdspR2(
-                                                int stride, int tile_width,
+    const uint32_t* WEBP_RESTRICT argb, int stride,
-                                                int tile_height,
+    int tile_width, int tile_height, int green_to_red, uint32_t histo[]) {
-                                                int green_to_red,
-                                                uint32_t histo[]) {
  const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
  while (tile_height-- > 0) {
    int x;
--- a/src/dsp/lossless_enc_msa.c
+++ b/src/dsp/lossless_enc_msa.c
@ -48,8 +48,8 @@
  dst = VSHF_UB(src, t0, mask1);                                \
 } while (0)
-static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
+static void TransformColor_MSA(const VP8LMultipliers* WEBP_RESTRICT const m,
-                               int num_pixels) {
+                               uint32_t* WEBP_RESTRICT data, int num_pixels) {
  v16u8 src0, dst0;
  const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
                                         (m->green_to_red_ << 16));
--- a/src/dsp/lossless_enc_neon.c
+++ b/src/dsp/lossless_enc_neon.c
@ -72,8 +72,9 @@ static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
 //------------------------------------------------------------------------------
 // Color Transform
-static void TransformColor_NEON(const VP8LMultipliers* const m,
+static void TransformColor_NEON(const VP8LMultipliers* WEBP_RESTRICT const m,
-                                uint32_t* argb_data, int num_pixels) {
+                                uint32_t* WEBP_RESTRICT argb_data,
                                int num_pixels) {
  // sign-extended multiplying constants, pre-shifted by 6.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 6)
  const int16_t rb[8] = {
--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@ -49,8 +49,9 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
 #define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
-static void TransformColor_SSE2(const VP8LMultipliers* const m,
+static void TransformColor_SSE2(const VP8LMultipliers* WEBP_RESTRICT const m,
-                                uint32_t* argb_data, int num_pixels) {
+                                uint32_t* WEBP_RESTRICT argb_data,
                                int num_pixels) {
  const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
                                     CST_5b(m->green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
@ -79,7 +80,8 @@ static void TransformColor_SSE2(const VP8LMultipliers* const m,
 //------------------------------------------------------------------------------
 #define SPAN 8
-static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
+static void CollectColorBlueTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
                                            int stride,
                                            int tile_width, int tile_height,
                                            int green_to_blue, int red_to_blue,
                                            uint32_t histo[]) {
@ -126,7 +128,8 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
  }
 }
-static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
+static void CollectColorRedTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
                                           int stride,
                                           int tile_width, int tile_height,
                                           int green_to_red, uint32_t histo[]) {
  const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
@ -173,8 +176,9 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
 // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
 // that's ok since the histogram values are less than 1<<28 (max picture size).
 #define LINE_SIZE 16    // 8 or 16
-static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
+static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
-                           int size) {
+                           const uint32_t* WEBP_RESTRICT b,
                           uint32_t* WEBP_RESTRICT out, int size) {
  int i;
  for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
@ -201,7 +205,8 @@ static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
  }
 }
-static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
+static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
                             uint32_t* WEBP_RESTRICT out, int size) {
  int i;
  for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
@ -333,8 +338,9 @@ static int VectorMismatch_SSE2(const uint32_t* const array1,
 }
 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
-static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
+static void BundleColorMap_SSE2(const uint8_t* WEBP_RESTRICT const row,
-                                uint32_t* dst) {
+                                int width, int xbits,
                                uint32_t* WEBP_RESTRICT dst) {
  int x;
  assert(xbits >= 0);
  assert(xbits <= 3);
@ -423,7 +429,7 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
 // Predictor0: ARGB_BLACK.
 static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -440,7 +446,8 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_1(X, IN)                                         \
  static void PredictorSub##X##_SSE2(const uint32_t* const in,              \
                                     const uint32_t* const upper,           \
-                                     int num_pixels, uint32_t* const out) { \
+                                     int num_pixels,                        \
                                     uint32_t* WEBP_RESTRICT const out) {   \
    int i;                                                                  \
    for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
      const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);          \
@ -462,7 +469,7 @@ GENERATE_PREDICTOR_1(4, upper[i - 1])    // Predictor4: TL
 // Predictor5: avg2(avg2(L, TR), T)
 static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@ -482,7 +489,8 @@ static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_2(X, A, B)                                         \
 static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels, uint32_t* out) {           \
+                                   int num_pixels,                            \
                                   uint32_t* WEBP_RESTRICT out) {             \
  int i;                                                                      \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    const __m128i tA = _mm_loadu_si128((const __m128i*)&(A));                 \
@ -506,7 +514,7 @@ GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1])    // Predictor9: average(T, TR)
 // Predictor10: avg(avg(L,TL), avg(T, TR)).
 static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@ -541,7 +549,7 @@ static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
 }
 static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@ -567,7 +575,7 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
 // Predictor12: ClampedAddSubtractFull.
 static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -596,7 +604,7 @@ static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
 // Predictor13: ClampedAddSubtractHalf
 static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 2 <= num_pixels; i += 2) {
--- a/src/dsp/lossless_enc_sse41.c
+++ b/src/dsp/lossless_enc_sse41.c
@ -44,8 +44,9 @@ static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) {
  return HorizontalSum_SSE41(cost);
 }
-static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a,
+static uint32_t ExtraCostCombined_SSE41(const uint32_t* WEBP_RESTRICT const a,
-                                        const uint32_t* const b, int length) {
+                                        const uint32_t* WEBP_RESTRICT const b,
                                        int length) {
  int i;
  __m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]),
                               _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4]));
@ -95,7 +96,8 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
 #define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
-static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
+static void CollectColorBlueTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb,
                                             int stride,
                                             int tile_width, int tile_height,
                                             int green_to_blue, int red_to_blue,
                                             uint32_t histo[]) {
@ -141,7 +143,8 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
  }
 }
-static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
+static void CollectColorRedTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb,
                                            int stride,
                                            int tile_width, int tile_height,
                                            int green_to_red,
                                            uint32_t histo[]) {
--- a/src/dsp/lossless_neon.c
+++ b/src/dsp/lossless_neon.c
@ -26,8 +26,8 @@
 #if !defined(WORK_AROUND_GCC)
 // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it with
 // gcc 4.8.x or later.
-static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
+static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
-                                   int num_pixels, uint8_t* dst) {
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
 }
-static void ConvertBGRAToBGR_NEON(const uint32_t* src,
+static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
-                                  int num_pixels, uint8_t* dst) {
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -53,8 +53,8 @@ static void ConvertBGRAToBGR_NEON(const uint32_t* src,
  VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
 }
-static void ConvertBGRAToRGB_NEON(const uint32_t* src,
+static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
-                                  int num_pixels, uint8_t* dst) {
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -71,8 +71,8 @@ static void ConvertBGRAToRGB_NEON(const uint32_t* src,
 static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
-static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
+static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
-                                   int num_pixels, uint8_t* dst) {
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~1);
  const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
  for (; src < end; src += 2) {
@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = {
  { 21, 22, 24, 25, 26, 28, 29, 30 }
 };
-static void ConvertBGRAToBGR_NEON(const uint32_t* src,
+static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
-                                  int num_pixels, uint8_t* dst) {
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~7);
  const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
  const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = {
  { 21, 20, 26, 25, 24, 30, 29, 28 }
 };
-static void ConvertBGRAToRGB_NEON(const uint32_t* src,
+static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
-                                  int num_pixels, uint8_t* dst) {
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const uint32_t* const end = src + (num_pixels & ~7);
  const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
  const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
@ -209,7 +209,7 @@ static uint32_t Predictor13_NEON(const uint32_t* const left,
 // Predictor0: ARGB_BLACK.
 static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK));
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -222,7 +222,7 @@ static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
 // Predictor1: left.
 static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const uint8x16_t zero = LOADQ_U32_AS_U8(0);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -248,7 +248,7 @@ static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_1(X, IN)                                       \
 static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
                                   const uint32_t* upper, int num_pixels, \
-                                   uint32_t* out) {                       \
+                                   uint32_t* WEBP_RESTRICT out) {         \
  int i;                                                                  \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
@ -276,7 +276,7 @@ GENERATE_PREDICTOR_1(4, upper[i - 1])
 } while (0)
 static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -301,7 +301,7 @@ static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
 // Predictor6: average(left, TL)
 static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -317,7 +317,7 @@ static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
 // Predictor7: average(left, T)
 static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -335,7 +335,7 @@ static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_2(X, IN)                                       \
 static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
                                   const uint32_t* upper, int num_pixels, \
-                                   uint32_t* out) {                       \
+                                   uint32_t* WEBP_RESTRICT out) {         \
  int i;                                                                  \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
@ -363,7 +363,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 } while (0)
 static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -394,7 +394,7 @@ static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
 } while (0)
 static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -427,7 +427,7 @@ static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
 } while (0)
 static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -468,7 +468,7 @@ static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
 } while (0)
 static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@ -186,7 +186,7 @@ static uint32_t Predictor13_SSE2(const uint32_t* const left,
 // Predictor0: ARGB_BLACK.
 static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -202,7 +202,7 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
 // Predictor1: left.
 static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i prev = _mm_set1_epi32((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -230,7 +230,8 @@ static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
 // per 8 bit channel.
 #define GENERATE_PREDICTOR_1(X, IN)                                           \
 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                  int num_pixels, uint32_t* out) {            \
+                                   int num_pixels,                            \
+                                   uint32_t* WEBP_RESTRICT out) {             \
  int i;                                                                      \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
@ -259,7 +260,8 @@ GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)
 #define GENERATE_PREDICTOR_2(X, IN)                                           \
 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels, uint32_t* out) {           \
+                                   int num_pixels,                            \
+                                   uint32_t* WEBP_RESTRICT out) {             \
  int i;                                                                      \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));            \
@ -297,7 +299,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 } while (0)
 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
@ -344,7 +346,7 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
 } while (0)
 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i pa;
  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
@ -395,7 +397,7 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
 } while (0)
 static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
@ -490,8 +492,8 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
 //------------------------------------------------------------------------------
 // Color-space conversion functions
-static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
+static void ConvertBGRAToRGB_SSE2(const uint32_t* WEBP_RESTRICT src,
-                                  uint8_t* dst) {
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
@ -526,8 +528,8 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
  }
 }
-static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
+static void ConvertBGRAToRGBA_SSE2(const uint32_t* WEBP_RESTRICT src,
-                                   int num_pixels, uint8_t* dst) {
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
@ -554,8 +556,9 @@ static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
  }
 }
-static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
+static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* WEBP_RESTRICT src,
-                                       int num_pixels, uint8_t* dst) {
+                                       int num_pixels,
+                                       uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
  const __m128i* in = (const __m128i*)src;
@ -590,8 +593,9 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
  }
 }
-static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
+static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src,
-                                     int num_pixels, uint8_t* dst) {
+                                     int num_pixels,
+                                     uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
@ -631,8 +635,8 @@ static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
  }
 }
-static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
+static void ConvertBGRAToBGR_SSE2(const uint32_t* WEBP_RESTRICT src,
-                                  int num_pixels, uint8_t* dst) {
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
--- a/src/dsp/lossless_sse41.c
+++ b/src/dsp/lossless_sse41.c
@ -77,8 +77,8 @@ static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
  }                                                   \
 } while (0)
-static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
+static void ConvertBGRAToRGB_SSE41(const uint32_t* WEBP_RESTRICT src,
-                                   uint8_t* dst) {
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
@ -95,8 +95,8 @@ static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
  }
 }
-static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
+static void ConvertBGRAToBGR_SSE41(const uint32_t* WEBP_RESTRICT src,
-                                   int num_pixels, uint8_t* dst) {
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
--- a/src/dsp/rescaler.c
+++ b/src/dsp/rescaler.c
@ -26,8 +26,8 @@
 //------------------------------------------------------------------------------
 // Row import
-void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
+void WebPRescalerImportRowExpand_C(WebPRescaler* WEBP_RESTRICT const wrk,
-                                   const uint8_t* src) {
+                                   const uint8_t* WEBP_RESTRICT src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  int channel;
@ -59,8 +59,8 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
  }
 }
-void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
+void WebPRescalerImportRowShrink_C(WebPRescaler* WEBP_RESTRICT const wrk,
-                                   const uint8_t* src) {
+                                   const uint8_t* WEBP_RESTRICT src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  int channel;
@ -158,7 +158,8 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
 //------------------------------------------------------------------------------
 // Main entry calls
-void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
+void WebPRescalerImportRow(WebPRescaler* WEBP_RESTRICT const wrk,
+                           const uint8_t* WEBP_RESTRICT src) {
  assert(!WebPRescalerInputDone(wrk));
  if (!wrk->x_expand) {
    WebPRescalerImportRowShrink(wrk, src);
--- a/src/dsp/rescaler_mips32.c
+++ b/src/dsp/rescaler_mips32.c
@ -21,8 +21,8 @@
 //------------------------------------------------------------------------------
 // Row import
-static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
+static void ImportRowShrink_MIPS32(WebPRescaler* WEBP_RESTRICT const wrk,
-                                   const uint8_t* src) {
+                                   const uint8_t* WEBP_RESTRICT src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const int fx_scale = wrk->fx_scale;
@ -81,8 +81,8 @@ static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
  }
 }
-static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
+static void ImportRowExpand_MIPS32(WebPRescaler* WEBP_RESTRICT const wrk,
-                                   const uint8_t* src) {
+                                   const uint8_t* WEBP_RESTRICT src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const int x_add = wrk->x_add;
--- a/src/dsp/rescaler_msa.c
+++ b/src/dsp/rescaler_msa.c
@ -114,9 +114,9 @@
  dst = __msa_copy_s_w((v4i32)t0, 0);                             \
 } while (0)
-static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
+static WEBP_INLINE void ExportRowExpand_0(
-                                          int length,
+    const uint32_t* WEBP_RESTRICT frow, uint8_t* WEBP_RESTRICT dst, int length,
-                                          WebPRescaler* const wrk) {
+    WebPRescaler* WEBP_RESTRICT const wrk) {
  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
  const v4i32 zero = { 0 };
@ -171,9 +171,10 @@ static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
  }
 }
-static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
+static WEBP_INLINE void ExportRowExpand_1(
-                                          uint8_t* dst, int length,
+    const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow,
-                                          WebPRescaler* const wrk) {
+    uint8_t* WEBP_RESTRICT dst, int length,
+    WebPRescaler* WEBP_RESTRICT const wrk) {
  const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
  const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
  const v4i32 B1 = __msa_fill_w(B);
@ -262,10 +263,10 @@ static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
 }
 #if 0  // disabled for now. TODO(skal): make match the C-code
-static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
+static WEBP_INLINE void ExportRowShrink_0(
-                                          uint8_t* dst, int length,
+    const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow,
-                                          const uint32_t yscale,
+    uint8_t* WEBP_RESTRICT dst, int length, const uint32_t yscale,
-                                          WebPRescaler* const wrk) {
+    WebPRescaler* WEBP_RESTRICT const wrk) {
  const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
  const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
  const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
@ -348,9 +349,9 @@ static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
  }
 }
-static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
+static WEBP_INLINE void ExportRowShrink_1(
-                                          int length,
+    uint32_t* WEBP_RESTRICT irow, uint8_t* WEBP_RESTRICT dst, int length,
-                                          WebPRescaler* const wrk) {
+    WebPRescaler* WEBP_RESTRICT const wrk) {
  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
  const v4i32 zero = { 0 };
--- a/src/dsp/rescaler_neon.c
+++ b/src/dsp/rescaler_neon.c
@ -45,8 +45,8 @@
 #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
 #endif
-static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
+static uint32x4_t Interpolate_NEON(const rescaler_t* WEBP_RESTRICT const frow,
-                                   const rescaler_t* const irow,
+                                   const rescaler_t* WEBP_RESTRICT const irow,
                                   uint32_t A, uint32_t B) {
  LOAD_32x4(frow, A0);
  LOAD_32x4(irow, B0);
--- a/src/dsp/rescaler_sse2.c
+++ b/src/dsp/rescaler_sse2.c
@ -43,8 +43,8 @@ static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) {
  *out = _mm_unpacklo_epi8(A, zero);
 }
-static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
+static void RescalerImportRowExpand_SSE2(WebPRescaler* WEBP_RESTRICT const wrk,
-                                         const uint8_t* src) {
+                                         const uint8_t* WEBP_RESTRICT src) {
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
  const int x_add = wrk->x_add;
@ -109,8 +109,8 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
  assert(accum == 0);
 }
-static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
+static void RescalerImportRowShrink_SSE2(WebPRescaler* WEBP_RESTRICT const wrk,
-                                         const uint8_t* src) {
+                                         const uint8_t* WEBP_RESTRICT src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
@ -168,12 +168,10 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
 // Row export
 // load *src as epi64, multiply by mult and store result in [out0 ... out3]
-static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
+static WEBP_INLINE void LoadDispatchAndMult_SSE2(
-                                                 const __m128i* const mult,
+    const rescaler_t* WEBP_RESTRICT const src, const __m128i* const mult,
-                                                 __m128i* const out0,
+    __m128i* const out0, __m128i* const out1, __m128i* const out2,
-                                                 __m128i* const out1,
+    __m128i* const out3) {
-                                                 __m128i* const out2,
-                                                 __m128i* const out3) {
  const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
  const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
  const __m128i A2 = _mm_srli_epi64(A0, 32);
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@ -35,10 +35,14 @@ WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
 #define LOAD_UV(u, v) ((u) | ((v) << 16))
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
-                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
+                      const uint8_t* WEBP_RESTRICT cur_u,                      \
+                      const uint8_t* WEBP_RESTRICT cur_v,                      \
+                      uint8_t* WEBP_RESTRICT top_dst,                          \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
  int x;                                                                       \
  const int last_pixel_pair = (len - 1) >> 1;                                  \
  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
@ -136,10 +140,14 @@ static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
 #if !defined(FANCY_UPSAMPLING)
 #define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC)                                      \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* WEBP_RESTRICT bot_y,                      \
-                      const uint8_t* bot_u, const uint8_t* bot_v,              \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
-                      uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
+                      const uint8_t* WEBP_RESTRICT bot_u,                      \
+                      const uint8_t* WEBP_RESTRICT bot_v,                      \
+                      uint8_t* WEBP_RESTRICT top_dst,                          \
+                      uint8_t* WEBP_RESTRICT bot_dst, int len) {               \
  const int half_len = len >> 1;                                               \
  int x;                                                                       \
  assert(top_dst != NULL);                                                     \
@ -178,10 +186,14 @@ WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
 // YUV444 converter
 #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
-extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+extern void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
-                      uint8_t* dst, int len);                                  \
+                      const uint8_t* WEBP_RESTRICT u,                          \
-void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,           \
+                      const uint8_t* WEBP_RESTRICT v,                          \
-               uint8_t* dst, int len) {                                        \
+                      uint8_t* WEBP_RESTRICT dst, int len);                    \
+void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                                 \
+               const uint8_t* WEBP_RESTRICT u,                                 \
+               const uint8_t* WEBP_RESTRICT v,                                 \
+               uint8_t* WEBP_RESTRICT dst, int len) {                          \
  int i;                                                                       \
  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]);         \
 }
--- a/src/dsp/upsampling_mips_dsp_r2.c
+++ b/src/dsp/upsampling_mips_dsp_r2.c
@ -143,10 +143,14 @@ static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 #define LOAD_UV(u, v) ((u) | ((v) << 16))
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
-                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
+                      const uint8_t* WEBP_RESTRICT cur_u,                      \
+                      const uint8_t* WEBP_RESTRICT cur_v,                      \
+                      uint8_t* WEBP_RESTRICT top_dst,                          \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
  int x;                                                                       \
  const int last_pixel_pair = (len - 1) >> 1;                                  \
  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
@ -241,8 +245,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
 // YUV444 converter
 #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
-                      uint8_t* dst, int len) {                                 \
+                      const uint8_t* WEBP_RESTRICT u,                          \
+                      const uint8_t* WEBP_RESTRICT v,                          \
+                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
  int i;                                                                       \
  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
 }
--- a/src/dsp/upsampling_msa.c
+++ b/src/dsp/upsampling_msa.c
@ -320,8 +320,10 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
 }
 #if !defined(WEBP_REDUCE_CSP)
-static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
+static void YuvToRgbLine(const uint8_t* WEBP_RESTRICT y,
-                         const uint8_t* v, uint8_t* dst, int length) {
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst, int length) {
  v16u8 R, G, B;
  while (length >= 16) {
    CALC_RGB16(y, u, v, R, G, B);
@ -347,8 +349,10 @@ static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
  }
 }
-static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
+static void YuvToBgrLine(const uint8_t* WEBP_RESTRICT y,
-                         const uint8_t* v, uint8_t* dst, int length) {
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst, int length) {
  v16u8 R, G, B;
  while (length >= 16) {
    CALC_RGB16(y, u, v, R, G, B);
@ -375,8 +379,10 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
 }
 #endif  // WEBP_REDUCE_CSP
-static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
+static void YuvToRgbaLine(const uint8_t* WEBP_RESTRICT y,
-                          const uint8_t* v, uint8_t* dst, int length) {
+                          const uint8_t* WEBP_RESTRICT u,
+                          const uint8_t* WEBP_RESTRICT v,
+                          uint8_t* WEBP_RESTRICT dst, int length) {
  v16u8 R, G, B;
  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
  while (length >= 16) {
@ -403,8 +409,10 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
  }
 }
-static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
+static void YuvToBgraLine(const uint8_t* WEBP_RESTRICT y,
-                          const uint8_t* v, uint8_t* dst, int length) {
+                          const uint8_t* WEBP_RESTRICT u,
+                          const uint8_t* WEBP_RESTRICT v,
+                          uint8_t* WEBP_RESTRICT dst, int length) {
  v16u8 R, G, B;
  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
  while (length >= 16) {
@ -432,8 +440,10 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
 }
 #if !defined(WEBP_REDUCE_CSP)
-static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
+static void YuvToArgbLine(const uint8_t* WEBP_RESTRICT y,
-                          const uint8_t* v, uint8_t* dst, int length) {
+                          const uint8_t* WEBP_RESTRICT u,
+                          const uint8_t* WEBP_RESTRICT v,
+                          uint8_t* WEBP_RESTRICT dst, int length) {
  v16u8 R, G, B;
  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
  while (length >= 16) {
@ -460,8 +470,10 @@ static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
  }
 }
-static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
+static void YuvToRgba4444Line(const uint8_t* WEBP_RESTRICT y,
-                              const uint8_t* v, uint8_t* dst, int length) {
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int length) {
  v16u8 R, G, B, RG, BA, tmp0, tmp1;
  while (length >= 16) {
 #if (WEBP_SWAP_16BIT_CSP == 1)
@ -496,8 +508,10 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
  }
 }
-static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
+static void YuvToRgb565Line(const uint8_t* WEBP_RESTRICT y,
-                            const uint8_t* v, uint8_t* dst, int length) {
+                            const uint8_t* WEBP_RESTRICT u,
+                            const uint8_t* WEBP_RESTRICT v,
+                            uint8_t* WEBP_RESTRICT dst, int length) {
  v16u8 R, G, B, RG, GB, tmp0, tmp1;
  while (length >= 16) {
 #if (WEBP_SWAP_16BIT_CSP == 1)
@ -564,11 +578,14 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
 } while (0)
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                            \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,        \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                \
-                      const uint8_t* top_u, const uint8_t* top_v,        \
+                      const uint8_t* WEBP_RESTRICT bot_y,                \
-                      const uint8_t* cur_u, const uint8_t* cur_v,        \
+                      const uint8_t* WEBP_RESTRICT top_u,                \
-                      uint8_t* top_dst, uint8_t* bot_dst, int len)       \
+                      const uint8_t* WEBP_RESTRICT top_v,                \
-{                                                                        \
+                      const uint8_t* WEBP_RESTRICT cur_u,                \
                      const uint8_t* WEBP_RESTRICT cur_v,                \
                      uint8_t* WEBP_RESTRICT top_dst,                    \
                      uint8_t* WEBP_RESTRICT bot_dst, int len) {         \
  int size = (len - 1) >> 1;                                             \
  uint8_t temp_u[64];                                                    \
  uint8_t temp_v[64];                                                    \
--- a/src/dsp/upsampling_neon.c
+++ b/src/dsp/upsampling_neon.c
@ -58,8 +58,9 @@
 } while (0)
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2,
+static void Upsample16Pixels_NEON(const uint8_t* WEBP_RESTRICT const r1,
-                                  uint8_t* out) {
+                                  const uint8_t* WEBP_RESTRICT const r2,
                                  uint8_t* WEBP_RESTRICT const out) {
  UPSAMPLE_16PIXELS(r1, r2, out);
 }
@ -190,10 +191,14 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
 }
 #define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                              \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
-                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
                      const uint8_t* WEBP_RESTRICT cur_u,                      \
                      const uint8_t* WEBP_RESTRICT cur_v,                      \
                      uint8_t* WEBP_RESTRICT top_dst,                          \
                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
  int block;                                                                   \
  /* 16 byte aligned array to cache reconstructed u and v */                   \
  uint8_t uv_buf[2 * 32 + 15];                                                 \
--- a/src/dsp/upsampling_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@ -88,8 +88,9 @@
 } while (0)
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
+static void Upsample32Pixels_SSE2(const uint8_t* WEBP_RESTRICT const r1,
-                                  uint8_t* const out) {
+                                  const uint8_t* WEBP_RESTRICT const r2,
                                  uint8_t* WEBP_RESTRICT const out) {
  UPSAMPLE_32PIXELS(r1, r2, out);
 }
@ -114,10 +115,14 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
 } while (0)
 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
-                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
                      const uint8_t* WEBP_RESTRICT cur_u,                      \
                      const uint8_t* WEBP_RESTRICT cur_v,                      \
                      uint8_t* WEBP_RESTRICT top_dst,                          \
                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
  int uv_pos, pos;                                                             \
  /* 16byte-aligned array to cache reconstructed u and v */                    \
  uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
@ -215,10 +220,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern void WebPInitYUV444ConvertersSSE2(void);
 #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
-extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v,       \
+extern void CALL_C(const uint8_t* WEBP_RESTRICT y,                             \
-                   uint8_t* dst, int len);                                     \
+                   const uint8_t* WEBP_RESTRICT u,                             \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+                   const uint8_t* WEBP_RESTRICT v,                             \
-                      uint8_t* dst, int len) {                                 \
+                   uint8_t* WEBP_RESTRICT dst, int len);                       \
 static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
                      const uint8_t* WEBP_RESTRICT u,                          \
                      const uint8_t* WEBP_RESTRICT v,                          \
                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
  int i;                                                                       \
  const int max_len = len & ~31;                                               \
  for (i = 0; i < max_len; i += 32) {                                          \
--- a/src/dsp/upsampling_sse41.c
+++ b/src/dsp/upsampling_sse41.c
@ -90,8 +90,9 @@
 } while (0)
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
+static void Upsample32Pixels_SSE41(const uint8_t* WEBP_RESTRICT const r1,
-                                  uint8_t* const out) {
+                                   const uint8_t* WEBP_RESTRICT const r2,
                                   uint8_t* WEBP_RESTRICT const out) {
  UPSAMPLE_32PIXELS(r1, r2, out);
 }
@ -116,10 +117,14 @@ static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
 } while (0)
 #define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
-                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
                      const uint8_t* WEBP_RESTRICT cur_u,                      \
                      const uint8_t* WEBP_RESTRICT cur_v,                      \
                      uint8_t* WEBP_RESTRICT top_dst,                          \
                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
  int uv_pos, pos;                                                             \
  /* 16byte-aligned array to cache reconstructed u and v */                    \
  uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
@ -202,10 +207,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern void WebPInitYUV444ConvertersSSE41(void);
 #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
-extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v,       \
+extern void CALL_C(const uint8_t* WEBP_RESTRICT y,                             \
-                   uint8_t* dst, int len);                                     \
+                   const uint8_t* WEBP_RESTRICT u,                             \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+                   const uint8_t* WEBP_RESTRICT v,                             \
-                      uint8_t* dst, int len) {                                 \
+                   uint8_t* WEBP_RESTRICT dst, int len);                       \
 static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
                      const uint8_t* WEBP_RESTRICT u,                          \
                      const uint8_t* WEBP_RESTRICT v,                          \
                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
  int i;                                                                       \
  const int max_len = len & ~31;                                               \
  for (i = 0; i < max_len; i += 32) {                                          \
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@ -20,9 +20,10 @@
 // Plain-C version
 #define ROW_FUNC(FUNC_NAME, FUNC, XSTEP)                                       \
-static void FUNC_NAME(const uint8_t* y,                                        \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
-                      const uint8_t* u, const uint8_t* v,                      \
+                      const uint8_t* WEBP_RESTRICT u,                          \
-                      uint8_t* dst, int len) {                                 \
+                      const uint8_t* WEBP_RESTRICT v,                          \
                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
  const uint8_t* const end = dst + (len & ~1) * (XSTEP);                       \
  while (dst != end) {                                                         \
    FUNC(y[0], u[0], v[0], dst);                                               \
@ -49,9 +50,10 @@ ROW_FUNC(YuvToRgb565Row,   VP8YuvToRgb565, 2)
 #undef ROW_FUNC
 // Main call for processing a plane with a WebPSamplerRowFunc function:
-void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
+void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
-                             const uint8_t* u, const uint8_t* v, int uv_stride,
+                             const uint8_t* WEBP_RESTRICT u,
-                             uint8_t* dst, int dst_stride,
+                             const uint8_t* WEBP_RESTRICT v, int uv_stride,
                             uint8_t* WEBP_RESTRICT dst, int dst_stride,
                             int width, int height, WebPSamplerRowFunc func) {
  int j;
  for (j = 0; j < height; ++j) {
@ -117,7 +119,8 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
 //-----------------------------------------------------------------------------
 // ARGB -> YUV converters
-static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_C(const uint32_t* WEBP_RESTRICT argb,
                             uint8_t* WEBP_RESTRICT y, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint32_t p = argb[i];
@ -126,7 +129,8 @@ static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
  }
 }
-void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
                           uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                           int src_width, int do_store) {
  // No rounding. Last pixel is dealt with separately.
  const int uv_width = src_width >> 1;
@ -169,22 +173,25 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
 //-----------------------------------------------------------------------------
-static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_C(const uint8_t* WEBP_RESTRICT rgb,
                              uint8_t* WEBP_RESTRICT y, int width) {
  int i;
  for (i = 0; i < width; ++i, rgb += 3) {
    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
  }
 }
-static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_C(const uint8_t* WEBP_RESTRICT bgr,
                              uint8_t* WEBP_RESTRICT y, int width) {
  int i;
  for (i = 0; i < width; ++i, bgr += 3) {
    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
  }
 }
-void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
+void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
-                             uint8_t* u, uint8_t* v, int width) {
+                             uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                             int width) {
  int i;
  for (i = 0; i < width; i += 1, rgb += 4) {
    const int r = rgb[0], g = rgb[1], b = rgb[2];
@ -195,13 +202,18 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
 //-----------------------------------------------------------------------------
-void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
+void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
-void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
+                            uint8_t* WEBP_RESTRICT y, int width);
-void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
+void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
-                              uint8_t* u, uint8_t* v, int width);
+                            uint8_t* WEBP_RESTRICT y, int width);
 void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
                              uint8_t* WEBP_RESTRICT u,
                              uint8_t* WEBP_RESTRICT v, int width);
-void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
+void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
-void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+                           uint8_t* WEBP_RESTRICT y, int width);
 void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
                            uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                            int src_width, int do_store);
 extern void WebPInitConvertARGBToYUVSSE2(void);
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@ -149,20 +149,34 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 #if defined(WEBP_USE_SSE2)
 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
-                         uint8_t* dst);
+                         const uint8_t* WEBP_RESTRICT u,
-void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         const uint8_t* WEBP_RESTRICT v,
-                        uint8_t* dst);
+                         uint8_t* WEBP_RESTRICT dst);
-void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
-                         uint8_t* dst);
+                        const uint8_t* WEBP_RESTRICT u,
-void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        const uint8_t* WEBP_RESTRICT v,
-                        uint8_t* dst);
+                        uint8_t* WEBP_RESTRICT dst);
-void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
-                         uint8_t* dst);
+                         const uint8_t* WEBP_RESTRICT u,
-void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
+                         const uint8_t* WEBP_RESTRICT v,
-                             const uint8_t* v, uint8_t* dst);
+                         uint8_t* WEBP_RESTRICT dst);
-void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
-                           uint8_t* dst);
+                        const uint8_t* WEBP_RESTRICT u,
                        const uint8_t* WEBP_RESTRICT v,
                        uint8_t* WEBP_RESTRICT dst);
 void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
                         const uint8_t* WEBP_RESTRICT u,
                         const uint8_t* WEBP_RESTRICT v,
                         uint8_t* WEBP_RESTRICT dst);
 void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
                             const uint8_t* WEBP_RESTRICT u,
                             const uint8_t* WEBP_RESTRICT v,
                             uint8_t* WEBP_RESTRICT dst);
 void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
                           const uint8_t* WEBP_RESTRICT u,
                           const uint8_t* WEBP_RESTRICT v,
                           uint8_t* WEBP_RESTRICT dst);
 #endif    // WEBP_USE_SSE2
@ -172,10 +186,14 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 #if defined(WEBP_USE_SSE41)
 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
-                         uint8_t* dst);
+                         const uint8_t* WEBP_RESTRICT u,
-void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         const uint8_t* WEBP_RESTRICT v,
-                         uint8_t* dst);
+                         uint8_t* WEBP_RESTRICT dst);
 void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
                         const uint8_t* WEBP_RESTRICT u,
                         const uint8_t* WEBP_RESTRICT v,
                         uint8_t* WEBP_RESTRICT dst);
 #endif    // WEBP_USE_SSE41
--- a/src/dsp/yuv_mips32.c
+++ b/src/dsp/yuv_mips32.c
@ -22,9 +22,10 @@
 // simple point-sampling
 #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A)                                 \
-static void FUNC_NAME(const uint8_t* y,                                        \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
-                      const uint8_t* u, const uint8_t* v,                      \
+                      const uint8_t* WEBP_RESTRICT u,                          \
-                      uint8_t* dst, int len) {                                 \
+                      const uint8_t* WEBP_RESTRICT v,                          \
                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
  int i, r, g, b;                                                              \
  int temp0, temp1, temp2, temp3, temp4;                                       \
  for (i = 0; i < (len >> 1); i++) {                                           \
--- a/src/dsp/yuv_mips_dsp_r2.c
+++ b/src/dsp/yuv_mips_dsp_r2.c
@ -69,9 +69,10 @@
  : "memory", "hi", "lo"                                                       \
 #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A)                                 \
-static void FUNC_NAME(const uint8_t* y,                                        \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
-                      const uint8_t* u, const uint8_t* v,                      \
+                      const uint8_t* WEBP_RESTRICT u,                          \
-                      uint8_t* dst, int len) {                                 \
+                      const uint8_t* WEBP_RESTRICT v,                          \
                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
  int i;                                                                       \
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;             \
  const int t_con_1 = 26149;                                                   \
--- a/src/dsp/yuv_neon.c
+++ b/src/dsp/yuv_neon.c
@ -46,7 +46,8 @@ static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
  return vqmovn_u16(Y2);
 }
-static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_NEON(const uint8_t* WEBP_RESTRICT rgb,
                                 uint8_t* WEBP_RESTRICT y, int width) {
  int i;
  for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
    const uint8x8x3_t RGB = vld3_u8(rgb);
@ -58,7 +59,8 @@ static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
  }
 }
-static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_NEON(const uint8_t* WEBP_RESTRICT bgr,
                                 uint8_t* WEBP_RESTRICT y, int width) {
  int i;
  for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
    const uint8x8x3_t BGR = vld3_u8(bgr);
@ -70,7 +72,8 @@ static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
  }
 }
-static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_NEON(const uint32_t* WEBP_RESTRICT argb,
                                uint8_t* WEBP_RESTRICT y, int width) {
  int i;
  for (i = 0; i + 8 <= width; i += 8) {
    const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
@ -114,8 +117,9 @@ static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
  MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST);       \
 } while (0)
-static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
+static void ConvertRGBA32ToUV_NEON(const uint16_t* WEBP_RESTRICT rgb,
-                                   uint8_t* u, uint8_t* v, int width) {
+                                   uint8_t* WEBP_RESTRICT u,
                                   uint8_t* WEBP_RESTRICT v, int width) {
  int i;
  for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
    const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
@ -131,7 +135,9 @@ static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
  }
 }
-static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
+static void ConvertARGBToUV_NEON(const uint32_t* WEBP_RESTRICT argb,
                                 uint8_t* WEBP_RESTRICT u,
                                 uint8_t* WEBP_RESTRICT v,
                                 int src_width, int do_store) {
  int i;
  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
--- a/src/dsp/yuv_sse2.c
+++ b/src/dsp/yuv_sse2.c
@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
 }
 // Convert 32 samples of YUV444 to R/G/B
-static void YUV444ToRGB_SSE2(const uint8_t* const y,
+static void YUV444ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
-                             const uint8_t* const u,
+                             const uint8_t* WEBP_RESTRICT const u,
-                             const uint8_t* const v,
+                             const uint8_t* WEBP_RESTRICT const v,
                             __m128i* const R, __m128i* const G,
                             __m128i* const B) {
  const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE2(const uint8_t* const y,
 }
 // Convert 32 samples of YUV420 to R/G/B
-static void YUV420ToRGB_SSE2(const uint8_t* const y,
+static void YUV420ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
-                             const uint8_t* const u,
+                             const uint8_t* WEBP_RESTRICT const u,
-                             const uint8_t* const v,
+                             const uint8_t* WEBP_RESTRICT const v,
                             __m128i* const R, __m128i* const G,
                             __m128i* const B) {
  const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
@ -108,7 +108,7 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
                                           const __m128i* const G,
                                           const __m128i* const B,
                                           const __m128i* const A,
-                                           uint8_t* const dst) {
+                                           uint8_t* WEBP_RESTRICT const dst) {
  const __m128i rb = _mm_packus_epi16(*R, *B);
  const __m128i ga = _mm_packus_epi16(*G, *A);
  const __m128i rg = _mm_unpacklo_epi8(rb, ga);
@ -120,11 +120,9 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
 }
 // Pack R/G/B/A results into 16b output.
-static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
+static WEBP_INLINE void PackAndStore4444_SSE2(
-                                              const __m128i* const G,
+     const __m128i* const R, const __m128i* const G, const __m128i* const B,
-                                              const __m128i* const B,
+     const __m128i* const A, uint8_t* WEBP_RESTRICT const dst) {
                                              const __m128i* const A,
                                              uint8_t* const dst) {
 #if (WEBP_SWAP_16BIT_CSP == 0)
  const __m128i rg0 = _mm_packus_epi16(*R, *G);
  const __m128i ba0 = _mm_packus_epi16(*B, *A);
@ -145,7 +143,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
 static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
                                             const __m128i* const G,
                                             const __m128i* const B,
-                                             uint8_t* const dst) {
+                                             uint8_t* WEBP_RESTRICT const dst) {
  const __m128i r0 = _mm_packus_epi16(*R, *R);
  const __m128i g0 = _mm_packus_epi16(*G, *G);
  const __m128i b0 = _mm_packus_epi16(*B, *B);
@ -170,7 +168,7 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
 static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
                                         __m128i* const in2, __m128i* const in3,
                                         __m128i* const in4, __m128i* const in5,
-                                         uint8_t* const rgb) {
+                                         uint8_t* WEBP_RESTRICT const rgb) {
  // The input is 6 registers of sixteen 8b but for the sake of explanation,
  // let's take 6 registers of four 8b values.
  // To pack, we will keep taking one every two 8b integer and move it
@ -193,8 +191,10 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
  _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
 }
-void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
-                         uint8_t* dst) {
+                         const uint8_t* WEBP_RESTRICT u,
                         const uint8_t* WEBP_RESTRICT v,
                         uint8_t* WEBP_RESTRICT dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
@ -204,8 +204,10 @@ void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }
-void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
-                         uint8_t* dst) {
+                         const uint8_t* WEBP_RESTRICT u,
                         const uint8_t* WEBP_RESTRICT v,
                         uint8_t* WEBP_RESTRICT dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
@ -215,8 +217,10 @@ void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }
-void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
-                         uint8_t* dst) {
+                         const uint8_t* WEBP_RESTRICT u,
                         const uint8_t* WEBP_RESTRICT v,
                         uint8_t* WEBP_RESTRICT dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
@ -226,8 +230,10 @@ void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }
-void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
+void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
-                             const uint8_t* v, uint8_t* dst) {
+                             const uint8_t* WEBP_RESTRICT u,
                             const uint8_t* WEBP_RESTRICT v,
                             uint8_t* WEBP_RESTRICT dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 16) {
@ -237,8 +243,10 @@ void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
  }
 }
-void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
-                           uint8_t* dst) {
+                           const uint8_t* WEBP_RESTRICT u,
                           const uint8_t* WEBP_RESTRICT v,
                           uint8_t* WEBP_RESTRICT dst) {
  int n;
  for (n = 0; n < 32; n += 8, dst += 16) {
    __m128i R, G, B;
@ -247,8 +255,10 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }
-void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
-                        uint8_t* dst) {
+                        const uint8_t* WEBP_RESTRICT u,
                        const uint8_t* WEBP_RESTRICT v,
                        uint8_t* WEBP_RESTRICT dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
@ -269,8 +279,10 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }
-void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
-                        uint8_t* dst) {
+                        const uint8_t* WEBP_RESTRICT u,
                        const uint8_t* WEBP_RESTRICT v,
                        uint8_t* WEBP_RESTRICT dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
@ -294,9 +306,10 @@ void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 //-----------------------------------------------------------------------------
 // Arbitrary-length row conversion functions
-static void YuvToRgbaRow_SSE2(const uint8_t* y,
+static void YuvToRgbaRow_SSE2(const uint8_t* WEBP_RESTRICT y,
-                              const uint8_t* u, const uint8_t* v,
+                              const uint8_t* WEBP_RESTRICT u,
-                              uint8_t* dst, int len) {
+                              const uint8_t* WEBP_RESTRICT v,
                              uint8_t* WEBP_RESTRICT dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@ -316,9 +329,10 @@ static void YuvToRgbaRow_SSE2(const uint8_t* y,
  }
 }
-static void YuvToBgraRow_SSE2(const uint8_t* y,
+static void YuvToBgraRow_SSE2(const uint8_t* WEBP_RESTRICT y,
-                              const uint8_t* u, const uint8_t* v,
+                              const uint8_t* WEBP_RESTRICT u,
-                              uint8_t* dst, int len) {
+                              const uint8_t* WEBP_RESTRICT v,
                              uint8_t* WEBP_RESTRICT dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@ -338,9 +352,10 @@ static void YuvToBgraRow_SSE2(const uint8_t* y,
  }
 }
-static void YuvToArgbRow_SSE2(const uint8_t* y,
+static void YuvToArgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
-                              const uint8_t* u, const uint8_t* v,
+                              const uint8_t* WEBP_RESTRICT u,
-                              uint8_t* dst, int len) {
+                              const uint8_t* WEBP_RESTRICT v,
                              uint8_t* WEBP_RESTRICT dst, int len) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@ -360,9 +375,10 @@ static void YuvToArgbRow_SSE2(const uint8_t* y,
  }
 }
-static void YuvToRgbRow_SSE2(const uint8_t* y,
+static void YuvToRgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
-                             const uint8_t* u, const uint8_t* v,
+                             const uint8_t* WEBP_RESTRICT u,
-                             uint8_t* dst, int len) {
+                             const uint8_t* WEBP_RESTRICT v,
                             uint8_t* WEBP_RESTRICT dst, int len) {
  int n;
  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -397,9 +413,10 @@ static void YuvToRgbRow_SSE2(const uint8_t* y,
  }
 }
-static void YuvToBgrRow_SSE2(const uint8_t* y,
+static void YuvToBgrRow_SSE2(const uint8_t* WEBP_RESTRICT y,
-                             const uint8_t* u, const uint8_t* v,
+                             const uint8_t* WEBP_RESTRICT u,
-                             uint8_t* dst, int len) {
+                             const uint8_t* WEBP_RESTRICT v,
                             uint8_t* WEBP_RESTRICT dst, int len) {
  int n;
  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -471,7 +488,7 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // Similar to PlanarTo24bHelper(), but in reverse order.
 static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
-    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
+    const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
  __m128i tmp[6];
  tmp[0] = _mm_loadu_si128((const __m128i*)(rgb +  0));
  tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
@ -488,8 +505,8 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
 }
 // Convert 8 packed ARGB to r[], g[], b[]
-static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
+static WEBP_INLINE void RGB32PackedToPlanar_SSE2(
-                                                 __m128i* const rgb /*in[6]*/) {
+    const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
  const __m128i zero = _mm_setzero_si128();
  __m128i a0 = LOAD_16(argb + 0);
  __m128i a1 = LOAD_16(argb + 4);
@ -562,7 +579,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
 #undef MK_CST_16
 #undef TRANSFORM
-static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb,
                                 uint8_t* WEBP_RESTRICT y, int width) {
  const int max_width = width & ~31;
  int i;
  for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
@ -596,7 +614,8 @@ static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
  }
 }
-static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr,
                                 uint8_t* WEBP_RESTRICT y, int width) {
  const int max_width = width & ~31;
  int i;
  for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
@ -630,7 +649,8 @@ static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
  }
 }
-static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb,
                                uint8_t* WEBP_RESTRICT y, int width) {
  const int max_width = width & ~15;
  int i;
  for (i = 0; i < max_width; i += 16) {
@ -658,8 +678,9 @@ static void HorizontalAddPack_SSE2(const __m128i* const A,
  *out = _mm_packs_epi32(C, D);
 }
-static void ConvertARGBToUV_SSE2(const uint32_t* argb,
+static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb,
-                                 uint8_t* u, uint8_t* v,
+                                 uint8_t* WEBP_RESTRICT u,
                                 uint8_t* WEBP_RESTRICT v,
                                 int src_width, int do_store) {
  const int max_width = src_width & ~31;
  int i;
@ -695,7 +716,7 @@ static void ConvertARGBToUV_SSE2(const uint32_t* argb,
 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
-    const uint16_t* const rgbx,
+    const uint16_t* WEBP_RESTRICT const rgbx,
    __m128i* const r, __m128i* const g, __m128i* const b) {
  const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
  const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
@ -715,8 +736,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
  *b = _mm_unpacklo_epi64(B1, B3);
 }
-static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
+static void ConvertRGBA32ToUV_SSE2(const uint16_t* WEBP_RESTRICT rgb,
-                                   uint8_t* u, uint8_t* v, int width) {
+                                   uint8_t* WEBP_RESTRICT u,
                                   uint8_t* WEBP_RESTRICT v, int width) {
  const int max_width = width & ~15;
  const uint16_t* const last_rgb = rgb + 4 * max_width;
  while (rgb < last_rgb) {
--- a/src/dsp/yuv_sse41.c
+++ b/src/dsp/yuv_sse41.c
@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
 }
 // Convert 32 samples of YUV444 to R/G/B
-static void YUV444ToRGB_SSE41(const uint8_t* const y,
+static void YUV444ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
-                              const uint8_t* const u,
+                              const uint8_t* WEBP_RESTRICT const u,
-                              const uint8_t* const v,
+                              const uint8_t* WEBP_RESTRICT const v,
                              __m128i* const R, __m128i* const G,
                              __m128i* const B) {
  const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE41(const uint8_t* const y,
 }
 // Convert 32 samples of YUV420 to R/G/B
-static void YUV420ToRGB_SSE41(const uint8_t* const y,
+static void YUV420ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
-                              const uint8_t* const u,
+                              const uint8_t* WEBP_RESTRICT const u,
-                              const uint8_t* const v,
+                              const uint8_t* WEBP_RESTRICT const v,
                              __m128i* const R, __m128i* const G,
                              __m128i* const B) {
  const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
@ -109,7 +109,7 @@ static void YUV420ToRGB_SSE41(const uint8_t* const y,
 static WEBP_INLINE void PlanarTo24b_SSE41(
    __m128i* const in0, __m128i* const in1, __m128i* const in2,
    __m128i* const in3, __m128i* const in4, __m128i* const in5,
-    uint8_t* const rgb) {
+    uint8_t* WEBP_RESTRICT const rgb) {
  // The input is 6 registers of sixteen 8b but for the sake of explanation,
  // let's take 6 registers of four 8b values.
  // To pack, we will keep taking one every two 8b integer and move it
@ -132,8 +132,10 @@ static WEBP_INLINE void PlanarTo24b_SSE41(
  _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
 }
-void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
-                         uint8_t* dst) {
+                         const uint8_t* WEBP_RESTRICT u,
                         const uint8_t* WEBP_RESTRICT v,
                         uint8_t* WEBP_RESTRICT dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
@ -154,8 +156,10 @@ void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }
-void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
-                         uint8_t* dst) {
+                         const uint8_t* WEBP_RESTRICT u,
                         const uint8_t* WEBP_RESTRICT v,
                         uint8_t* WEBP_RESTRICT dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
@ -179,9 +183,10 @@ void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 //-----------------------------------------------------------------------------
 // Arbitrary-length row conversion functions
-static void YuvToRgbRow_SSE41(const uint8_t* y,
+static void YuvToRgbRow_SSE41(const uint8_t* WEBP_RESTRICT y,
-                              const uint8_t* u, const uint8_t* v,
+                              const uint8_t* WEBP_RESTRICT u,
-                              uint8_t* dst, int len) {
+                              const uint8_t* WEBP_RESTRICT v,
                              uint8_t* WEBP_RESTRICT dst, int len) {
  int n;
  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -216,9 +221,10 @@ static void YuvToRgbRow_SSE41(const uint8_t* y,
  }
 }
-static void YuvToBgrRow_SSE41(const uint8_t* y,
+static void YuvToBgrRow_SSE41(const uint8_t* WEBP_RESTRICT y,
-                              const uint8_t* u, const uint8_t* v,
+                              const uint8_t* WEBP_RESTRICT u,
-                              uint8_t* dst, int len) {
+                              const uint8_t* WEBP_RESTRICT v,
                              uint8_t* WEBP_RESTRICT dst, int len) {
  int n;
  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -290,7 +296,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // Similar to PlanarTo24bHelper(), but in reverse order.
 static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
-    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
+    const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
  const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb +  0));
  const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
  const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
@ -334,7 +340,7 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
 // Convert 8 packed ARGB to r[], g[], b[]
 static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
-    const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
+    const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
  const __m128i zero = _mm_setzero_si128();
  __m128i a0 = LOAD_16(argb + 0);
  __m128i a1 = LOAD_16(argb + 4);
@ -407,7 +413,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
 #undef MK_CST_16
 #undef TRANSFORM
-static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb,
                                  uint8_t* WEBP_RESTRICT y, int width) {
  const int max_width = width & ~31;
  int i;
  for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
@ -441,7 +448,8 @@ static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
  }
 }
-static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr,
                                  uint8_t* WEBP_RESTRICT y, int width) {
  const int max_width = width & ~31;
  int i;
  for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
@ -475,7 +483,8 @@ static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
  }
 }
-static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb,
                                 uint8_t* WEBP_RESTRICT y, int width) {
  const int max_width = width & ~15;
  int i;
  for (i = 0; i < max_width; i += 16) {
@ -503,8 +512,9 @@ static void HorizontalAddPack_SSE41(const __m128i* const A,
  *out = _mm_packs_epi32(C, D);
 }
-static void ConvertARGBToUV_SSE41(const uint32_t* argb,
+static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb,
-                                  uint8_t* u, uint8_t* v,
+                                  uint8_t* WEBP_RESTRICT u,
                                  uint8_t* WEBP_RESTRICT v,
                                  int src_width, int do_store) {
  const int max_width = src_width & ~31;
  int i;
@ -540,7 +550,7 @@ static void ConvertARGBToUV_SSE41(const uint32_t* argb,
 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
-    const uint16_t* const rgbx,
+    const uint16_t* WEBP_RESTRICT const rgbx,
    __m128i* const r, __m128i* const g, __m128i* const b) {
  const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
  const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
@ -570,8 +580,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
  *b = _mm_unpackhi_epi64(B1, B3);
 }
-static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
+static void ConvertRGBA32ToUV_SSE41(const uint16_t* WEBP_RESTRICT rgb,
-                                    uint8_t* u, uint8_t* v, int width) {
+                                    uint8_t* WEBP_RESTRICT u,
                                    uint8_t* WEBP_RESTRICT v, int width) {
  const int max_width = width & ~15;
  const uint16_t* const last_rgb = rgb + 4 * max_width;
  while (rgb < last_rgb) {
Author	SHA1	Message	Date
James Zern	fdb229ea3a	Merge changes I07a7e36a,Ib29980f7,I2316122d,I2356e314,I32b53dd3, ... into main * changes: dsp/yuv*: use WEBP_RESTRICT qualifier dsp/upsampling*: use WEBP_RESTRICT qualifier dsp/rescaler*: use WEBP_RESTRICT qualifier dsp/lossless*: use WEBP_RESTRICT qualifier dsp/filters*: use WEBP_RESTRICT qualifier dsp/enc*: use WEBP_RESTRICT qualifier dsp/dec*: use WEBP_RESTRICT qualifier dsp/cost*: use WEBP_RESTRICT qualifier	2024-10-03 17:01:02 +00:00
James Zern	2dd5eb9862	dsp/yuv*: use WEBP_RESTRICT qualifier Better vectorization in the C code, fewer instructions / comparisons in NEON, and fewer reloads in SSE2/SSE4 w/ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I07a7e36a2dce8632c71c0fbbeef94dc51453eaf7	2024-10-02 14:55:15 -07:00
James Zern	23bbafbeb8	dsp/upsampling*: use WEBP_RESTRICT qualifier Better vectorization in the C code, fewer instructions in NEON, and some code reordering / better register usage in SSE2/SSE4 w/ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: Ib29980f778ad3dbb952178ad8dee39b8673c4ff8	2024-10-02 14:55:15 -07:00
James Zern	35915b389e	dsp/rescaler*: use WEBP_RESTRICT qualifier Some improvement in the C code. No changes in NEON or SSE2 w/ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I2316122db893f48f0afda90a147c83cac7f07526	2024-10-02 14:55:14 -07:00
James Zern	a32b436bd5	dsp/lossless*: use WEBP_RESTRICT qualifier lossless_enc: better vectorization, most benefits seen in AddVector/Eq w/ndk r27/gcc-13/clang-16 lossless: minor reordering and some improvement to PredictorAdd5_SSE2 w/gcc-13 This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I2356e314f391ee2f2c71f00bc6ee10097d3881e7	2024-10-02 14:55:14 -07:00
James Zern	04d4b4f387	dsp/filters*: use WEBP_RESTRICT qualifier Better stack/register usage in SSE2/NEON code and improved vectorization of the C code with ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I32b53dd38bfc7e2231d875409e7dfda7c513cfb6	2024-10-02 14:55:14 -07:00
James Zern	b1cb37e659	dsp/enc*: use WEBP_RESTRICT qualifier This allows for better vectorization of the C code, inlining of TrueMotion_SSE2, better load usage in aarch64 and other minor reordering with ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I07e9944d5c0aa5a079b22883ac5a2d649695e4a0	2024-10-02 14:55:14 -07:00
James Zern	201894ef24	dsp/dec*: use WEBP_RESTRICT qualifier A minor improvement for arm targets with ndk r27/gcc-13 in H/VFilter8 (a couple fewer moves w/aarch64) and much better vectorization of DitherCombine8x8_C in most targets. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I03e73e6d6404261bb8408a9ae76a4b6ef142f8f0	2024-10-02 14:55:14 -07:00
James Zern	02eac8a741	dsp/cost*: use WEBP_RESTRICT qualifier on SetResidualCoeffs_*. This results in some minor code reordering when targeting armv7 with ndk r27 and other recent versions of clang. No changes in the x86 compilations with clang-16 / gcc-13. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I7c3554ece848fafbc5ac9c4944f1dc85129f6fd8	2024-10-02 14:55:14 -07:00