From b1cb37e65921649514b7a4fad46eb8d28dcbfb10 Mon Sep 17 00:00:00 2001 From: James Zern Date: Sat, 3 Jul 2021 17:59:44 -0700 Subject: [PATCH] dsp/enc*: use WEBP_RESTRICT qualifier This allows for better vectorization of the C code, inlining of TrueMotion_SSE2, better load usage in aarch64 and other minor reordering with ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I07e9944d5c0aa5a079b22883ac5a2d649695e4a0 --- src/dsp/dec.c | 2 +- src/dsp/dsp.h | 55 ++++++----- src/dsp/enc.c | 126 +++++++++++++++--------- src/dsp/enc_mips32.c | 40 +++++--- src/dsp/enc_mips_dsp_r2.c | 101 +++++++++++-------- src/dsp/enc_msa.c | 125 ++++++++++++++--------- src/dsp/enc_neon.c | 82 +++++++++------- src/dsp/enc_sse2.c | 202 +++++++++++++++++++++++--------------- src/dsp/enc_sse41.c | 21 ++-- 9 files changed, 457 insertions(+), 297 deletions(-) diff --git a/src/dsp/dec.c b/src/dsp/dec.c index 51067f45..dc1a7625 100644 --- a/src/dsp/dec.c +++ b/src/dsp/dec.c @@ -168,7 +168,7 @@ static void TransformWHT_C(const int16_t* WEBP_RESTRICT in, } #endif // !WEBP_NEON_OMIT_C_CODE -VP8IWHT VP8TransformWHT; +VP8WHT VP8TransformWHT; //------------------------------------------------------------------------------ // Intra predictions diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index bd126dea..ec0302f2 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -60,57 +60,66 @@ extern "C" { // Transforms // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms // will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4). -typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two); -typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out); -// TODO(jzern): merge these two typedefs after the encoder functions are -// updated to use WEBP_RESTRICT. -typedef void (*VP8FWHT)(const int16_t* in, int16_t* out); -typedef void (*VP8IWHT)(const int16_t* WEBP_RESTRICT in, +typedef void (*VP8Idct)(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two); +typedef void (*VP8Fdct)(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, int16_t* WEBP_RESTRICT out); +typedef void (*VP8WHT)(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out); extern VP8Idct VP8ITransform; extern VP8Fdct VP8FTransform; extern VP8Fdct VP8FTransform2; // performs two transforms at a time -extern VP8FWHT VP8FTransformWHT; +extern VP8WHT VP8FTransformWHT; // Predictions // *dst is the destination block. *top and *left can be NULL. -typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left, - const uint8_t* top); -typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top); +typedef void (*VP8IntraPreds)(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top); +typedef void (*VP8Intra4Preds)(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top); extern VP8Intra4Preds VP8EncPredLuma4; extern VP8IntraPreds VP8EncPredLuma16; extern VP8IntraPreds VP8EncPredChroma8; -typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref); +typedef int (*VP8Metric)(const uint8_t* WEBP_RESTRICT pix, + const uint8_t* WEBP_RESTRICT ref); extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4; -typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref, - const uint16_t* const weights); +typedef int (*VP8WMetric)(const uint8_t* WEBP_RESTRICT pix, + const uint8_t* WEBP_RESTRICT ref, + const uint16_t* WEBP_RESTRICT const weights); // The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major // 4 by 4 symmetric matrix. extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; // Compute the average (DC) of four 4x4 blocks. // Each sub-4x4 block #i sum is stored in dc[i]. -typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]); +typedef void (*VP8MeanMetric)(const uint8_t* WEBP_RESTRICT ref, + uint32_t dc[4]); extern VP8MeanMetric VP8Mean16x4; -typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); +typedef void (*VP8BlockCopy)(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst); extern VP8BlockCopy VP8Copy4x4; extern VP8BlockCopy VP8Copy16x8; // Quantization struct VP8Matrix; // forward declaration -typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16], - const struct VP8Matrix* const mtx); +typedef int (*VP8QuantizeBlock)( + int16_t in[16], int16_t out[16], + const struct VP8Matrix* WEBP_RESTRICT const mtx); // Same as VP8QuantizeBlock, but quantizes two consecutive blocks. -typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32], - const struct VP8Matrix* const mtx); +typedef int (*VP8Quantize2Blocks)( + int16_t in[32], int16_t out[32], + const struct VP8Matrix* WEBP_RESTRICT const mtx); extern VP8QuantizeBlock VP8EncQuantizeBlock; extern VP8Quantize2Blocks VP8EncQuantize2Blocks; // specific to 2nd transform: -typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16], - const struct VP8Matrix* const mtx); +typedef int (*VP8QuantizeBlockWHT)( + int16_t in[16], int16_t out[16], + const struct VP8Matrix* WEBP_RESTRICT const mtx); extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT; extern const int VP8DspScan[16 + 4 + 4]; @@ -208,7 +217,7 @@ extern VP8DecIdct VP8TransformAC3; extern VP8DecIdct VP8TransformUV; extern VP8DecIdct VP8TransformDC; extern VP8DecIdct VP8TransformDCUV; -extern VP8IWHT VP8TransformWHT; +extern VP8WHT VP8TransformWHT; #define WEBP_TRANSFORM_AC3_C1 20091 #define WEBP_TRANSFORM_AC3_C2 35468 diff --git a/src/dsp/enc.c b/src/dsp/enc.c index 95c623d9..b177031d 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -59,9 +59,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1], } #if !WEBP_NEON_OMIT_C_CODE -static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred, +static void CollectHistogram_C(const uint8_t* WEBP_RESTRICT ref, + const uint8_t* WEBP_RESTRICT pred, int start_block, int end_block, - VP8Histogram* const histo) { + VP8Histogram* WEBP_RESTRICT const histo) { int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; for (j = start_block; j < end_block; ++j) { @@ -109,8 +110,9 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) { #define STORE(x, y, v) \ dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) -static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int C[4 * 4], *tmp; int i; tmp = C; @@ -146,7 +148,9 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, } } -static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst, +static void ITransform_C(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { ITransformOne(ref, in, dst); if (do_two) { @@ -154,7 +158,9 @@ static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst, } } -static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_C(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { int i; int tmp[16]; for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { @@ -184,14 +190,16 @@ static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) { } #endif // !WEBP_NEON_OMIT_C_CODE -static void FTransform2_C(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform2_C(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { VP8FTransform(src, ref, out); VP8FTransform(src + 4, ref + 4, out + 16); } #if !WEBP_NEON_OMIT_C_CODE -static void FTransformWHT_C(const int16_t* in, int16_t* out) { +static void FTransformWHT_C(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { // input is 12b signed int32_t tmp[16]; int i; @@ -234,8 +242,9 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) { } } -static WEBP_INLINE void VerticalPred(uint8_t* dst, - const uint8_t* top, int size) { +static WEBP_INLINE void VerticalPred(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top, + int size) { int j; if (top != NULL) { for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size); @@ -244,8 +253,9 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst, } } -static WEBP_INLINE void HorizontalPred(uint8_t* dst, - const uint8_t* left, int size) { +static WEBP_INLINE void HorizontalPred(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + int size) { if (left != NULL) { int j; for (j = 0; j < size; ++j) { @@ -256,8 +266,9 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst, } } -static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, - const uint8_t* top, int size) { +static WEBP_INLINE void TrueMotion(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top, int size) { int y; if (left != NULL) { if (top != NULL) { @@ -286,8 +297,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left, - const uint8_t* top, +static WEBP_INLINE void DCMode(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top, int size, int round, int shift) { int DC = 0; int j; @@ -312,8 +324,9 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_C(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { // U block DCMode(C8DC8 + dst, left, top, 8, 8, 4); VerticalPred(C8VE8 + dst, top, 8); @@ -333,8 +346,9 @@ static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left, // luma 16x16 prediction (paragraph 12.3) #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 -static void Intra16Preds_C(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { DCMode(I16DC16 + dst, left, top, 16, 16, 5); VerticalPred(I16VE16 + dst, top, 16); HorizontalPred(I16HE16 + dst, left, 16); @@ -351,7 +365,8 @@ static void Intra16Preds_C(uint8_t* dst, #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2)) #define AVG2(a, b) (((a) + (b) + 1) >> 1) -static void VE4(uint8_t* dst, const uint8_t* top) { // vertical +// vertical +static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const uint8_t vals[4] = { AVG3(top[-1], top[0], top[1]), AVG3(top[ 0], top[1], top[2]), @@ -364,7 +379,8 @@ static void VE4(uint8_t* dst, const uint8_t* top) { // vertical } } -static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal +// horizontal +static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -376,14 +392,14 @@ static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); } -static void DC4(uint8_t* dst, const uint8_t* top) { +static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { uint32_t dc = 4; int i; for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; Fill(dst, dc >> 3, 4); } -static void RD4(uint8_t* dst, const uint8_t* top) { +static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -402,7 +418,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) { DST(3, 0) = AVG3(D, C, B); } -static void LD4(uint8_t* dst, const uint8_t* top) { +static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int A = top[0]; const int B = top[1]; const int C = top[2]; @@ -420,7 +436,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) { DST(3, 3) = AVG3(G, H, H); } -static void VR4(uint8_t* dst, const uint8_t* top) { +static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -442,7 +458,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) { DST(3, 1) = AVG3(B, C, D); } -static void VL4(uint8_t* dst, const uint8_t* top) { +static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int A = top[0]; const int B = top[1]; const int C = top[2]; @@ -464,7 +480,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) { DST(3, 3) = AVG3(F, G, H); } -static void HU4(uint8_t* dst, const uint8_t* top) { +static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int I = top[-2]; const int J = top[-3]; const int K = top[-4]; @@ -479,7 +495,7 @@ static void HU4(uint8_t* dst, const uint8_t* top) { DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } -static void HD4(uint8_t* dst, const uint8_t* top) { +static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -502,7 +518,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) { DST(1, 3) = AVG3(L, K, J); } -static void TM4(uint8_t* dst, const uint8_t* top) { +static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int x, y; const uint8_t* const clip = clip1 + 255 - top[-1]; for (y = 0; y < 4; ++y) { @@ -520,7 +536,8 @@ static void TM4(uint8_t* dst, const uint8_t* top) { // Left samples are top[-5 .. -2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -539,7 +556,8 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) { // Metric #if !WEBP_NEON_OMIT_C_CODE -static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b, +static WEBP_INLINE int GetSSE(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b, int w, int h) { int count = 0; int y, x; @@ -554,21 +572,25 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b, return count; } -static int SSE16x16_C(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_C(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return GetSSE(a, b, 16, 16); } -static int SSE16x8_C(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_C(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return GetSSE(a, b, 16, 8); } -static int SSE8x8_C(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_C(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return GetSSE(a, b, 8, 8); } -static int SSE4x4_C(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_C(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return GetSSE(a, b, 4, 4); } #endif // !WEBP_NEON_OMIT_C_CODE -static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) { +static void Mean16x4_C(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) { int k, x, y; for (k = 0; k < 4; ++k) { uint32_t avg = 0; @@ -592,7 +614,8 @@ static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) { // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int TTransform(const uint8_t* in, const uint16_t* w) { +static int TTransform(const uint8_t* WEBP_RESTRICT in, + const uint16_t* WEBP_RESTRICT w) { int sum = 0; int tmp[16]; int i; @@ -626,15 +649,17 @@ static int TTransform(const uint8_t* in, const uint16_t* w) { return sum; } -static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_C(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { const int sum1 = TTransform(a, w); const int sum2 = TTransform(b, w); return abs(sum2 - sum1) >> 5; } -static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_C(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -657,7 +682,7 @@ static const uint8_t kZigzag[16] = { // Simple quantization static int QuantizeBlock_C(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int last = -1; int n; for (n = 0; n < 16; ++n) { @@ -683,7 +708,7 @@ static int QuantizeBlock_C(int16_t in[16], int16_t out[16], } static int Quantize2Blocks_C(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -694,7 +719,8 @@ static int Quantize2Blocks_C(int16_t in[32], int16_t out[32], //------------------------------------------------------------------------------ // Block copy -static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) { +static WEBP_INLINE void Copy(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst, int w, int h) { int y; for (y = 0; y < h; ++y) { memcpy(dst, src, w); @@ -703,11 +729,13 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) { } } -static void Copy4x4_C(const uint8_t* src, uint8_t* dst) { +static void Copy4x4_C(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst) { Copy(src, dst, 4, 4); } -static void Copy16x8_C(const uint8_t* src, uint8_t* dst) { +static void Copy16x8_C(const uint8_t* WEBP_RESTRICT src, + uint8_t* WEBP_RESTRICT dst) { Copy(src, dst, 16, 8); } @@ -720,7 +748,7 @@ VP8CHisto VP8CollectHistogram; VP8Idct VP8ITransform; VP8Fdct VP8FTransform; VP8Fdct VP8FTransform2; -VP8FWHT VP8FTransformWHT; +VP8WHT VP8FTransformWHT; VP8Intra4Preds VP8EncPredLuma4; VP8IntraPreds VP8EncPredLuma16; VP8IntraPreds VP8EncPredChroma8; diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c index 50518a5f..6cd8c93d 100644 --- a/src/dsp/enc_mips32.c +++ b/src/dsp/enc_mips32.c @@ -109,9 +109,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2; "sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" // Does one or two inverse transforms. -static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref, - const int16_t* in, - uint8_t* dst) { +static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6; int temp7, temp8, temp9, temp10, temp11, temp12, temp13; int temp14, temp15, temp16, temp17, temp18, temp19, temp20; @@ -141,8 +141,9 @@ static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref, ); } -static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in, - uint8_t* dst, int do_two) { +static void ITransform_MIPS32(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { ITransformOne_MIPS32(ref, in, dst); if (do_two) { ITransformOne_MIPS32(ref + 4, in + 16, dst + 4); @@ -236,7 +237,7 @@ static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16], } static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -358,8 +359,9 @@ static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32], "msub %[temp6], %[temp0] \n\t" \ "msub %[temp7], %[temp1] \n\t" -static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_MIPS32(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int tmp[32]; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; @@ -393,8 +395,9 @@ static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b, #undef VERTICAL_PASS #undef HORIZONTAL_PASS -static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MIPS32(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -475,8 +478,9 @@ static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b, "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t" -static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_MIPS32(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16; int temp17, temp18, temp19, temp20; @@ -537,7 +541,8 @@ static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref, GET_SSE_INNER(C, C + 1, C + 2, C + 3) \ GET_SSE_INNER(D, D + 1, D + 2, D + 3) -static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MIPS32(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -571,7 +576,8 @@ static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MIPS32(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -597,7 +603,8 @@ static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MIPS32(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -619,7 +626,8 @@ static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MIPS32(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; diff --git a/src/dsp/enc_mips_dsp_r2.c b/src/dsp/enc_mips_dsp_r2.c index e1431f3b..4d808960 100644 --- a/src/dsp/enc_mips_dsp_r2.c +++ b/src/dsp/enc_mips_dsp_r2.c @@ -141,8 +141,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2; "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t" -static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { const int c2217 = 2217; const int c5352 = 5352; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; @@ -171,8 +172,9 @@ static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref, #undef VERTICAL_PASS #undef HORIZONTAL_PASS -static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; @@ -239,16 +241,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, ); } -static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in, - uint8_t* dst, int do_two) { +static void ITransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { ITransformOne(ref, in, dst); if (do_two) { ITransformOne(ref + 4, in + 16, dst + 4); } } -static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17; @@ -314,9 +318,9 @@ static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b, return abs(temp3 - temp17) >> 5; } -static int Disto16x16_MIPSdspR2(const uint8_t* const a, - const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -367,8 +371,8 @@ static int Disto16x16_MIPSdspR2(const uint8_t* const a, } while (0) #define VERTICAL_PRED(DST, TOP, SIZE) \ -static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST), \ - const uint8_t* (TOP)) { \ +static WEBP_INLINE void VerticalPred##SIZE( \ + uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (TOP)) { \ int j; \ if ((TOP)) { \ for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE)); \ @@ -383,8 +387,8 @@ VERTICAL_PRED(dst, top, 16) #undef VERTICAL_PRED #define HORIZONTAL_PRED(DST, LEFT, SIZE) \ -static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST), \ - const uint8_t* (LEFT)) { \ +static WEBP_INLINE void HorizontalPred##SIZE( \ + uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (LEFT)) { \ if (LEFT) { \ int j; \ for (j = 0; j < (SIZE); ++j) { \ @@ -451,8 +455,9 @@ HORIZONTAL_PRED(dst, left, 16) } while (0) #define TRUE_MOTION(DST, LEFT, TOP, SIZE) \ -static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\ - const uint8_t* (TOP)) { \ +static WEBP_INLINE void TrueMotion##SIZE(uint8_t* WEBP_RESTRICT (DST), \ + const uint8_t* WEBP_RESTRICT (LEFT), \ + const uint8_t* WEBP_RESTRICT (TOP)) { \ if ((LEFT) != NULL) { \ if ((TOP) != NULL) { \ CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE)); \ @@ -480,8 +485,9 @@ TRUE_MOTION(dst, left, top, 16) #undef CLIP_8B_TO_DST #undef CLIPPING -static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DCMode16(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { int DC, DC1; int temp0, temp1, temp2, temp3; @@ -543,8 +549,9 @@ static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left, FILL_8_OR_16(dst, DC, 16); } -static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DCMode8(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { int DC, DC1; int temp0, temp1, temp2, temp3; @@ -588,7 +595,7 @@ static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left, FILL_8_OR_16(dst, DC, 8); } -static void DC4(uint8_t* dst, const uint8_t* top) { +static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1; __asm__ volatile( "ulw %[temp0], 0(%[top]) \n\t" @@ -609,7 +616,7 @@ static void DC4(uint8_t* dst, const uint8_t* top) { ); } -static void TM4(uint8_t* dst, const uint8_t* top) { +static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5; const int c35 = 0xff00ff; __asm__ volatile ( @@ -664,7 +671,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) { ); } -static void VE4(uint8_t* dst, const uint8_t* top) { +static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6; __asm__ volatile( "ulw %[temp0], -1(%[top]) \n\t" @@ -695,7 +702,7 @@ static void VE4(uint8_t* dst, const uint8_t* top) { ); } -static void HE4(uint8_t* dst, const uint8_t* top) { +static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6; __asm__ volatile( "ulw %[temp0], -4(%[top]) \n\t" @@ -731,7 +738,7 @@ static void HE4(uint8_t* dst, const uint8_t* top) { ); } -static void RD4(uint8_t* dst, const uint8_t* top) { +static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4, temp5; int temp6, temp7, temp8, temp9, temp10, temp11; __asm__ volatile( @@ -780,7 +787,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) { ); } -static void VR4(uint8_t* dst, const uint8_t* top) { +static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; __asm__ volatile ( @@ -830,7 +837,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) { ); } -static void LD4(uint8_t* dst, const uint8_t* top) { +static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4, temp5; int temp6, temp7, temp8, temp9, temp10, temp11; __asm__ volatile( @@ -877,7 +884,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) { ); } -static void VL4(uint8_t* dst, const uint8_t* top) { +static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; __asm__ volatile ( @@ -926,7 +933,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) { ); } -static void HD4(uint8_t* dst, const uint8_t* top) { +static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; __asm__ volatile ( @@ -974,7 +981,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) { ); } -static void HU4(uint8_t* dst, const uint8_t* top) { +static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; __asm__ volatile ( "ulw %[temp0], -5(%[top]) \n\t" @@ -1013,8 +1020,9 @@ static void HU4(uint8_t* dst, const uint8_t* top) { //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { // U block DCMode8(C8DC8 + dst, left, top); VerticalPred8(C8VE8 + dst, top); @@ -1033,8 +1041,9 @@ static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) -static void Intra16Preds_MIPSdspR2(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { DCMode16(I16DC16 + dst, left, top); VerticalPred16(I16VE16 + dst, top); HorizontalPred16(I16HE16 + dst, left); @@ -1043,7 +1052,8 @@ static void Intra16Preds_MIPSdspR2(uint8_t* dst, // Left samples are top[-5 .. -2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -1079,7 +1089,8 @@ static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) { GET_SSE_INNER(C) \ GET_SSE_INNER(D) -static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1109,7 +1120,8 @@ static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1131,7 +1143,8 @@ static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1149,7 +1162,8 @@ static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1273,7 +1287,7 @@ static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) { "3: \n\t" static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int temp0, temp1, temp2, temp3, temp4, temp5,temp6; int sign, coeff, level; int max_level = MAX_LEVEL; @@ -1314,7 +1328,7 @@ static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16], } static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -1360,7 +1374,8 @@ static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32], "usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \ "usw %[" #TEMP6 "], " #D "(%[out]) \n\t" -static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) { +static void FTransformWHT_MIPSdspR2(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; diff --git a/src/dsp/enc_msa.c b/src/dsp/enc_msa.c index 6f85add4..31ecb942 100644 --- a/src/dsp/enc_msa.c +++ b/src/dsp/enc_msa.c @@ -41,8 +41,9 @@ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ } while (0) -static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { v8i16 input0, input1; v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; v4i32 res0, res1, res2, res3; @@ -69,16 +70,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); } -static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform_MSA(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { ITransformOne(ref, in, dst); if (do_two) { ITransformOne(ref + 4, in + 16, dst + 4); } } -static void FTransform_MSA(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_MSA(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { uint64_t out0, out1, out2, out3; uint32_t in0, in1, in2, in3; v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; @@ -131,7 +134,8 @@ static void FTransform_MSA(const uint8_t* src, const uint8_t* ref, SD4(out0, out1, out2, out3, out, 8); } -static void FTransformWHT_MSA(const int16_t* in, int16_t* out) { +static void FTransformWHT_MSA(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { v8i16 in0 = { 0 }; v8i16 in1 = { 0 }; v8i16 tmp0, tmp1, tmp2, tmp3; @@ -168,7 +172,8 @@ static void FTransformWHT_MSA(const int16_t* in, int16_t* out) { ST_SH2(out0, out1, out, 8); } -static int TTransform_MSA(const uint8_t* in, const uint16_t* w) { +static int TTransform_MSA(const uint8_t* WEBP_RESTRICT in, + const uint16_t* WEBP_RESTRICT w) { int sum; uint32_t in0_m, in1_m, in2_m, in3_m; v16i8 src0 = { 0 }; @@ -200,15 +205,17 @@ static int TTransform_MSA(const uint8_t* in, const uint16_t* w) { return sum; } -static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_MSA(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { const int sum1 = TTransform_MSA(a, w); const int sum2 = TTransform_MSA(b, w); return abs(sum2 - sum1) >> 5; } -static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MSA(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -259,7 +266,9 @@ static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred, #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) #define AVG2(a, b) (((a) + (b) + 1) >> 1) -static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical +// vertical +static WEBP_INLINE void VE4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const v16u8 A1 = { 0 }; const uint64_t val_m = LD(top - 1); const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m); @@ -272,7 +281,9 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical SW4(out, out, out, out, dst, BPS); } -static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal +// horizontal +static WEBP_INLINE void HE4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -284,7 +295,8 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); } -static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { uint32_t dc = 4; int i; for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; @@ -293,7 +305,8 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) { SW4(dc, dc, dc, dc, dst, BPS); } -static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void RD4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const v16u8 A2 = { 0 }; const uint64_t val_m = LD(top - 5); const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m); @@ -313,7 +326,8 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { SW4(val3, val2, val1, val0, dst, BPS); } -static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void LD4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const v16u8 A1 = { 0 }; const uint64_t val_m = LD(top); const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m); @@ -333,7 +347,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { SW4(val0, val1, val2, val3, dst, BPS); } -static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VR4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -354,7 +369,8 @@ static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) { DST(3, 1) = AVG3(B, C, D); } -static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VL4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int A = top[0]; const int B = top[1]; const int C = top[2]; @@ -375,7 +391,8 @@ static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) { DST(3, 3) = AVG3(F, G, H); } -static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void HU4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int I = top[-2]; const int J = top[-3]; const int K = top[-4]; @@ -390,7 +407,8 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) { DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } -static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void HD4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -411,7 +429,8 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) { DST(1, 3) = AVG3(L, K, J); } -static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void TM4(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const v16i8 zero = { 0 }; const v8i16 TL = (v8i16)__msa_fill_h(top[-1]); const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]); @@ -431,7 +450,8 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { #undef AVG3 #undef AVG2 -static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_MSA(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -451,7 +471,8 @@ static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) { ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \ } while (0) -static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VerticalPred16x16(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { if (top != NULL) { const v16u8 out = LD_UB(top); STORE16x16(out, dst); @@ -461,8 +482,8 @@ static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) { } } -static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst, - const uint8_t* left) { +static WEBP_INLINE void HorizontalPred16x16(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { if (left != NULL) { int j; for (j = 0; j < 16; j += 4) { @@ -480,8 +501,9 @@ static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst, } } -static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void TrueMotion16x16(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { if (left != NULL) { if (top != NULL) { int j; @@ -519,8 +541,9 @@ static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DCMode16x16(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { int DC; v16u8 out; if (top != NULL && left != NULL) { @@ -548,8 +571,9 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left, STORE16x16(out, dst); } -static void Intra16Preds_MSA(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_MSA(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { DCMode16x16(I16DC16 + dst, left, top); VerticalPred16x16(I16VE16 + dst, top); HorizontalPred16x16(I16HE16 + dst, left); @@ -574,7 +598,8 @@ static void Intra16Preds_MSA(uint8_t* dst, SD4(out, out, out, out, dst + 4 * BPS, BPS); \ } while (0) -static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VerticalPred8x8(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { if (top != NULL) { const uint64_t out = LD(top); STORE8x8(out, dst); @@ -584,7 +609,8 @@ static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) { } } -static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void HorizontalPred8x8(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { if (left != NULL) { int j; for (j = 0; j < 8; j += 4) { @@ -606,8 +632,9 @@ static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) { } } -static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void TrueMotion8x8(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { if (left != NULL) { if (top != NULL) { int j; @@ -646,8 +673,9 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DCMode8x8(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { uint64_t out; v16u8 src = { 0 }; if (top != NULL && left != NULL) { @@ -670,8 +698,9 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left, STORE8x8(out, dst); } -static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_MSA(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { // U block DCMode8x8(C8DC8 + dst, left, top); VerticalPred8x8(C8VE8 + dst, top); @@ -712,7 +741,8 @@ static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left, DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \ } while (0) -static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MSA(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -739,7 +769,8 @@ static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MSA(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -758,7 +789,8 @@ static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MSA(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -778,7 +810,8 @@ static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MSA(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32_t sum = 0; uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1; @@ -801,7 +834,7 @@ static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) { // Quantization static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int sum; v8i16 in0, in1, sh0, sh1, out0, out1; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1; @@ -854,7 +887,7 @@ static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16], } static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 7ba5b2d6..30a66fc5 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -60,8 +60,8 @@ static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst, static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01, const int16x8_t row23, - const uint8_t* const ref, - uint8_t* const dst) { + const uint8_t* WEBP_RESTRICT const ref, + uint8_t* WEBP_RESTRICT const dst) { uint32x2_t dst01 = vdup_n_u32(0); uint32x2_t dst23 = vdup_n_u32(0); @@ -120,8 +120,9 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) { Transpose8x2_NEON(E0, E1, rows); } -static void ITransformOne_NEON(const uint8_t* ref, - const int16_t* in, uint8_t* dst) { +static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { int16x8x2_t rows; INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); TransformPass_NEON(&rows); @@ -131,8 +132,9 @@ static void ITransformOne_NEON(const uint8_t* ref, #else -static void ITransformOne_NEON(const uint8_t* ref, - const int16_t* in, uint8_t* dst) { +static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { const int kBPS = BPS; const int16_t kC1C2[] = { kC1, kC2, 0, 0 }; @@ -247,8 +249,9 @@ static void ITransformOne_NEON(const uint8_t* ref, #endif // WEBP_USE_INTRINSICS -static void ITransform_NEON(const uint8_t* ref, - const int16_t* in, uint8_t* dst, int do_two) { +static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { ITransformOne_NEON(ref, in, dst); if (do_two) { ITransformOne_NEON(ref + 4, in + 16, dst + 4); @@ -294,8 +297,9 @@ static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a, return vreinterpretq_s16_u16(vsubl_u8(a, b)); } -static void FTransform_NEON(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { int16x8_t d0d1, d3d2; // working 4x4 int16 variables { const uint8x16_t S0 = Load4x4_NEON(src); @@ -364,8 +368,9 @@ static const int32_t kCoeff32[] = { 51000, 51000, 51000, 51000 }; -static void FTransform_NEON(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { const int kBPS = BPS; const uint8_t* src_ptr = src; const uint8_t* ref_ptr = ref; @@ -484,7 +489,8 @@ static void FTransform_NEON(const uint8_t* src, const uint8_t* ref, src += stride; \ } while (0) -static void FTransformWHT_NEON(const int16_t* src, int16_t* out) { +static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src, + int16_t* WEBP_RESTRICT out) { const int stride = 16; const int16x4_t zero = vdup_n_s16(0); int32x4x4_t tmp0; @@ -659,8 +665,9 @@ static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in, // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { uint32x2_t d_in_ab_0123 = vdup_n_u32(0); uint32x2_t d_in_ab_4567 = vdup_n_u32(0); uint32x2_t d_in_ab_89ab = vdup_n_u32(0); @@ -701,8 +708,9 @@ static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b, } #undef LOAD_LANE_32b -static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -715,9 +723,10 @@ static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b, //------------------------------------------------------------------------------ -static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred, +static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref, + const uint8_t* WEBP_RESTRICT pred, int start_block, int end_block, - VP8Histogram* const histo) { + VP8Histogram* WEBP_RESTRICT const histo) { const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; @@ -747,9 +756,9 @@ static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred, //------------------------------------------------------------------------------ -static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a, - const uint8_t* const b, - uint32x4_t* const sum) { +static WEBP_INLINE void AccumulateSSE16_NEON( + const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b, + uint32x4_t* const sum) { const uint8x16_t a0 = vld1q_u8(a); const uint8x16_t b0 = vld1q_u8(b); const uint8x16_t abs_diff = vabdq_u8(a0, b0); @@ -775,7 +784,8 @@ static int SumToInt_NEON(uint32x4_t sum) { #endif } -static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 16; ++y) { @@ -784,7 +794,8 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) { return SumToInt_NEON(sum); } -static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 8; ++y) { @@ -793,7 +804,8 @@ static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) { return SumToInt_NEON(sum); } -static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 8; ++y) { @@ -806,7 +818,8 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) { return SumToInt_NEON(sum); } -static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { const uint8x16_t a0 = Load4x4_NEON(a); const uint8x16_t b0 = Load4x4_NEON(b); const uint8x16_t abs_diff = vabdq_u8(a0, b0); @@ -825,8 +838,9 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) { // Compilation with gcc-4.6.x is problematic for now. #if !defined(WORK_AROUND_GCC) -static int16x8_t Quantize_NEON(int16_t* const in, - const VP8Matrix* const mtx, int offset) { +static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in, + const VP8Matrix* WEBP_RESTRICT const mtx, + int offset) { const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); @@ -860,7 +874,7 @@ static const uint8_t kShuffles[4][8] = { }; static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { const int16x8_t out0 = Quantize_NEON(in, mtx, 0); const int16x8_t out1 = Quantize_NEON(in, mtx, 8); uint8x8x4_t shuffles; @@ -902,7 +916,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16], } static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -930,7 +944,8 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32], vst1q_u8(dst, r); \ } while (0) -static void Intra4Preds_NEON(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 // L K J I X A B C D E F G H // -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 @@ -1162,8 +1177,9 @@ static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left, } } -static void Intra16Preds_NEON(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { DCMode_NEON(I16DC16 + dst, left, top); VerticalPred16_NEON(I16VE16 + dst, top); HorizontalPred16_NEON(I16HE16 + dst, left); diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index 010624a2..588a6292 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -26,8 +26,9 @@ // Transforms (Paragraph 14.4) // Does one inverse transform. -static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static void ITransform_One_SSE2(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -177,8 +178,9 @@ static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in, } // Does two inverse transforms. -static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static void ITransform_Two_SSE2(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -316,7 +318,9 @@ static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in, } // Does one or two inverse transforms. -static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, +static void ITransform_SSE2(const uint8_t* WEBP_RESTRICT ref, + const int16_t* WEBP_RESTRICT in, + uint8_t* WEBP_RESTRICT dst, int do_two) { if (do_two) { ITransform_Two_SSE2(ref, in, dst); @@ -373,7 +377,7 @@ static void FTransformPass1_SSE2(const __m128i* const in01, static void FTransformPass2_SSE2(const __m128i* const v01, const __m128i* const v32, - int16_t* out) { + int16_t* WEBP_RESTRICT out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, @@ -424,8 +428,9 @@ static void FTransformPass2_SSE2(const __m128i* const v01, _mm_storeu_si128((__m128i*)&out[8], d2_f3); } -static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_SSE2(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { const __m128i zero = _mm_setzero_si128(); // Load src. const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); @@ -468,8 +473,9 @@ static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref, FTransformPass2_SSE2(&v01, &v32, out); } -static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform2_SSE2(const uint8_t* WEBP_RESTRICT src, + const uint8_t* WEBP_RESTRICT ref, + int16_t* WEBP_RESTRICT out) { const __m128i zero = _mm_setzero_si128(); // Load src and convert to 16b. @@ -517,7 +523,8 @@ static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref, FTransformPass2_SSE2(&v01h, &v32h, out + 16); } -static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) { +static void FTransformWHTRow_SSE2(const int16_t* WEBP_RESTRICT const in, + __m128i* const out) { const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1); const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]); @@ -533,7 +540,8 @@ static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) { *out = _mm_madd_epi16(D, kMult); } -static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) { +static void FTransformWHT_SSE2(const int16_t* WEBP_RESTRICT in, + int16_t* WEBP_RESTRICT out) { // Input is 12b signed. __m128i row0, row1, row2, row3; // Rows are 14b signed. @@ -566,9 +574,10 @@ static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) { // Compute susceptibility based on DCT-coeff histograms: // the higher, the "easier" the macroblock is to compress. -static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred, +static void CollectHistogram_SSE2(const uint8_t* WEBP_RESTRICT ref, + const uint8_t* WEBP_RESTRICT pred, int start_block, int end_block, - VP8Histogram* const histo) { + VP8Histogram* WEBP_RESTRICT const histo) { const __m128i zero = _mm_setzero_si128(); const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); int j; @@ -640,7 +649,8 @@ static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) { } } -static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VE8uv_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { int j; const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); for (j = 0; j < 8; ++j) { @@ -648,7 +658,8 @@ static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) { } } -static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VE16_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i top_values = _mm_load_si128((const __m128i*)top); int j; for (j = 0; j < 16; ++j) { @@ -656,8 +667,9 @@ static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) { } } -static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst, - const uint8_t* top, int size) { +static WEBP_INLINE void VerticalPred_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top, + int size) { if (top != NULL) { if (size == 8) { VE8uv_SSE2(dst, top); @@ -669,7 +681,8 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst, } } -static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void HE8uv_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { int j; for (j = 0; j < 8; ++j) { const __m128i values = _mm_set1_epi8((char)left[j]); @@ -678,7 +691,8 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { } } -static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void HE16_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { int j; for (j = 0; j < 16; ++j) { const __m128i values = _mm_set1_epi8((char)left[j]); @@ -687,8 +701,9 @@ static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) { } } -static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst, - const uint8_t* left, int size) { +static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + int size) { if (left != NULL) { if (size == 8) { HE8uv_SSE2(dst, left); @@ -700,8 +715,9 @@ static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst, } } -static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top, int size) { +static WEBP_INLINE void TM_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top, int size) { const __m128i zero = _mm_setzero_si128(); int y; if (size == 8) { @@ -728,8 +744,10 @@ static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top, int size) { +static WEBP_INLINE void TrueMotion_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top, + int size) { if (left != NULL) { if (top != NULL) { TM_SSE2(dst, left, top, size); @@ -749,8 +767,9 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC8uv_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); const __m128i left_values = _mm_loadl_epi64((const __m128i*)left); const __m128i combined = _mm_unpacklo_epi64(top_values, left_values); @@ -758,7 +777,8 @@ static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left, Put8x8uv_SSE2(DC >> 4, dst); } -static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i zero = _mm_setzero_si128(); const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); const __m128i sum = _mm_sad_epu8(top_values, zero); @@ -766,7 +786,8 @@ static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) { Put8x8uv_SSE2(DC >> 3, dst); } -static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { // 'left' is contiguous so we can reuse the top summation. DC8uvNoLeft_SSE2(dst, left); } @@ -775,8 +796,9 @@ static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) { Put8x8uv_SSE2(0x80, dst); } -static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { if (top != NULL) { if (left != NULL) { // top and left present DC8uv_SSE2(dst, left, top); @@ -790,8 +812,9 @@ static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC16_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { const __m128i top_row = _mm_load_si128((const __m128i*)top); const __m128i left_row = _mm_load_si128((const __m128i*)left); const int DC = @@ -799,13 +822,15 @@ static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left, Put16_SSE2(DC >> 5, dst); } -static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i top_row = _mm_load_si128((const __m128i*)top); const int DC = VP8HorizontalAdd8b(&top_row) + 8; Put16_SSE2(DC >> 4, dst); } -static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left) { // 'left' is contiguous so we can reuse the top summation. DC16NoLeft_SSE2(dst, left); } @@ -814,8 +839,9 @@ static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) { Put16_SSE2(0x80, dst); } -static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC16Mode_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { if (top != NULL) { if (left != NULL) { // top and left present DC16_SSE2(dst, left, top); @@ -844,8 +870,9 @@ static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left, // where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1 // and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1 -static WEBP_INLINE void VE4_SSE2(uint8_t* dst, - const uint8_t* top) { // vertical +// vertical +static WEBP_INLINE void VE4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1)); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); @@ -861,8 +888,9 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst, } } -static WEBP_INLINE void HE4_SSE2(uint8_t* dst, - const uint8_t* top) { // horizontal +// horizontal +static WEBP_INLINE void HE4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -874,15 +902,17 @@ static WEBP_INLINE void HE4_SSE2(uint8_t* dst, WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); } -static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { uint32_t dc = 4; int i; for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; Fill_SSE2(dst, dc >> 3, 4); } -static WEBP_INLINE void LD4_SSE2(uint8_t* dst, - const uint8_t* top) { // Down-Left +// Down-Left +static WEBP_INLINE void LD4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); @@ -898,8 +928,9 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst, WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } -static WEBP_INLINE void VR4_SSE2(uint8_t* dst, - const uint8_t* top) { // Vertical-Right +// Vertical-Right +static WEBP_INLINE void VR4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i one = _mm_set1_epi8(1); const int I = top[-2]; const int J = top[-3]; @@ -924,8 +955,9 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst, DST(0, 3) = AVG3(K, J, I); } -static WEBP_INLINE void VL4_SSE2(uint8_t* dst, - const uint8_t* top) { // Vertical-Left +// Vertical-Left +static WEBP_INLINE void VL4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top); const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1); @@ -951,8 +983,9 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst, DST(3, 3) = (extra_out >> 8) & 0xff; } -static WEBP_INLINE void RD4_SSE2(uint8_t* dst, - const uint8_t* top) { // Down-right +// Down-right +static WEBP_INLINE void RD4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i one = _mm_set1_epi8(1); const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5)); const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4); @@ -968,7 +1001,8 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst, WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } -static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void HU4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int I = top[-2]; const int J = top[-3]; const int K = top[-4]; @@ -983,7 +1017,8 @@ static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) { DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } -static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void HD4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -1006,7 +1041,8 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) { DST(1, 3) = AVG3(L, K, J); } -static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void TM4_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { const __m128i zero = _mm_setzero_si128(); const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); @@ -1028,7 +1064,8 @@ static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) { // Left samples are top[-5 .. -2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT top) { DC4_SSE2(I4DC4 + dst, top); TM4_SSE2(I4TM4 + dst, top); VE4_SSE2(I4VE4 + dst, top); @@ -1044,8 +1081,9 @@ static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) { //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { // U block DC8uvMode_SSE2(C8DC8 + dst, left, top); VerticalPred_SSE2(C8VE8 + dst, top, 8); @@ -1064,8 +1102,9 @@ static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) -static void Intra16Preds_SSE2(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_SSE2(uint8_t* WEBP_RESTRICT dst, + const uint8_t* WEBP_RESTRICT left, + const uint8_t* WEBP_RESTRICT top) { DC16Mode_SSE2(I16DC16 + dst, left, top); VerticalPred_SSE2(I16VE16 + dst, top, 16); HorizontalPred_SSE2(I16HE16 + dst, left, 16); @@ -1092,7 +1131,8 @@ static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a, *sum = _mm_add_epi32(sum1, sum2); } -static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b, +static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b, int num_pairs) { __m128i sum = _mm_setzero_si128(); int32_t tmp[4]; @@ -1114,18 +1154,21 @@ static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b, return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); } -static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_SSE2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return SSE_16xN_SSE2(a, b, 8); } -static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_SSE2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { return SSE_16xN_SSE2(a, b, 4); } #define LOAD_8x16b(ptr) \ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero) -static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_SSE2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { const __m128i zero = _mm_setzero_si128(); int num_pairs = 4; __m128i sum = zero; @@ -1152,7 +1195,8 @@ static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) { } #undef LOAD_8x16b -static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_SSE2(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT b) { const __m128i zero = _mm_setzero_si128(); // Load values. Note that we read 8 pixels instead of 4, @@ -1189,7 +1233,7 @@ static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) { //------------------------------------------------------------------------------ -static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) { +static void Mean16x4_SSE2(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) { const __m128i mask = _mm_set1_epi16(0x00ff); const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]); @@ -1227,8 +1271,9 @@ static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) { // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB, - const uint16_t* const w) { +static int TTransform_SSE2(const uint8_t* WEBP_RESTRICT inA, + const uint8_t* WEBP_RESTRICT inB, + const uint16_t* WEBP_RESTRICT const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); @@ -1328,14 +1373,16 @@ static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB, return sum[0] + sum[1] + sum[2] + sum[3]; } -static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_SSE2(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { const int diff_sum = TTransform_SSE2(a, b, w); return abs(diff_sum) >> 5; } -static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_SSE2(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -1350,9 +1397,10 @@ static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b, // Quantization // -static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16], - const uint16_t* const sharpen, - const VP8Matrix* const mtx) { +static WEBP_INLINE int DoQuantizeBlock_SSE2( + int16_t in[16], int16_t out[16], + const uint16_t* WEBP_RESTRICT const sharpen, + const VP8Matrix* WEBP_RESTRICT const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i coeff0, coeff8; @@ -1463,17 +1511,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16], } static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx); } static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { return DoQuantizeBlock_SSE2(in, out, NULL, mtx); } static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; const uint16_t* const sharpen = &mtx->sharpen_[0]; nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; diff --git a/src/dsp/enc_sse41.c b/src/dsp/enc_sse41.c index 924035a6..613c44cf 100644 --- a/src/dsp/enc_sse41.c +++ b/src/dsp/enc_sse41.c @@ -23,9 +23,10 @@ //------------------------------------------------------------------------------ // Compute susceptibility based on DCT-coeff histograms. -static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred, +static void CollectHistogram_SSE41(const uint8_t* WEBP_RESTRICT ref, + const uint8_t* WEBP_RESTRICT pred, int start_block, int end_block, - VP8Histogram* const histo) { + VP8Histogram* WEBP_RESTRICT const histo) { const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; @@ -168,14 +169,16 @@ static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB, return sum[0] + sum[1] + sum[2] + sum[3]; } -static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_SSE41(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { const int diff_sum = TTransform_SSE41(a, b, w); return abs(diff_sum) >> 5; } -static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_SSE41(const uint8_t* WEBP_RESTRICT const a, + const uint8_t* WEBP_RESTRICT const b, + const uint16_t* WEBP_RESTRICT const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { @@ -301,17 +304,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16], #undef PSHUFB_CST static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx); } static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { return DoQuantizeBlock_SSE41(in, out, NULL, mtx); } static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { + const VP8Matrix* WEBP_RESTRICT const mtx) { int nz; const uint16_t* const sharpen = &mtx->sharpen_[0]; nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;