From 23bbafbeb8ea2bbd701738d712855a5bb7330838 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 16 Aug 2024 19:02:19 -0700 Subject: [PATCH] dsp/upsampling*: use WEBP_RESTRICT qualifier Better vectorization in the C code, fewer instructions in NEON, and some code reordering / better register usage in SSE2/SSE4 w/ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: Ib29980f778ad3dbb952178ad8dee39b8673c4ff8 --- src/dsp/dsp.h | 29 +++++++++-------- src/dsp/upsampling.c | 36 ++++++++++++++------- src/dsp/upsampling_mips_dsp_r2.c | 18 +++++++---- src/dsp/upsampling_msa.c | 55 +++++++++++++++++++++----------- src/dsp/upsampling_neon.c | 17 ++++++---- src/dsp/upsampling_sse2.c | 29 +++++++++++------ src/dsp/upsampling_sse41.c | 29 +++++++++++------ 7 files changed, 137 insertions(+), 76 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index b7c2acf0..8e0b4143 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -285,10 +285,10 @@ void VP8DspInit(void); // Convert a pair of y/u/v lines together to the output rgb/a colorspace. // bottom_y can be NULL if only one line of output is needed (at top/bottom). typedef void (*WebPUpsampleLinePairFunc)( - const uint8_t* top_y, const uint8_t* bottom_y, - const uint8_t* top_u, const uint8_t* top_v, - const uint8_t* cur_u, const uint8_t* cur_v, - uint8_t* top_dst, uint8_t* bottom_dst, int len); + const uint8_t* WEBP_RESTRICT top_y, const uint8_t* WEBP_RESTRICT bottom_y, + const uint8_t* WEBP_RESTRICT top_u, const uint8_t* WEBP_RESTRICT top_v, + const uint8_t* WEBP_RESTRICT cur_u, const uint8_t* WEBP_RESTRICT cur_v, + uint8_t* WEBP_RESTRICT top_dst, uint8_t* WEBP_RESTRICT bottom_dst, int len); #ifdef FANCY_UPSAMPLING @@ -298,13 +298,15 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; #endif // FANCY_UPSAMPLING // Per-row point-sampling methods. -typedef void (*WebPSamplerRowFunc)(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len); +typedef void (*WebPSamplerRowFunc)(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len); // Generic function to apply 'WebPSamplerRowFunc' to the whole plane: -void WebPSamplerProcessPlane(const uint8_t* y, int y_stride, - const uint8_t* u, const uint8_t* v, int uv_stride, - uint8_t* dst, int dst_stride, +void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, int uv_stride, + uint8_t* WEBP_RESTRICT dst, int dst_stride, int width, int height, WebPSamplerRowFunc func); // Sampling functions to convert rows of YUV to RGB(A) @@ -316,9 +318,10 @@ extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */]; WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last); // YUV444->RGB converters -typedef void (*WebPYUV444Converter)(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len); +typedef void (*WebPYUV444Converter)(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len); extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; diff --git a/src/dsp/upsampling.c b/src/dsp/upsampling.c index 983b9c42..5953fe48 100644 --- a/src/dsp/upsampling.c +++ b/src/dsp/upsampling.c @@ -35,10 +35,14 @@ WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST]; #define LOAD_UV(u, v) ((u) | ((v) << 16)) #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bottom_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bottom_dst, int len) { \ int x; \ const int last_pixel_pair = (len - 1) >> 1; \ uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ @@ -136,10 +140,14 @@ static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y, #if !defined(FANCY_UPSAMPLING) #define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* bot_u, const uint8_t* bot_v, \ - uint8_t* top_dst, uint8_t* bot_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bot_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT bot_u, \ + const uint8_t* WEBP_RESTRICT bot_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bot_dst, int len) { \ const int half_len = len >> 1; \ int x; \ assert(top_dst != NULL); \ @@ -178,10 +186,14 @@ WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) { // YUV444 converter #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \ -extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len); \ -void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +extern void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len); \ +void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i; \ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \ } diff --git a/src/dsp/upsampling_mips_dsp_r2.c b/src/dsp/upsampling_mips_dsp_r2.c index 10d499d7..cbe8e71d 100644 --- a/src/dsp/upsampling_mips_dsp_r2.c +++ b/src/dsp/upsampling_mips_dsp_r2.c @@ -143,10 +143,14 @@ static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, #define LOAD_UV(u, v) ((u) | ((v) << 16)) #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bottom_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bottom_dst, int len) { \ int x; \ const int last_pixel_pair = (len - 1) >> 1; \ uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ @@ -241,8 +245,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) { // YUV444 converter #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i; \ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \ } diff --git a/src/dsp/upsampling_msa.c b/src/dsp/upsampling_msa.c index f2e03e85..72a526bc 100644 --- a/src/dsp/upsampling_msa.c +++ b/src/dsp/upsampling_msa.c @@ -320,8 +320,10 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) { } #if !defined(WEBP_REDUCE_CSP) -static void YuvToRgbLine(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToRgbLine(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B; while (length >= 16) { CALC_RGB16(y, u, v, R, G, B); @@ -347,8 +349,10 @@ static void YuvToRgbLine(const uint8_t* y, const uint8_t* u, } } -static void YuvToBgrLine(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToBgrLine(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B; while (length >= 16) { CALC_RGB16(y, u, v, R, G, B); @@ -375,8 +379,10 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u, } #endif // WEBP_REDUCE_CSP -static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToRgbaLine(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B; const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); while (length >= 16) { @@ -403,8 +409,10 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u, } } -static void YuvToBgraLine(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToBgraLine(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B; const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); while (length >= 16) { @@ -432,8 +440,10 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u, } #if !defined(WEBP_REDUCE_CSP) -static void YuvToArgbLine(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToArgbLine(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B; const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); while (length >= 16) { @@ -460,8 +470,10 @@ static void YuvToArgbLine(const uint8_t* y, const uint8_t* u, } } -static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToRgba4444Line(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B, RG, BA, tmp0, tmp1; while (length >= 16) { #if (WEBP_SWAP_16BIT_CSP == 1) @@ -496,8 +508,10 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u, } } -static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst, int length) { +static void YuvToRgb565Line(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int length) { v16u8 R, G, B, RG, GB, tmp0, tmp1; while (length >= 16) { #if (WEBP_SWAP_16BIT_CSP == 1) @@ -564,11 +578,14 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, } while (0) #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bot_dst, int len) \ -{ \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bot_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bot_dst, int len) { \ int size = (len - 1) >> 1; \ uint8_t temp_u[64]; \ uint8_t temp_v[64]; \ diff --git a/src/dsp/upsampling_neon.c b/src/dsp/upsampling_neon.c index f39d75e0..2bd3e931 100644 --- a/src/dsp/upsampling_neon.c +++ b/src/dsp/upsampling_neon.c @@ -58,8 +58,9 @@ } while (0) // Turn the macro into a function for reducing code-size when non-critical -static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2, - uint8_t* out) { +static void Upsample16Pixels_NEON(const uint8_t* WEBP_RESTRICT const r1, + const uint8_t* WEBP_RESTRICT const r2, + uint8_t* WEBP_RESTRICT const out) { UPSAMPLE_16PIXELS(r1, r2, out); } @@ -190,10 +191,14 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 }; } #define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bottom_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bottom_dst, int len) { \ int block; \ /* 16 byte aligned array to cache reconstructed u and v */ \ uint8_t uv_buf[2 * 32 + 15]; \ diff --git a/src/dsp/upsampling_sse2.c b/src/dsp/upsampling_sse2.c index 77b4f722..36226fb1 100644 --- a/src/dsp/upsampling_sse2.c +++ b/src/dsp/upsampling_sse2.c @@ -88,8 +88,9 @@ } while (0) // Turn the macro into a function for reducing code-size when non-critical -static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[], - uint8_t* const out) { +static void Upsample32Pixels_SSE2(const uint8_t* WEBP_RESTRICT const r1, + const uint8_t* WEBP_RESTRICT const r2, + uint8_t* WEBP_RESTRICT const out) { UPSAMPLE_32PIXELS(r1, r2, out); } @@ -114,10 +115,14 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[], } while (0) #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bottom_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bottom_dst, int len) { \ int uv_pos, pos; \ /* 16byte-aligned array to cache reconstructed u and v */ \ uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ @@ -215,10 +220,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; extern void WebPInitYUV444ConvertersSSE2(void); #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \ -extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len); \ -static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len); \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i; \ const int max_len = len & ~31; \ for (i = 0; i < max_len; i += 32) { \ diff --git a/src/dsp/upsampling_sse41.c b/src/dsp/upsampling_sse41.c index a880974a..823633c4 100644 --- a/src/dsp/upsampling_sse41.c +++ b/src/dsp/upsampling_sse41.c @@ -90,8 +90,9 @@ } while (0) // Turn the macro into a function for reducing code-size when non-critical -static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[], - uint8_t* const out) { +static void Upsample32Pixels_SSE41(const uint8_t* WEBP_RESTRICT const r1, + const uint8_t* WEBP_RESTRICT const r2, + uint8_t* WEBP_RESTRICT const out) { UPSAMPLE_32PIXELS(r1, r2, out); } @@ -116,10 +117,14 @@ static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[], } while (0) #define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \ + const uint8_t* WEBP_RESTRICT bottom_y, \ + const uint8_t* WEBP_RESTRICT top_u, \ + const uint8_t* WEBP_RESTRICT top_v, \ + const uint8_t* WEBP_RESTRICT cur_u, \ + const uint8_t* WEBP_RESTRICT cur_v, \ + uint8_t* WEBP_RESTRICT top_dst, \ + uint8_t* WEBP_RESTRICT bottom_dst, int len) { \ int uv_pos, pos; \ /* 16byte-aligned array to cache reconstructed u and v */ \ uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ @@ -202,10 +207,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; extern void WebPInitYUV444ConvertersSSE41(void); #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \ -extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len); \ -static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len); \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i; \ const int max_len = len & ~31; \ for (i = 0; i < max_len; i += 32) { \