From 2dd5eb9862d4822d2746cfb55f4b59b53bda99eb Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 16 Aug 2024 19:26:36 -0700 Subject: [PATCH] dsp/yuv*: use WEBP_RESTRICT qualifier Better vectorization in the C code, fewer instructions / comparisons in NEON, and fewer reloads in SSE2/SSE4.1 with ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow-up. Change-Id: I07a7e36a2dce8632c71c0fbbeef94dc51453eaf7 --- src/dsp/dsp.h | 27 +++++--- src/dsp/yuv.c | 48 ++++++++------ src/dsp/yuv.h | 54 ++++++++++------ src/dsp/yuv_mips32.c | 7 +- src/dsp/yuv_mips_dsp_r2.c | 7 +- src/dsp/yuv_neon.c | 18 ++++-- src/dsp/yuv_sse2.c | 130 ++++++++++++++++++++++---------------- src/dsp/yuv_sse41.c | 65 +++++++++++-------- 8 files changed, 218 insertions(+), 138 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 8e0b4143..1b37ef4b 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -337,26 +337,35 @@ void WebPInitYUV444Converters(void); // ARGB -> YUV converters // Convert ARGB samples to luma Y. -extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width); +extern void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width); // Convert ARGB samples to U/V with downsampling. do_store should be '1' for // even lines and '0' for odd ones. 'src_width' is the original width, not // the U/V one. 
-extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v, +extern void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int src_width, int do_store); // Convert a row of accumulated (four-values) of rgba32 toward U/V -extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width); +extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width); // Convert RGB or BGR to Y -extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width); -extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width); +extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width); +extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width); // used for plain-C fallback. -extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v, +extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int src_width, int do_store); -extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width); +extern void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width); // Must be called before using the above. 
void WebPInitConvertARGBToYUV(void); diff --git a/src/dsp/yuv.c b/src/dsp/yuv.c index 8a04b85d..c1320f28 100644 --- a/src/dsp/yuv.c +++ b/src/dsp/yuv.c @@ -20,9 +20,10 @@ // Plain-C version #define ROW_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static void FUNC_NAME(const uint8_t* y, \ - const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ const uint8_t* const end = dst + (len & ~1) * (XSTEP); \ while (dst != end) { \ FUNC(y[0], u[0], v[0], dst); \ @@ -49,9 +50,10 @@ ROW_FUNC(YuvToRgb565Row, VP8YuvToRgb565, 2) #undef ROW_FUNC // Main call for processing a plane with a WebPSamplerRowFunc function: -void WebPSamplerProcessPlane(const uint8_t* y, int y_stride, - const uint8_t* u, const uint8_t* v, int uv_stride, - uint8_t* dst, int dst_stride, +void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, int uv_stride, + uint8_t* WEBP_RESTRICT dst, int dst_stride, int width, int height, WebPSamplerRowFunc func) { int j; for (j = 0; j < height; ++j) { @@ -117,7 +119,8 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) { //----------------------------------------------------------------------------- // ARGB -> YUV converters -static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_C(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i < width; ++i) { const uint32_t p = argb[i]; @@ -126,7 +129,8 @@ static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) { } } -void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v, +void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, int src_width, int do_store) { // No rounding. 
Last pixel is dealt with separately. const int uv_width = src_width >> 1; @@ -169,22 +173,25 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v, //----------------------------------------------------------------------------- -static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_C(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i < width; ++i, rgb += 3) { y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); } } -static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_C(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i < width; ++i, bgr += 3) { y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); } } -void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width) { +void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, + int width) { int i; for (i = 0; i < width; i += 1, rgb += 4) { const int r = rgb[0], g = rgb[1], b = rgb[2]; @@ -195,13 +202,18 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, //----------------------------------------------------------------------------- -void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width); -void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width); -void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width); +void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width); +void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width); +void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width); -void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width); -void (*WebPConvertARGBToUV)(const uint32_t* argb, 
uint8_t* u, uint8_t* v, +void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width); +void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, int src_width, int do_store); extern void WebPInitConvertARGBToYUVSSE2(void); diff --git a/src/dsp/yuv.h b/src/dsp/yuv.h index 66a397d1..91fdba12 100644 --- a/src/dsp/yuv.h +++ b/src/dsp/yuv.h @@ -149,20 +149,34 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v, #if defined(WEBP_USE_SSE2) // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst. -void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst); -void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); +void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToArgb32_SSE2(const uint8_t* 
WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); #endif // WEBP_USE_SSE2 @@ -172,10 +186,14 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, #if defined(WEBP_USE_SSE41) // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst. -void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); +void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); +void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst); #endif // WEBP_USE_SSE41 diff --git a/src/dsp/yuv_mips32.c b/src/dsp/yuv_mips32.c index 9d0a8878..1f634858 100644 --- a/src/dsp/yuv_mips32.c +++ b/src/dsp/yuv_mips32.c @@ -22,9 +22,10 @@ // simple point-sampling #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \ -static void FUNC_NAME(const uint8_t* y, \ - const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i, r, g, b; \ int temp0, temp1, temp2, temp3, temp4; \ for (i = 0; i < (len >> 1); i++) { \ diff --git a/src/dsp/yuv_mips_dsp_r2.c b/src/dsp/yuv_mips_dsp_r2.c index cc8afcc7..816340fe 100644 --- a/src/dsp/yuv_mips_dsp_r2.c +++ b/src/dsp/yuv_mips_dsp_r2.c @@ -69,9 +69,10 
@@ : "memory", "hi", "lo" \ #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \ -static void FUNC_NAME(const uint8_t* y, \ - const uint8_t* u, const uint8_t* v, \ - uint8_t* dst, int len) { \ +static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \ + const uint8_t* WEBP_RESTRICT u, \ + const uint8_t* WEBP_RESTRICT v, \ + uint8_t* WEBP_RESTRICT dst, int len) { \ int i; \ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \ const int t_con_1 = 26149; \ diff --git a/src/dsp/yuv_neon.c b/src/dsp/yuv_neon.c index ff77b009..b1b7c604 100644 --- a/src/dsp/yuv_neon.c +++ b/src/dsp/yuv_neon.c @@ -46,7 +46,8 @@ static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, return vqmovn_u16(Y2); } -static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_NEON(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) { const uint8x8x3_t RGB = vld3_u8(rgb); @@ -58,7 +59,8 @@ static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) { } } -static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_NEON(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) { const uint8x8x3_t BGR = vld3_u8(bgr); @@ -70,7 +72,8 @@ static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) { } } -static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_NEON(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width) { int i; for (i = 0; i + 8 <= width; i += 8) { const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]); @@ -114,8 +117,9 @@ static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) { MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \ } while (0) -static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb, - 
uint8_t* u, uint8_t* v, int width) { +static void ConvertRGBA32ToUV_NEON(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width) { int i; for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) { const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb); @@ -131,7 +135,9 @@ static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb, } } -static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v, +static void ConvertARGBToUV_NEON(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int src_width, int do_store) { int i; for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) { diff --git a/src/dsp/yuv_sse2.c b/src/dsp/yuv_sse2.c index 01a48f9a..a96b4522 100644 --- a/src/dsp/yuv_sse2.c +++ b/src/dsp/yuv_sse2.c @@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) { } // Convert 32 samples of YUV444 to R/G/B -static void YUV444ToRGB_SSE2(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, +static void YUV444ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y, + const uint8_t* WEBP_RESTRICT const u, + const uint8_t* WEBP_RESTRICT const v, __m128i* const R, __m128i* const G, __m128i* const B) { const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u), @@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE2(const uint8_t* const y, } // Convert 32 samples of YUV420 to R/G/B -static void YUV420ToRGB_SSE2(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, +static void YUV420ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y, + const uint8_t* WEBP_RESTRICT const u, + const uint8_t* WEBP_RESTRICT const v, __m128i* const R, __m128i* const G, __m128i* const B) { const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u), @@ -108,7 +108,7 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R, const __m128i* const G, const __m128i* const B, const __m128i* const A, - uint8_t* const dst) { + uint8_t* 
WEBP_RESTRICT const dst) { const __m128i rb = _mm_packus_epi16(*R, *B); const __m128i ga = _mm_packus_epi16(*G, *A); const __m128i rg = _mm_unpacklo_epi8(rb, ga); @@ -120,11 +120,9 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R, } // Pack R/G/B/A results into 16b output. -static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - const __m128i* const A, - uint8_t* const dst) { +static WEBP_INLINE void PackAndStore4444_SSE2( + const __m128i* const R, const __m128i* const G, const __m128i* const B, + const __m128i* const A, uint8_t* WEBP_RESTRICT const dst) { #if (WEBP_SWAP_16BIT_CSP == 0) const __m128i rg0 = _mm_packus_epi16(*R, *G); const __m128i ba0 = _mm_packus_epi16(*B, *A); @@ -145,7 +143,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R, static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R, const __m128i* const G, const __m128i* const B, - uint8_t* const dst) { + uint8_t* WEBP_RESTRICT const dst) { const __m128i r0 = _mm_packus_epi16(*R, *R); const __m128i g0 = _mm_packus_epi16(*G, *G); const __m128i b0 = _mm_packus_epi16(*B, *B); @@ -170,7 +168,7 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R, static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1, __m128i* const in2, __m128i* const in3, __m128i* const in4, __m128i* const in5, - uint8_t* const rgb) { + uint8_t* WEBP_RESTRICT const rgb) { // The input is 6 registers of sixteen 8b but for the sake of explanation, // let's take 6 registers of four 8b values. 
// To pack, we will keep taking one every two 8b integer and move it @@ -193,8 +191,10 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1, _mm_storeu_si128((__m128i*)(rgb + 80), *in5); } -void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { @@ -204,8 +204,10 @@ void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { @@ -215,8 +217,10 @@ void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { @@ -226,8 +230,10 @@ void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u, - const uint8_t* v, uint8_t* dst) { +void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 16) { @@ -237,8 +243,10 @@ void VP8YuvToRgba444432_SSE2(const uint8_t* y, 
const uint8_t* u, } } -void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { int n; for (n = 0; n < 32; n += 8, dst += 16) { __m128i R, G, B; @@ -247,8 +255,10 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; @@ -269,8 +279,10 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); } -void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; @@ -294,9 +306,10 @@ void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, //----------------------------------------------------------------------------- // Arbitrary-length row conversion functions -static void YuvToRgbaRow_SSE2(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToRgbaRow_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { @@ -316,9 +329,10 @@ static void YuvToRgbaRow_SSE2(const 
uint8_t* y, } } -static void YuvToBgraRow_SSE2(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToBgraRow_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { @@ -338,9 +352,10 @@ static void YuvToBgraRow_SSE2(const uint8_t* y, } } -static void YuvToArgbRow_SSE2(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToArgbRow_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { @@ -360,9 +375,10 @@ static void YuvToArgbRow_SSE2(const uint8_t* y, } } -static void YuvToRgbRow_SSE2(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToRgbRow_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; @@ -397,9 +413,10 @@ static void YuvToRgbRow_SSE2(const uint8_t* y, } } -static void YuvToBgrRow_SSE2(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToBgrRow_SSE2(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; @@ -471,7 +488,7 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2( // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // Similar to PlanarTo24bHelper(), but in reverse order. 
static WEBP_INLINE void RGB24PackedToPlanar_SSE2( - const uint8_t* const rgb, __m128i* const out /*out[6]*/) { + const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) { __m128i tmp[6]; tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0)); tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16)); @@ -488,8 +505,8 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE2( } // Convert 8 packed ARGB to r[], g[], b[] -static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb, - __m128i* const rgb /*in[6]*/) { +static WEBP_INLINE void RGB32PackedToPlanar_SSE2( + const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) { const __m128i zero = _mm_setzero_si128(); __m128i a0 = LOAD_16(argb + 0); __m128i a1 = LOAD_16(argb + 4); @@ -562,7 +579,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R, #undef MK_CST_16 #undef TRANSFORM -static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; rgb += 3 * 16 * 2) { @@ -596,7 +614,8 @@ static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) { } } -static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; bgr += 3 * 16 * 2) { @@ -630,7 +649,8 @@ static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) { } } -static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~15; int i; for (i = 0; i < max_width; i += 16) { @@ -658,8 +678,9 @@ static void HorizontalAddPack_SSE2(const __m128i* const A, 
*out = _mm_packs_epi32(C, D); } -static void ConvertARGBToUV_SSE2(const uint32_t* argb, - uint8_t* u, uint8_t* v, +static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int src_width, int do_store) { const int max_width = src_width & ~31; int i; @@ -695,7 +716,7 @@ static void ConvertARGBToUV_SSE2(const uint32_t* argb, // Convert 16 packed ARGB 16b-values to r[], g[], b[] static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2( - const uint16_t* const rgbx, + const uint16_t* WEBP_RESTRICT const rgbx, __m128i* const r, __m128i* const g, __m128i* const b) { const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x @@ -715,8 +736,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2( *b = _mm_unpacklo_epi64(B1, B3); } -static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width) { +static void ConvertRGBA32ToUV_SSE2(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width) { const int max_width = width & ~15; const uint16_t* const last_rgb = rgb + 4 * max_width; while (rgb < last_rgb) { diff --git a/src/dsp/yuv_sse41.c b/src/dsp/yuv_sse41.c index f79b802e..071e4908 100644 --- a/src/dsp/yuv_sse41.c +++ b/src/dsp/yuv_sse41.c @@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) { } // Convert 32 samples of YUV444 to R/G/B -static void YUV444ToRGB_SSE41(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, +static void YUV444ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y, + const uint8_t* WEBP_RESTRICT const u, + const uint8_t* WEBP_RESTRICT const v, __m128i* const R, __m128i* const G, __m128i* const B) { const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u), @@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE41(const uint8_t* const y, } // Convert 32 samples of YUV420 to 
R/G/B -static void YUV420ToRGB_SSE41(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, +static void YUV420ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y, + const uint8_t* WEBP_RESTRICT const u, + const uint8_t* WEBP_RESTRICT const v, __m128i* const R, __m128i* const G, __m128i* const B) { const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u), @@ -109,7 +109,7 @@ static void YUV420ToRGB_SSE41(const uint8_t* const y, static WEBP_INLINE void PlanarTo24b_SSE41( __m128i* const in0, __m128i* const in1, __m128i* const in2, __m128i* const in3, __m128i* const in4, __m128i* const in5, - uint8_t* const rgb) { + uint8_t* WEBP_RESTRICT const rgb) { // The input is 6 registers of sixteen 8b but for the sake of explanation, // let's take 6 registers of four 8b values. // To pack, we will keep taking one every two 8b integer and move it @@ -132,8 +132,10 @@ static WEBP_INLINE void PlanarTo24b_SSE41( _mm_storeu_si128((__m128i*)(rgb + 80), *in5); } -void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; @@ -154,8 +156,10 @@ void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); } -void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; @@ -179,9 +183,10 @@ void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, 
//----------------------------------------------------------------------------- // Arbitrary-length row conversion functions -static void YuvToRgbRow_SSE41(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToRgbRow_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; @@ -216,9 +221,10 @@ static void YuvToRgbRow_SSE41(const uint8_t* y, } } -static void YuvToBgrRow_SSE41(const uint8_t* y, - const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToBgrRow_SSE41(const uint8_t* WEBP_RESTRICT y, + const uint8_t* WEBP_RESTRICT u, + const uint8_t* WEBP_RESTRICT v, + uint8_t* WEBP_RESTRICT dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; @@ -290,7 +296,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) { // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // Similar to PlanarTo24bHelper(), but in reverse order. 
static WEBP_INLINE void RGB24PackedToPlanar_SSE41( - const uint8_t* const rgb, __m128i* const out /*out[6]*/) { + const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) { const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0)); const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16)); const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32)); @@ -334,7 +340,7 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE41( // Convert 8 packed ARGB to r[], g[], b[] static WEBP_INLINE void RGB32PackedToPlanar_SSE41( - const uint32_t* const argb, __m128i* const rgb /*in[6]*/) { + const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) { const __m128i zero = _mm_setzero_si128(); __m128i a0 = LOAD_16(argb + 0); __m128i a1 = LOAD_16(argb + 4); @@ -407,7 +413,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R, #undef MK_CST_16 #undef TRANSFORM -static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; rgb += 3 * 16 * 2) { @@ -441,7 +448,8 @@ static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) { } } -static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; bgr += 3 * 16 * 2) { @@ -475,7 +483,8 @@ static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) { } } -static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT y, int width) { const int max_width = width & ~15; int i; for (i = 0; i < max_width; i += 16) { @@ -503,8 +512,9 @@ static void 
HorizontalAddPack_SSE41(const __m128i* const A, *out = _mm_packs_epi32(C, D); } -static void ConvertARGBToUV_SSE41(const uint32_t* argb, - uint8_t* u, uint8_t* v, +static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int src_width, int do_store) { const int max_width = src_width & ~31; int i; @@ -540,7 +550,7 @@ static void ConvertARGBToUV_SSE41(const uint32_t* argb, // Convert 16 packed ARGB 16b-values to r[], g[], b[] static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41( - const uint16_t* const rgbx, + const uint16_t* WEBP_RESTRICT const rgbx, __m128i* const r, __m128i* const g, __m128i* const b) { const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x @@ -570,8 +580,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41( *b = _mm_unpackhi_epi64(B1, B3); } -static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width) { +static void ConvertRGBA32ToUV_SSE41(const uint16_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT u, + uint8_t* WEBP_RESTRICT v, int width) { const int max_width = width & ~15; const uint16_t* const last_rgb = rgb + 4 * max_width; while (rgb < last_rgb) {