dsp/upsampling*: use WEBP_RESTRICT qualifier

This gives better vectorization in the C code, fewer instructions in NEON,
and some code reordering / better register usage in SSE2/SSE4 when built
with ndk r27, gcc-13 and clang-16.

This only affects non-vector pointers; any vector pointers are left as a
follow-up.

Change-Id: Ib29980f778ad3dbb952178ad8dee39b8673c4ff8
This commit is contained in:
James Zern 2024-08-16 19:02:19 -07:00
parent 35915b389e
commit 23bbafbeb8
7 changed files with 137 additions and 76 deletions

View File

@ -285,10 +285,10 @@ void VP8DspInit(void);
// Convert a pair of y/u/v lines together to the output rgb/a colorspace. // Convert a pair of y/u/v lines together to the output rgb/a colorspace.
// bottom_y can be NULL if only one line of output is needed (at top/bottom). // bottom_y can be NULL if only one line of output is needed (at top/bottom).
typedef void (*WebPUpsampleLinePairFunc)( typedef void (*WebPUpsampleLinePairFunc)(
const uint8_t* top_y, const uint8_t* bottom_y, const uint8_t* WEBP_RESTRICT top_y, const uint8_t* WEBP_RESTRICT bottom_y,
const uint8_t* top_u, const uint8_t* top_v, const uint8_t* WEBP_RESTRICT top_u, const uint8_t* WEBP_RESTRICT top_v,
const uint8_t* cur_u, const uint8_t* cur_v, const uint8_t* WEBP_RESTRICT cur_u, const uint8_t* WEBP_RESTRICT cur_v,
uint8_t* top_dst, uint8_t* bottom_dst, int len); uint8_t* WEBP_RESTRICT top_dst, uint8_t* WEBP_RESTRICT bottom_dst, int len);
#ifdef FANCY_UPSAMPLING #ifdef FANCY_UPSAMPLING
@ -298,13 +298,15 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
#endif // FANCY_UPSAMPLING #endif // FANCY_UPSAMPLING
// Per-row point-sampling methods. // Per-row point-sampling methods.
typedef void (*WebPSamplerRowFunc)(const uint8_t* y, typedef void (*WebPSamplerRowFunc)(const uint8_t* WEBP_RESTRICT y,
const uint8_t* u, const uint8_t* v, const uint8_t* WEBP_RESTRICT u,
uint8_t* dst, int len); const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len);
// Generic function to apply 'WebPSamplerRowFunc' to the whole plane: // Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
void WebPSamplerProcessPlane(const uint8_t* y, int y_stride, void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
const uint8_t* u, const uint8_t* v, int uv_stride, const uint8_t* WEBP_RESTRICT u,
uint8_t* dst, int dst_stride, const uint8_t* WEBP_RESTRICT v, int uv_stride,
uint8_t* WEBP_RESTRICT dst, int dst_stride,
int width, int height, WebPSamplerRowFunc func); int width, int height, WebPSamplerRowFunc func);
// Sampling functions to convert rows of YUV to RGB(A) // Sampling functions to convert rows of YUV to RGB(A)
@ -316,9 +318,10 @@ extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last); WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
// YUV444->RGB converters // YUV444->RGB converters
typedef void (*WebPYUV444Converter)(const uint8_t* y, typedef void (*WebPYUV444Converter)(const uint8_t* WEBP_RESTRICT y,
const uint8_t* u, const uint8_t* v, const uint8_t* WEBP_RESTRICT u,
uint8_t* dst, int len); const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len);
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];

View File

@ -35,10 +35,14 @@ WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
#define LOAD_UV(u, v) ((u) | ((v) << 16)) #define LOAD_UV(u, v) ((u) | ((v) << 16))
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* top_u, const uint8_t* top_v, \ const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* cur_u, const uint8_t* cur_v, \ const uint8_t* WEBP_RESTRICT top_u, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int x; \ int x; \
const int last_pixel_pair = (len - 1) >> 1; \ const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
@ -136,10 +140,14 @@ static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
#if !defined(FANCY_UPSAMPLING) #if !defined(FANCY_UPSAMPLING)
#define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC) \ #define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \ static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* top_u, const uint8_t* top_v, \ const uint8_t* WEBP_RESTRICT bot_y, \
const uint8_t* bot_u, const uint8_t* bot_v, \ const uint8_t* WEBP_RESTRICT top_u, \
uint8_t* top_dst, uint8_t* bot_dst, int len) { \ const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT bot_u, \
const uint8_t* WEBP_RESTRICT bot_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bot_dst, int len) { \
const int half_len = len >> 1; \ const int half_len = len >> 1; \
int x; \ int x; \
assert(top_dst != NULL); \ assert(top_dst != NULL); \
@ -178,10 +186,14 @@ WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
// YUV444 converter // YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \ #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ extern void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
uint8_t* dst, int len); \ const uint8_t* WEBP_RESTRICT u, \
void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ const uint8_t* WEBP_RESTRICT v, \
uint8_t* dst, int len) { \ uint8_t* WEBP_RESTRICT dst, int len); \
void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \ int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \
} }

View File

@ -143,10 +143,14 @@ static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
#define LOAD_UV(u, v) ((u) | ((v) << 16)) #define LOAD_UV(u, v) ((u) | ((v) << 16))
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* top_u, const uint8_t* top_v, \ const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* cur_u, const uint8_t* cur_v, \ const uint8_t* WEBP_RESTRICT top_u, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int x; \ int x; \
const int last_pixel_pair = (len - 1) >> 1; \ const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
@ -241,8 +245,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
// YUV444 converter // YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \ #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
uint8_t* dst, int len) { \ const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \ int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
} }

View File

@ -320,8 +320,10 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
} }
#if !defined(WEBP_REDUCE_CSP) #if !defined(WEBP_REDUCE_CSP)
static void YuvToRgbLine(const uint8_t* y, const uint8_t* u, static void YuvToRgbLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* v, uint8_t* dst, int length) { const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B; v16u8 R, G, B;
while (length >= 16) { while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B); CALC_RGB16(y, u, v, R, G, B);
@ -347,8 +349,10 @@ static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
} }
} }
static void YuvToBgrLine(const uint8_t* y, const uint8_t* u, static void YuvToBgrLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* v, uint8_t* dst, int length) { const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B; v16u8 R, G, B;
while (length >= 16) { while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B); CALC_RGB16(y, u, v, R, G, B);
@ -375,8 +379,10 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
} }
#endif // WEBP_REDUCE_CSP #endif // WEBP_REDUCE_CSP
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u, static void YuvToRgbaLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* v, uint8_t* dst, int length) { const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B; v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) { while (length >= 16) {
@ -403,8 +409,10 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
} }
} }
static void YuvToBgraLine(const uint8_t* y, const uint8_t* u, static void YuvToBgraLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* v, uint8_t* dst, int length) { const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B; v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) { while (length >= 16) {
@ -432,8 +440,10 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
} }
#if !defined(WEBP_REDUCE_CSP) #if !defined(WEBP_REDUCE_CSP)
static void YuvToArgbLine(const uint8_t* y, const uint8_t* u, static void YuvToArgbLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* v, uint8_t* dst, int length) { const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B; v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) { while (length >= 16) {
@ -460,8 +470,10 @@ static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
} }
} }
static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u, static void YuvToRgba4444Line(const uint8_t* WEBP_RESTRICT y,
const uint8_t* v, uint8_t* dst, int length) { const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B, RG, BA, tmp0, tmp1; v16u8 R, G, B, RG, BA, tmp0, tmp1;
while (length >= 16) { while (length >= 16) {
#if (WEBP_SWAP_16BIT_CSP == 1) #if (WEBP_SWAP_16BIT_CSP == 1)
@ -496,8 +508,10 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
} }
} }
static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, static void YuvToRgb565Line(const uint8_t* WEBP_RESTRICT y,
const uint8_t* v, uint8_t* dst, int length) { const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B, RG, GB, tmp0, tmp1; v16u8 R, G, B, RG, GB, tmp0, tmp1;
while (length >= 16) { while (length >= 16) {
#if (WEBP_SWAP_16BIT_CSP == 1) #if (WEBP_SWAP_16BIT_CSP == 1)
@ -564,11 +578,14 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
} while (0) } while (0)
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \ static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* top_u, const uint8_t* top_v, \ const uint8_t* WEBP_RESTRICT bot_y, \
const uint8_t* cur_u, const uint8_t* cur_v, \ const uint8_t* WEBP_RESTRICT top_u, \
uint8_t* top_dst, uint8_t* bot_dst, int len) \ const uint8_t* WEBP_RESTRICT top_v, \
{ \ const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bot_dst, int len) { \
int size = (len - 1) >> 1; \ int size = (len - 1) >> 1; \
uint8_t temp_u[64]; \ uint8_t temp_u[64]; \
uint8_t temp_v[64]; \ uint8_t temp_v[64]; \

View File

@ -58,8 +58,9 @@
} while (0) } while (0)
// Turn the macro into a function for reducing code-size when non-critical // Turn the macro into a function for reducing code-size when non-critical
static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2, static void Upsample16Pixels_NEON(const uint8_t* WEBP_RESTRICT const r1,
uint8_t* out) { const uint8_t* WEBP_RESTRICT const r2,
uint8_t* WEBP_RESTRICT const out) {
UPSAMPLE_16PIXELS(r1, r2, out); UPSAMPLE_16PIXELS(r1, r2, out);
} }
@ -190,10 +191,14 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
} }
#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \ #define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* top_u, const uint8_t* top_v, \ const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* cur_u, const uint8_t* cur_v, \ const uint8_t* WEBP_RESTRICT top_u, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int block; \ int block; \
/* 16 byte aligned array to cache reconstructed u and v */ \ /* 16 byte aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[2 * 32 + 15]; \ uint8_t uv_buf[2 * 32 + 15]; \

View File

@ -88,8 +88,9 @@
} while (0) } while (0)
// Turn the macro into a function for reducing code-size when non-critical // Turn the macro into a function for reducing code-size when non-critical
static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[], static void Upsample32Pixels_SSE2(const uint8_t* WEBP_RESTRICT const r1,
uint8_t* const out) { const uint8_t* WEBP_RESTRICT const r2,
uint8_t* WEBP_RESTRICT const out) {
UPSAMPLE_32PIXELS(r1, r2, out); UPSAMPLE_32PIXELS(r1, r2, out);
} }
@ -114,10 +115,14 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
} while (0) } while (0)
#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* top_u, const uint8_t* top_v, \ const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* cur_u, const uint8_t* cur_v, \ const uint8_t* WEBP_RESTRICT top_u, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int uv_pos, pos; \ int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \ /* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
@ -215,10 +220,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE2(void); extern void WebPInitYUV444ConvertersSSE2(void);
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \ #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \
uint8_t* dst, int len); \ const uint8_t* WEBP_RESTRICT u, \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ const uint8_t* WEBP_RESTRICT v, \
uint8_t* dst, int len) { \ uint8_t* WEBP_RESTRICT dst, int len); \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \ int i; \
const int max_len = len & ~31; \ const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) { \ for (i = 0; i < max_len; i += 32) { \

View File

@ -90,8 +90,9 @@
} while (0) } while (0)
// Turn the macro into a function for reducing code-size when non-critical // Turn the macro into a function for reducing code-size when non-critical
static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[], static void Upsample32Pixels_SSE41(const uint8_t* WEBP_RESTRICT const r1,
uint8_t* const out) { const uint8_t* WEBP_RESTRICT const r2,
uint8_t* WEBP_RESTRICT const out) {
UPSAMPLE_32PIXELS(r1, r2, out); UPSAMPLE_32PIXELS(r1, r2, out);
} }
@ -116,10 +117,14 @@ static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
} while (0) } while (0)
#define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ #define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* top_u, const uint8_t* top_v, \ const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* cur_u, const uint8_t* cur_v, \ const uint8_t* WEBP_RESTRICT top_u, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int uv_pos, pos; \ int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \ /* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
@ -202,10 +207,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE41(void); extern void WebPInitYUV444ConvertersSSE41(void);
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \ #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \
uint8_t* dst, int len); \ const uint8_t* WEBP_RESTRICT u, \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ const uint8_t* WEBP_RESTRICT v, \
uint8_t* dst, int len) { \ uint8_t* WEBP_RESTRICT dst, int len); \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \ int i; \
const int max_len = len & ~31; \ const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) { \ for (i = 0; i < max_len; i += 32) { \