dsp/upsampling*: use WEBP_RESTRICT qualifier

This results in better vectorization in the C code, fewer instructions in
NEON, and some code reordering / better register usage in SSE2/SSE4 with
NDK r27 / gcc-13 / clang-16.

This only affects non-vector pointers; any vector pointers are left as a
follow-up.

Change-Id: Ib29980f778ad3dbb952178ad8dee39b8673c4ff8
This commit is contained in:
James Zern 2024-08-16 19:02:19 -07:00
parent 35915b389e
commit 23bbafbeb8
7 changed files with 137 additions and 76 deletions

View File

@ -285,10 +285,10 @@ void VP8DspInit(void);
// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
// bottom_y can be NULL if only one line of output is needed (at top/bottom).
typedef void (*WebPUpsampleLinePairFunc)(
const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* top_u, const uint8_t* top_v,
const uint8_t* cur_u, const uint8_t* cur_v,
uint8_t* top_dst, uint8_t* bottom_dst, int len);
const uint8_t* WEBP_RESTRICT top_y, const uint8_t* WEBP_RESTRICT bottom_y,
const uint8_t* WEBP_RESTRICT top_u, const uint8_t* WEBP_RESTRICT top_v,
const uint8_t* WEBP_RESTRICT cur_u, const uint8_t* WEBP_RESTRICT cur_v,
uint8_t* WEBP_RESTRICT top_dst, uint8_t* WEBP_RESTRICT bottom_dst, int len);
#ifdef FANCY_UPSAMPLING
@ -298,13 +298,15 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
#endif // FANCY_UPSAMPLING
// Per-row point-sampling methods.
typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len);
typedef void (*WebPSamplerRowFunc)(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len);
// Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
const uint8_t* u, const uint8_t* v, int uv_stride,
uint8_t* dst, int dst_stride,
void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v, int uv_stride,
uint8_t* WEBP_RESTRICT dst, int dst_stride,
int width, int height, WebPSamplerRowFunc func);
// Sampling functions to convert rows of YUV to RGB(A)
@ -316,9 +318,10 @@ extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
// YUV444->RGB converters
typedef void (*WebPYUV444Converter)(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len);
typedef void (*WebPYUV444Converter)(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len);
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];

View File

@ -35,10 +35,14 @@ WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
#define LOAD_UV(u, v) ((u) | ((v) << 16))
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int x; \
const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
@ -136,10 +140,14 @@ static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
#if !defined(FANCY_UPSAMPLING)
#define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* bot_u, const uint8_t* bot_v, \
uint8_t* top_dst, uint8_t* bot_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bot_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT bot_u, \
const uint8_t* WEBP_RESTRICT bot_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bot_dst, int len) { \
const int half_len = len >> 1; \
int x; \
assert(top_dst != NULL); \
@ -178,10 +186,14 @@ WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
// YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
extern void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len); \
void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \
}

View File

@ -143,10 +143,14 @@ static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
#define LOAD_UV(u, v) ((u) | ((v) << 16))
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int x; \
const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
@ -241,8 +245,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
// YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
}

View File

@ -320,8 +320,10 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToRgbLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B;
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
@ -347,8 +349,10 @@ static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
}
}
static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToBgrLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B;
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
@ -375,8 +379,10 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
}
#endif // WEBP_REDUCE_CSP
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToRgbaLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
@ -403,8 +409,10 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
}
}
static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToBgraLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
@ -432,8 +440,10 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToArgbLine(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
@ -460,8 +470,10 @@ static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
}
}
static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToRgba4444Line(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B, RG, BA, tmp0, tmp1;
while (length >= 16) {
#if (WEBP_SWAP_16BIT_CSP == 1)
@ -496,8 +508,10 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
}
}
static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
static void YuvToRgb565Line(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int length) {
v16u8 R, G, B, RG, GB, tmp0, tmp1;
while (length >= 16) {
#if (WEBP_SWAP_16BIT_CSP == 1)
@ -564,11 +578,14 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
} while (0)
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bot_dst, int len) \
{ \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bot_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bot_dst, int len) { \
int size = (len - 1) >> 1; \
uint8_t temp_u[64]; \
uint8_t temp_v[64]; \

View File

@ -58,8 +58,9 @@
} while (0)
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2,
uint8_t* out) {
static void Upsample16Pixels_NEON(const uint8_t* WEBP_RESTRICT const r1,
const uint8_t* WEBP_RESTRICT const r2,
uint8_t* WEBP_RESTRICT const out) {
UPSAMPLE_16PIXELS(r1, r2, out);
}
@ -190,10 +191,14 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
}
#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int block; \
/* 16 byte aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[2 * 32 + 15]; \

View File

@ -88,8 +88,9 @@
} while (0)
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
static void Upsample32Pixels_SSE2(const uint8_t* WEBP_RESTRICT const r1,
const uint8_t* WEBP_RESTRICT const r2,
uint8_t* WEBP_RESTRICT const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
@ -114,10 +115,14 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
} while (0)
#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
@ -215,10 +220,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE2(void);
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len); \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \
const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) { \

View File

@ -90,8 +90,9 @@
} while (0)
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
static void Upsample32Pixels_SSE41(const uint8_t* WEBP_RESTRICT const r1,
const uint8_t* WEBP_RESTRICT const r2,
uint8_t* WEBP_RESTRICT const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
@ -116,10 +117,14 @@ static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
} while (0)
#define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y, \
const uint8_t* WEBP_RESTRICT bottom_y, \
const uint8_t* WEBP_RESTRICT top_u, \
const uint8_t* WEBP_RESTRICT top_v, \
const uint8_t* WEBP_RESTRICT cur_u, \
const uint8_t* WEBP_RESTRICT cur_v, \
uint8_t* WEBP_RESTRICT top_dst, \
uint8_t* WEBP_RESTRICT bottom_dst, int len) { \
int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
@ -202,10 +207,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE41(void);
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
extern void CALL_C(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len); \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \
const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) { \