dsp/yuv*: use WEBP_RESTRICT qualifier

Better vectorization in the C code, fewer instructions / comparisons in
NEON, and fewer reloads in SSE2/SSE4 w/ndk r27/gcc-13/clang-16.

This only affects non-vector pointers; any vector pointers are left as a
follow up.

Change-Id: I07a7e36a2dce8632c71c0fbbeef94dc51453eaf7
This commit is contained in:
James Zern 2024-08-16 19:26:36 -07:00
parent 23bbafbeb8
commit 2dd5eb9862
8 changed files with 218 additions and 138 deletions

View File

@ -337,26 +337,35 @@ void WebPInitYUV444Converters(void);
// ARGB -> YUV converters
// Convert ARGB samples to luma Y.
extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
extern void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width);
// Convert ARGB samples to U/V with downsampling. do_store should be '1' for
// even lines and '0' for odd ones. 'src_width' is the original width, not
// the U/V one.
extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
extern void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v,
int src_width, int do_store);
// Convert a row of accumulated (four-values) of rgba32 toward U/V
extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width);
// Convert RGB or BGR to Y
extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width);
extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width);
// used for plain-C fallback.
extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v,
int src_width, int do_store);
extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
extern void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width);
// Must be called before using the above.
void WebPInitConvertARGBToYUV(void);

View File

@ -20,9 +20,10 @@
// Plain-C version
#define ROW_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
const uint8_t* const end = dst + (len & ~1) * (XSTEP); \
while (dst != end) { \
FUNC(y[0], u[0], v[0], dst); \
@ -49,9 +50,10 @@ ROW_FUNC(YuvToRgb565Row, VP8YuvToRgb565, 2)
#undef ROW_FUNC
// Main call for processing a plane with a WebPSamplerRowFunc function:
void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
const uint8_t* u, const uint8_t* v, int uv_stride,
uint8_t* dst, int dst_stride,
void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v, int uv_stride,
uint8_t* WEBP_RESTRICT dst, int dst_stride,
int width, int height, WebPSamplerRowFunc func) {
int j;
for (j = 0; j < height; ++j) {
@ -117,7 +119,8 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
//-----------------------------------------------------------------------------
// ARGB -> YUV converters
static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_C(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i < width; ++i) {
const uint32_t p = argb[i];
@ -126,7 +129,8 @@ static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
}
}
void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int src_width, int do_store) {
// No rounding. Last pixel is dealt with separately.
const int uv_width = src_width >> 1;
@ -169,22 +173,25 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
//-----------------------------------------------------------------------------
static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_C(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i < width; ++i, rgb += 3) {
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_C(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i < width; ++i, bgr += 3) {
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int width) {
int i;
for (i = 0; i < width; i += 1, rgb += 4) {
const int r = rgb[0], g = rgb[1], b = rgb[2];
@ -195,13 +202,18 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
//-----------------------------------------------------------------------------
void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width);
void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width);
void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width);
void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width);
void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
int src_width, int do_store);
extern void WebPInitConvertARGBToYUVSSE2(void);

View File

@ -149,20 +149,34 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
#if defined(WEBP_USE_SSE2)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst);
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
#endif // WEBP_USE_SSE2
@ -172,10 +186,14 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
#if defined(WEBP_USE_SSE41)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst);
#endif // WEBP_USE_SSE41

View File

@ -22,9 +22,10 @@
// simple point-sampling
#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i, r, g, b; \
int temp0, temp1, temp2, temp3, temp4; \
for (i = 0; i < (len >> 1); i++) { \

View File

@ -69,9 +69,10 @@
: "memory", "hi", "lo" \
#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
const uint8_t* WEBP_RESTRICT u, \
const uint8_t* WEBP_RESTRICT v, \
uint8_t* WEBP_RESTRICT dst, int len) { \
int i; \
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
const int t_con_1 = 26149; \

View File

@ -46,7 +46,8 @@ static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
return vqmovn_u16(Y2);
}
static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_NEON(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
const uint8x8x3_t RGB = vld3_u8(rgb);
@ -58,7 +59,8 @@ static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
}
}
static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_NEON(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
const uint8x8x3_t BGR = vld3_u8(bgr);
@ -70,7 +72,8 @@ static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
}
}
static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_NEON(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8) {
const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
@ -114,8 +117,9 @@ static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \
} while (0)
static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
static void ConvertRGBA32ToUV_NEON(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
@ -131,7 +135,9 @@ static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
}
}
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
static void ConvertARGBToUV_NEON(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v,
int src_width, int do_store) {
int i;
for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {

View File

@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
}
// Convert 32 samples of YUV444 to R/G/B
static void YUV444ToRGB_SSE2(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
static void YUV444ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
const uint8_t* WEBP_RESTRICT const u,
const uint8_t* WEBP_RESTRICT const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE2(const uint8_t* const y,
}
// Convert 32 samples of YUV420 to R/G/B
static void YUV420ToRGB_SSE2(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
static void YUV420ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
const uint8_t* WEBP_RESTRICT const u,
const uint8_t* WEBP_RESTRICT const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
@ -108,7 +108,7 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
const __m128i* const A,
uint8_t* const dst) {
uint8_t* WEBP_RESTRICT const dst) {
const __m128i rb = _mm_packus_epi16(*R, *B);
const __m128i ga = _mm_packus_epi16(*G, *A);
const __m128i rg = _mm_unpacklo_epi8(rb, ga);
@ -120,11 +120,9 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
}
// Pack R/G/B/A results into 16b output.
static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
const __m128i* const A,
uint8_t* const dst) {
static WEBP_INLINE void PackAndStore4444_SSE2(
const __m128i* const R, const __m128i* const G, const __m128i* const B,
const __m128i* const A, uint8_t* WEBP_RESTRICT const dst) {
#if (WEBP_SWAP_16BIT_CSP == 0)
const __m128i rg0 = _mm_packus_epi16(*R, *G);
const __m128i ba0 = _mm_packus_epi16(*B, *A);
@ -145,7 +143,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
uint8_t* const dst) {
uint8_t* WEBP_RESTRICT const dst) {
const __m128i r0 = _mm_packus_epi16(*R, *R);
const __m128i g0 = _mm_packus_epi16(*G, *G);
const __m128i b0 = _mm_packus_epi16(*B, *B);
@ -170,7 +168,7 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
__m128i* const in2, __m128i* const in3,
__m128i* const in4, __m128i* const in5,
uint8_t* const rgb) {
uint8_t* WEBP_RESTRICT const rgb) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
@ -193,8 +191,10 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
}
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
@ -204,8 +204,10 @@ void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
@ -215,8 +217,10 @@ void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
@ -226,8 +230,10 @@ void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst) {
void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
@ -237,8 +243,10 @@ void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
}
}
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B;
@ -247,8 +255,10 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
@ -269,8 +279,10 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
@ -294,9 +306,10 @@ void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
static void YuvToRgbaRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToRgbaRow_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@ -316,9 +329,10 @@ static void YuvToRgbaRow_SSE2(const uint8_t* y,
}
}
static void YuvToBgraRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToBgraRow_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@ -338,9 +352,10 @@ static void YuvToBgraRow_SSE2(const uint8_t* y,
}
}
static void YuvToArgbRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToArgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@ -360,9 +375,10 @@ static void YuvToArgbRow_SSE2(const uint8_t* y,
}
}
static void YuvToRgbRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToRgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -397,9 +413,10 @@ static void YuvToRgbRow_SSE2(const uint8_t* y,
}
}
static void YuvToBgrRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToBgrRow_SSE2(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -471,7 +488,7 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
__m128i tmp[6];
tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
@ -488,8 +505,8 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
}
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
__m128i* const rgb /*in[6]*/) {
static WEBP_INLINE void RGB32PackedToPlanar_SSE2(
const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4);
@ -562,7 +579,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
#undef MK_CST_16
#undef TRANSFORM
static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
@ -596,7 +614,8 @@ static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
}
}
static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
@ -630,7 +649,8 @@ static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
}
}
static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
@ -658,8 +678,9 @@ static void HorizontalAddPack_SSE2(const __m128i* const A,
*out = _mm_packs_epi32(C, D);
}
static void ConvertARGBToUV_SSE2(const uint32_t* argb,
uint8_t* u, uint8_t* v,
static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v,
int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
@ -695,7 +716,7 @@ static void ConvertARGBToUV_SSE2(const uint32_t* argb,
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
const uint16_t* const rgbx,
const uint16_t* WEBP_RESTRICT const rgbx,
__m128i* const r, __m128i* const g, __m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
@ -715,8 +736,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
*b = _mm_unpacklo_epi64(B1, B3);
}
static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
static void ConvertRGBA32ToUV_SSE2(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {

View File

@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
}
// Convert 32 samples of YUV444 to R/G/B
static void YUV444ToRGB_SSE41(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
static void YUV444ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
const uint8_t* WEBP_RESTRICT const u,
const uint8_t* WEBP_RESTRICT const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE41(const uint8_t* const y,
}
// Convert 32 samples of YUV420 to R/G/B
static void YUV420ToRGB_SSE41(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
static void YUV420ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
const uint8_t* WEBP_RESTRICT const u,
const uint8_t* WEBP_RESTRICT const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
@ -109,7 +109,7 @@ static void YUV420ToRGB_SSE41(const uint8_t* const y,
static WEBP_INLINE void PlanarTo24b_SSE41(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5,
uint8_t* const rgb) {
uint8_t* WEBP_RESTRICT const rgb) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
@ -132,8 +132,10 @@ static WEBP_INLINE void PlanarTo24b_SSE41(
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
}
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
@ -154,8 +156,10 @@ void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
@ -179,9 +183,10 @@ void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
static void YuvToRgbRow_SSE41(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToRgbRow_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -216,9 +221,10 @@ static void YuvToRgbRow_SSE41(const uint8_t* y,
}
}
static void YuvToBgrRow_SSE41(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToBgrRow_SSE41(const uint8_t* WEBP_RESTRICT y,
const uint8_t* WEBP_RESTRICT u,
const uint8_t* WEBP_RESTRICT v,
uint8_t* WEBP_RESTRICT dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@ -290,7 +296,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
@ -334,7 +340,7 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4);
@ -407,7 +413,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
#undef MK_CST_16
#undef TRANSFORM
static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
@ -441,7 +448,8 @@ static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
}
}
static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
@ -475,7 +483,8 @@ static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
}
}
static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
@ -503,8 +512,9 @@ static void HorizontalAddPack_SSE41(const __m128i* const A,
*out = _mm_packs_epi32(C, D);
}
static void ConvertARGBToUV_SSE41(const uint32_t* argb,
uint8_t* u, uint8_t* v,
static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v,
int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
@ -540,7 +550,7 @@ static void ConvertARGBToUV_SSE41(const uint32_t* argb,
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
const uint16_t* const rgbx,
const uint16_t* WEBP_RESTRICT const rgbx,
__m128i* const r, __m128i* const g, __m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
@ -570,8 +580,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
*b = _mm_unpackhi_epi64(B1, B3);
}
static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
static void ConvertRGBA32ToUV_SSE41(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {