mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-25 05:08:21 +01:00
dsp/yuv*: use WEBP_RESTRICT qualifier
Better vectorization in the C code, fewer instructions / comparisons in NEON, and fewer reloads in SSE2/SSE4 w/ndk r27/gcc-13/clang-16. This only affects non-vector pointers; any vector pointers are left as a follow up. Change-Id: I07a7e36a2dce8632c71c0fbbeef94dc51453eaf7
This commit is contained in:
parent
23bbafbeb8
commit
2dd5eb9862
@ -337,26 +337,35 @@ void WebPInitYUV444Converters(void);
|
||||
// ARGB -> YUV converters
|
||||
|
||||
// Convert ARGB samples to luma Y.
|
||||
extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
|
||||
extern void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT y, int width);
|
||||
// Convert ARGB samples to U/V with downsampling. do_store should be '1' for
|
||||
// even lines and '0' for odd ones. 'src_width' is the original width, not
|
||||
// the U/V one.
|
||||
extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
||||
extern void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v,
|
||||
int src_width, int do_store);
|
||||
|
||||
// Convert a row of accumulated (four-values) of rgba32 toward U/V
|
||||
extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width);
|
||||
extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v, int width);
|
||||
|
||||
// Convert RGB or BGR to Y
|
||||
extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
|
||||
extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
|
||||
extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT y, int width);
|
||||
extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
|
||||
uint8_t* WEBP_RESTRICT y, int width);
|
||||
|
||||
// used for plain-C fallback.
|
||||
extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
||||
extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v,
|
||||
int src_width, int do_store);
|
||||
extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width);
|
||||
extern void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v, int width);
|
||||
|
||||
// Must be called before using the above.
|
||||
void WebPInitConvertARGBToYUV(void);
|
||||
|
@ -20,9 +20,10 @@
|
||||
// Plain-C version
|
||||
|
||||
#define ROW_FUNC(FUNC_NAME, FUNC, XSTEP) \
|
||||
static void FUNC_NAME(const uint8_t* y, \
|
||||
const uint8_t* u, const uint8_t* v, \
|
||||
uint8_t* dst, int len) { \
|
||||
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
|
||||
const uint8_t* WEBP_RESTRICT u, \
|
||||
const uint8_t* WEBP_RESTRICT v, \
|
||||
uint8_t* WEBP_RESTRICT dst, int len) { \
|
||||
const uint8_t* const end = dst + (len & ~1) * (XSTEP); \
|
||||
while (dst != end) { \
|
||||
FUNC(y[0], u[0], v[0], dst); \
|
||||
@ -49,9 +50,10 @@ ROW_FUNC(YuvToRgb565Row, VP8YuvToRgb565, 2)
|
||||
#undef ROW_FUNC
|
||||
|
||||
// Main call for processing a plane with a WebPSamplerRowFunc function:
|
||||
void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
|
||||
const uint8_t* u, const uint8_t* v, int uv_stride,
|
||||
uint8_t* dst, int dst_stride,
|
||||
void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v, int uv_stride,
|
||||
uint8_t* WEBP_RESTRICT dst, int dst_stride,
|
||||
int width, int height, WebPSamplerRowFunc func) {
|
||||
int j;
|
||||
for (j = 0; j < height; ++j) {
|
||||
@ -117,7 +119,8 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
|
||||
//-----------------------------------------------------------------------------
|
||||
// ARGB -> YUV converters
|
||||
|
||||
static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
|
||||
static void ConvertARGBToY_C(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
const uint32_t p = argb[i];
|
||||
@ -126,7 +129,8 @@ static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
||||
void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int src_width, int do_store) {
|
||||
// No rounding. Last pixel is dealt with separately.
|
||||
const int uv_width = src_width >> 1;
|
||||
@ -169,22 +173,25 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
static void ConvertRGB24ToY_C(const uint8_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i, rgb += 3) {
|
||||
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
static void ConvertBGR24ToY_C(const uint8_t* WEBP_RESTRICT bgr,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i, bgr += 3) {
|
||||
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
|
||||
}
|
||||
}
|
||||
|
||||
void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width) {
|
||||
void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; i += 1, rgb += 4) {
|
||||
const int r = rgb[0], g = rgb[1], b = rgb[2];
|
||||
@ -195,13 +202,18 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
|
||||
void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
|
||||
void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width);
|
||||
void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT y, int width);
|
||||
void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
|
||||
uint8_t* WEBP_RESTRICT y, int width);
|
||||
void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v, int width);
|
||||
|
||||
void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
|
||||
void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
||||
void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT y, int width);
|
||||
void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
|
||||
int src_width, int do_store);
|
||||
|
||||
extern void WebPInitConvertARGBToYUVSSE2(void);
|
||||
|
@ -149,20 +149,34 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
|
||||
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
|
||||
const uint8_t* v, uint8_t* dst);
|
||||
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst);
|
||||
void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst);
|
||||
void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst);
|
||||
void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst);
|
||||
void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst);
|
||||
void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst);
|
||||
void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst);
|
||||
|
||||
#endif // WEBP_USE_SSE2
|
||||
|
||||
@ -172,10 +186,14 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
|
||||
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
|
||||
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst);
|
||||
void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst);
|
||||
|
||||
#endif // WEBP_USE_SSE41
|
||||
|
||||
|
@ -22,9 +22,10 @@
|
||||
// simple point-sampling
|
||||
|
||||
#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
|
||||
static void FUNC_NAME(const uint8_t* y, \
|
||||
const uint8_t* u, const uint8_t* v, \
|
||||
uint8_t* dst, int len) { \
|
||||
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
|
||||
const uint8_t* WEBP_RESTRICT u, \
|
||||
const uint8_t* WEBP_RESTRICT v, \
|
||||
uint8_t* WEBP_RESTRICT dst, int len) { \
|
||||
int i, r, g, b; \
|
||||
int temp0, temp1, temp2, temp3, temp4; \
|
||||
for (i = 0; i < (len >> 1); i++) { \
|
||||
|
@ -69,9 +69,10 @@
|
||||
: "memory", "hi", "lo" \
|
||||
|
||||
#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
|
||||
static void FUNC_NAME(const uint8_t* y, \
|
||||
const uint8_t* u, const uint8_t* v, \
|
||||
uint8_t* dst, int len) { \
|
||||
static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y, \
|
||||
const uint8_t* WEBP_RESTRICT u, \
|
||||
const uint8_t* WEBP_RESTRICT v, \
|
||||
uint8_t* WEBP_RESTRICT dst, int len) { \
|
||||
int i; \
|
||||
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
|
||||
const int t_con_1 = 26149; \
|
||||
|
@ -46,7 +46,8 @@ static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
|
||||
return vqmovn_u16(Y2);
|
||||
}
|
||||
|
||||
static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
static void ConvertRGB24ToY_NEON(const uint8_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
int i;
|
||||
for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
|
||||
const uint8x8x3_t RGB = vld3_u8(rgb);
|
||||
@ -58,7 +59,8 @@ static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
static void ConvertBGR24ToY_NEON(const uint8_t* WEBP_RESTRICT bgr,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
int i;
|
||||
for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
|
||||
const uint8x8x3_t BGR = vld3_u8(bgr);
|
||||
@ -70,7 +72,8 @@ static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
|
||||
static void ConvertARGBToY_NEON(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
int i;
|
||||
for (i = 0; i + 8 <= width; i += 8) {
|
||||
const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
|
||||
@ -114,8 +117,9 @@ static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
|
||||
MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \
|
||||
} while (0)
|
||||
|
||||
static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width) {
|
||||
static void ConvertRGBA32ToUV_NEON(const uint16_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v, int width) {
|
||||
int i;
|
||||
for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
|
||||
const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
|
||||
@ -131,7 +135,9 @@ static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
||||
static void ConvertARGBToUV_NEON(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v,
|
||||
int src_width, int do_store) {
|
||||
int i;
|
||||
for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
|
||||
|
@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
|
||||
}
|
||||
|
||||
// Convert 32 samples of YUV444 to R/G/B
|
||||
static void YUV444ToRGB_SSE2(const uint8_t* const y,
|
||||
const uint8_t* const u,
|
||||
const uint8_t* const v,
|
||||
static void YUV444ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
|
||||
const uint8_t* WEBP_RESTRICT const u,
|
||||
const uint8_t* WEBP_RESTRICT const v,
|
||||
__m128i* const R, __m128i* const G,
|
||||
__m128i* const B) {
|
||||
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
|
||||
@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE2(const uint8_t* const y,
|
||||
}
|
||||
|
||||
// Convert 32 samples of YUV420 to R/G/B
|
||||
static void YUV420ToRGB_SSE2(const uint8_t* const y,
|
||||
const uint8_t* const u,
|
||||
const uint8_t* const v,
|
||||
static void YUV420ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
|
||||
const uint8_t* WEBP_RESTRICT const u,
|
||||
const uint8_t* WEBP_RESTRICT const v,
|
||||
__m128i* const R, __m128i* const G,
|
||||
__m128i* const B) {
|
||||
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
|
||||
@ -108,7 +108,7 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
const __m128i* const A,
|
||||
uint8_t* const dst) {
|
||||
uint8_t* WEBP_RESTRICT const dst) {
|
||||
const __m128i rb = _mm_packus_epi16(*R, *B);
|
||||
const __m128i ga = _mm_packus_epi16(*G, *A);
|
||||
const __m128i rg = _mm_unpacklo_epi8(rb, ga);
|
||||
@ -120,11 +120,9 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
|
||||
}
|
||||
|
||||
// Pack R/G/B/A results into 16b output.
|
||||
static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
const __m128i* const A,
|
||||
uint8_t* const dst) {
|
||||
static WEBP_INLINE void PackAndStore4444_SSE2(
|
||||
const __m128i* const R, const __m128i* const G, const __m128i* const B,
|
||||
const __m128i* const A, uint8_t* WEBP_RESTRICT const dst) {
|
||||
#if (WEBP_SWAP_16BIT_CSP == 0)
|
||||
const __m128i rg0 = _mm_packus_epi16(*R, *G);
|
||||
const __m128i ba0 = _mm_packus_epi16(*B, *A);
|
||||
@ -145,7 +143,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
|
||||
static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
uint8_t* const dst) {
|
||||
uint8_t* WEBP_RESTRICT const dst) {
|
||||
const __m128i r0 = _mm_packus_epi16(*R, *R);
|
||||
const __m128i g0 = _mm_packus_epi16(*G, *G);
|
||||
const __m128i b0 = _mm_packus_epi16(*B, *B);
|
||||
@ -170,7 +168,7 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
|
||||
static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
|
||||
__m128i* const in2, __m128i* const in3,
|
||||
__m128i* const in4, __m128i* const in5,
|
||||
uint8_t* const rgb) {
|
||||
uint8_t* WEBP_RESTRICT const rgb) {
|
||||
// The input is 6 registers of sixteen 8b but for the sake of explanation,
|
||||
// let's take 6 registers of four 8b values.
|
||||
// To pack, we will keep taking one every two 8b integer and move it
|
||||
@ -193,8 +191,10 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
|
||||
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
|
||||
}
|
||||
|
||||
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 32) {
|
||||
@ -204,8 +204,10 @@ void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 32) {
|
||||
@ -215,8 +217,10 @@ void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 32) {
|
||||
@ -226,8 +230,10 @@ void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
|
||||
const uint8_t* v, uint8_t* dst) {
|
||||
void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 16) {
|
||||
@ -237,8 +243,10 @@ void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 16) {
|
||||
__m128i R, G, B;
|
||||
@ -247,8 +255,10 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
|
||||
|
||||
@ -269,8 +279,10 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
|
||||
}
|
||||
|
||||
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
|
||||
|
||||
@ -294,9 +306,10 @@ void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
//-----------------------------------------------------------------------------
|
||||
// Arbitrary-length row conversion functions
|
||||
|
||||
static void YuvToRgbaRow_SSE2(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToRgbaRow_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst, int len) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
|
||||
@ -316,9 +329,10 @@ static void YuvToRgbaRow_SSE2(const uint8_t* y,
|
||||
}
|
||||
}
|
||||
|
||||
static void YuvToBgraRow_SSE2(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToBgraRow_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst, int len) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
|
||||
@ -338,9 +352,10 @@ static void YuvToBgraRow_SSE2(const uint8_t* y,
|
||||
}
|
||||
}
|
||||
|
||||
static void YuvToArgbRow_SSE2(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToArgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst, int len) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
|
||||
@ -360,9 +375,10 @@ static void YuvToArgbRow_SSE2(const uint8_t* y,
|
||||
}
|
||||
}
|
||||
|
||||
static void YuvToRgbRow_SSE2(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToRgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst, int len) {
|
||||
int n;
|
||||
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
@ -397,9 +413,10 @@ static void YuvToRgbRow_SSE2(const uint8_t* y,
|
||||
}
|
||||
}
|
||||
|
||||
static void YuvToBgrRow_SSE2(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToBgrRow_SSE2(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst, int len) {
|
||||
int n;
|
||||
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
@ -471,7 +488,7 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
|
||||
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
|
||||
// Similar to PlanarTo24bHelper(), but in reverse order.
|
||||
static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
|
||||
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
|
||||
const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
|
||||
__m128i tmp[6];
|
||||
tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
|
||||
tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
|
||||
@ -488,8 +505,8 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
|
||||
}
|
||||
|
||||
// Convert 8 packed ARGB to r[], g[], b[]
|
||||
static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
|
||||
__m128i* const rgb /*in[6]*/) {
|
||||
static WEBP_INLINE void RGB32PackedToPlanar_SSE2(
|
||||
const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128i a0 = LOAD_16(argb + 0);
|
||||
__m128i a1 = LOAD_16(argb + 4);
|
||||
@ -562,7 +579,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
|
||||
#undef MK_CST_16
|
||||
#undef TRANSFORM
|
||||
|
||||
static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
const int max_width = width & ~31;
|
||||
int i;
|
||||
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
|
||||
@ -596,7 +614,8 @@ static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
const int max_width = width & ~31;
|
||||
int i;
|
||||
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
|
||||
@ -630,7 +649,8 @@ static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
|
||||
static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
const int max_width = width & ~15;
|
||||
int i;
|
||||
for (i = 0; i < max_width; i += 16) {
|
||||
@ -658,8 +678,9 @@ static void HorizontalAddPack_SSE2(const __m128i* const A,
|
||||
*out = _mm_packs_epi32(C, D);
|
||||
}
|
||||
|
||||
static void ConvertARGBToUV_SSE2(const uint32_t* argb,
|
||||
uint8_t* u, uint8_t* v,
|
||||
static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v,
|
||||
int src_width, int do_store) {
|
||||
const int max_width = src_width & ~31;
|
||||
int i;
|
||||
@ -695,7 +716,7 @@ static void ConvertARGBToUV_SSE2(const uint32_t* argb,
|
||||
|
||||
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
|
||||
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
|
||||
const uint16_t* const rgbx,
|
||||
const uint16_t* WEBP_RESTRICT const rgbx,
|
||||
__m128i* const r, __m128i* const g, __m128i* const b) {
|
||||
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
|
||||
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
|
||||
@ -715,8 +736,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
|
||||
*b = _mm_unpacklo_epi64(B1, B3);
|
||||
}
|
||||
|
||||
static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width) {
|
||||
static void ConvertRGBA32ToUV_SSE2(const uint16_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v, int width) {
|
||||
const int max_width = width & ~15;
|
||||
const uint16_t* const last_rgb = rgb + 4 * max_width;
|
||||
while (rgb < last_rgb) {
|
||||
|
@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
|
||||
}
|
||||
|
||||
// Convert 32 samples of YUV444 to R/G/B
|
||||
static void YUV444ToRGB_SSE41(const uint8_t* const y,
|
||||
const uint8_t* const u,
|
||||
const uint8_t* const v,
|
||||
static void YUV444ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
|
||||
const uint8_t* WEBP_RESTRICT const u,
|
||||
const uint8_t* WEBP_RESTRICT const v,
|
||||
__m128i* const R, __m128i* const G,
|
||||
__m128i* const B) {
|
||||
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
|
||||
@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE41(const uint8_t* const y,
|
||||
}
|
||||
|
||||
// Convert 32 samples of YUV420 to R/G/B
|
||||
static void YUV420ToRGB_SSE41(const uint8_t* const y,
|
||||
const uint8_t* const u,
|
||||
const uint8_t* const v,
|
||||
static void YUV420ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
|
||||
const uint8_t* WEBP_RESTRICT const u,
|
||||
const uint8_t* WEBP_RESTRICT const v,
|
||||
__m128i* const R, __m128i* const G,
|
||||
__m128i* const B) {
|
||||
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
|
||||
@ -109,7 +109,7 @@ static void YUV420ToRGB_SSE41(const uint8_t* const y,
|
||||
static WEBP_INLINE void PlanarTo24b_SSE41(
|
||||
__m128i* const in0, __m128i* const in1, __m128i* const in2,
|
||||
__m128i* const in3, __m128i* const in4, __m128i* const in5,
|
||||
uint8_t* const rgb) {
|
||||
uint8_t* WEBP_RESTRICT const rgb) {
|
||||
// The input is 6 registers of sixteen 8b but for the sake of explanation,
|
||||
// let's take 6 registers of four 8b values.
|
||||
// To pack, we will keep taking one every two 8b integer and move it
|
||||
@ -132,8 +132,10 @@ static WEBP_INLINE void PlanarTo24b_SSE41(
|
||||
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
|
||||
}
|
||||
|
||||
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
|
||||
|
||||
@ -154,8 +156,10 @@ void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
|
||||
}
|
||||
|
||||
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
|
||||
|
||||
@ -179,9 +183,10 @@ void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
//-----------------------------------------------------------------------------
|
||||
// Arbitrary-length row conversion functions
|
||||
|
||||
static void YuvToRgbRow_SSE41(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToRgbRow_SSE41(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst, int len) {
|
||||
int n;
|
||||
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
@ -216,9 +221,10 @@ static void YuvToRgbRow_SSE41(const uint8_t* y,
|
||||
}
|
||||
}
|
||||
|
||||
static void YuvToBgrRow_SSE41(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToBgrRow_SSE41(const uint8_t* WEBP_RESTRICT y,
|
||||
const uint8_t* WEBP_RESTRICT u,
|
||||
const uint8_t* WEBP_RESTRICT v,
|
||||
uint8_t* WEBP_RESTRICT dst, int len) {
|
||||
int n;
|
||||
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
@ -290,7 +296,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
|
||||
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
|
||||
// Similar to PlanarTo24bHelper(), but in reverse order.
|
||||
static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
|
||||
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
|
||||
const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
|
||||
const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0));
|
||||
const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
|
||||
const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
|
||||
@ -334,7 +340,7 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
|
||||
|
||||
// Convert 8 packed ARGB to r[], g[], b[]
|
||||
static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
|
||||
const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
|
||||
const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128i a0 = LOAD_16(argb + 0);
|
||||
__m128i a1 = LOAD_16(argb + 4);
|
||||
@ -407,7 +413,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
|
||||
#undef MK_CST_16
|
||||
#undef TRANSFORM
|
||||
|
||||
static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
const int max_width = width & ~31;
|
||||
int i;
|
||||
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
|
||||
@ -441,7 +448,8 @@ static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
const int max_width = width & ~31;
|
||||
int i;
|
||||
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
|
||||
@ -475,7 +483,8 @@ static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
|
||||
static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT y, int width) {
|
||||
const int max_width = width & ~15;
|
||||
int i;
|
||||
for (i = 0; i < max_width; i += 16) {
|
||||
@ -503,8 +512,9 @@ static void HorizontalAddPack_SSE41(const __m128i* const A,
|
||||
*out = _mm_packs_epi32(C, D);
|
||||
}
|
||||
|
||||
static void ConvertARGBToUV_SSE41(const uint32_t* argb,
|
||||
uint8_t* u, uint8_t* v,
|
||||
static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v,
|
||||
int src_width, int do_store) {
|
||||
const int max_width = src_width & ~31;
|
||||
int i;
|
||||
@ -540,7 +550,7 @@ static void ConvertARGBToUV_SSE41(const uint32_t* argb,
|
||||
|
||||
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
|
||||
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
|
||||
const uint16_t* const rgbx,
|
||||
const uint16_t* WEBP_RESTRICT const rgbx,
|
||||
__m128i* const r, __m128i* const g, __m128i* const b) {
|
||||
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
|
||||
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
|
||||
@ -570,8 +580,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
|
||||
*b = _mm_unpackhi_epi64(B1, B3);
|
||||
}
|
||||
|
||||
static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width) {
|
||||
static void ConvertRGBA32ToUV_SSE41(const uint16_t* WEBP_RESTRICT rgb,
|
||||
uint8_t* WEBP_RESTRICT u,
|
||||
uint8_t* WEBP_RESTRICT v, int width) {
|
||||
const int max_width = width & ~15;
|
||||
const uint16_t* const last_rgb = rgb + 4 * max_width;
|
||||
while (rgb < last_rgb) {
|
||||
|
Loading…
Reference in New Issue
Block a user