From 8c815d82d71f4296150c03d98ee90430d3cf1213 Mon Sep 17 00:00:00 2001 From: Vincent Rabaud Date: Thu, 24 Jul 2025 14:14:20 +0200 Subject: [PATCH] Add ARGB/ABGR support to WebPConvertRGB24ToY/WebPConvertBGR24ToY Rename them to WebPConvertRGBToY/WebPConvertBGRToY and accept the 'step' parameter (3 for RGB, 4 for ARGB). Change-Id: I930a23894e4135a34fff2174e6a5bbee1eac2ba0 --- src/dsp/dsp.h | 10 +-- src/dsp/yuv.c | 28 +++---- src/dsp/yuv_neon.c | 61 +++++++++----- src/dsp/yuv_sse2.c | 157 ++++++++++++++++++++---------------- src/dsp/yuv_sse41.c | 163 +++++++++++++++++++++----------------- src/enc/picture_csp_enc.c | 24 +++--- 6 files changed, 251 insertions(+), 192 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index 1b37ef4b..9acfd444 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -352,11 +352,11 @@ extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb, uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, int width); -// Convert RGB or BGR to Y -extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb, - uint8_t* WEBP_RESTRICT y, int width); -extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr, - uint8_t* WEBP_RESTRICT y, int width); +// Convert RGB or BGR to Y. Step is 3 or 4. If step is 4, data is RGBA or BGRA. +extern void (*WebPConvertRGBToY)(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width, int step); +extern void (*WebPConvertBGRToY)(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width, int step); // used for plain-C fallback. extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb, diff --git a/src/dsp/yuv.c b/src/dsp/yuv.c index 62f1ecc1..7b0b6a91 100644 --- a/src/dsp/yuv.c +++ b/src/dsp/yuv.c @@ -177,18 +177,18 @@ void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb, //----------------------------------------------------------------------------- -static void ConvertRGB24ToY_C(const uint8_t* WEBP_RESTRICT rgb, - uint8_t* WEBP_RESTRICT y, int width) { +static void ConvertRGBToY_C(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width, int step) { int i; - for (i = 0; i < width; ++i, rgb += 3) { + for (i = 0; i < width; ++i, rgb += step) { y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); } } -static void ConvertBGR24ToY_C(const uint8_t* WEBP_RESTRICT bgr, - uint8_t* WEBP_RESTRICT y, int width) { +static void ConvertBGRToY_C(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width, int step) { int i; - for (i = 0; i < width; ++i, bgr += 3) { + for (i = 0; i < width; ++i, bgr += step) { y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); } } @@ -206,10 +206,10 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb, //----------------------------------------------------------------------------- -void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb, - uint8_t* WEBP_RESTRICT y, int width); -void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr, - uint8_t* WEBP_RESTRICT y, int width); +void (*WebPConvertRGBToY)(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width, int step); +void (*WebPConvertBGRToY)(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width, int step); void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb, uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, int width); @@ -228,8 +228,8 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) { WebPConvertARGBToY = ConvertARGBToY_C; WebPConvertARGBToUV = WebPConvertARGBToUV_C; - WebPConvertRGB24ToY = ConvertRGB24ToY_C; - WebPConvertBGR24ToY = ConvertBGR24ToY_C; + WebPConvertRGBToY = ConvertRGBToY_C; + WebPConvertBGRToY = ConvertBGRToY_C; WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C; @@ -255,7 +255,7 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) { assert(WebPConvertARGBToY != NULL); assert(WebPConvertARGBToUV != NULL); - assert(WebPConvertRGB24ToY != NULL); - assert(WebPConvertBGR24ToY != NULL); + assert(WebPConvertRGBToY != NULL); + assert(WebPConvertBGRToY != NULL); assert(WebPConvertRGBA32ToUV != NULL); } diff --git a/src/dsp/yuv_neon.c b/src/dsp/yuv_neon.c index 44745cf7..3e0afa0e 100644 --- a/src/dsp/yuv_neon.c +++ b/src/dsp/yuv_neon.c @@ -23,9 +23,9 @@ //----------------------------------------------------------------------------- -static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, - const uint8x8_t G, - const uint8x8_t B) { +static uint8x8_t ConvertRGBToYImpl_NEON(const uint8x8_t R, + const uint8x8_t G, + const uint8x8_t B) { const uint16x8_t r = vmovl_u8(R); const uint16x8_t g = vmovl_u8(G); const uint16x8_t b = vmovl_u8(B); @@ -47,28 +47,48 @@ static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, return vqmovn_u16(Y2); } -static void ConvertRGB24ToY_NEON(const uint8_t* WEBP_RESTRICT rgb, - uint8_t* WEBP_RESTRICT y, int width) { +static void ConvertRGBToY_NEON(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width, int step) { int i; - for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) { - const uint8x8x3_t RGB = vld3_u8(rgb); - const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]); - vst1_u8(y + i, Y); + if (step == 3) { + for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) { + const uint8x8x3_t RGB = vld3_u8(rgb); + const uint8x8_t Y = + ConvertRGBToYImpl_NEON(RGB.val[0], RGB.val[1], RGB.val[2]); + vst1_u8(y + i, Y); + } + } else { + for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) { + const uint8x8x4_t RGB = vld4_u8(rgb); + const uint8x8_t Y = + ConvertRGBToYImpl_NEON(RGB.val[0], RGB.val[1], RGB.val[2]); + vst1_u8(y + i, Y); + } } - for (; i < width; ++i, rgb += 3) { // left-over + for (; i < width; ++i, rgb += step) { // left-over y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); } } -static void ConvertBGR24ToY_NEON(const uint8_t* WEBP_RESTRICT bgr, - uint8_t* WEBP_RESTRICT y, int width) { +static void ConvertBGRToY_NEON(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width, int step) { int i; - for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) { - const uint8x8x3_t BGR = vld3_u8(bgr); - const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]); - vst1_u8(y + i, Y); + if (step == 3) { + for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) { + const uint8x8x3_t BGR = vld3_u8(bgr); + const uint8x8_t Y = + ConvertRGBToYImpl_NEON(BGR.val[2], BGR.val[1], BGR.val[0]); + vst1_u8(y + i, Y); + } + } else { + for (i = 0; i + 8 <= width; i += 8, bgr += 4 * 8) { + const uint8x8x4_t BGR = vld4_u8(bgr); + const uint8x8_t Y = + ConvertRGBToYImpl_NEON(BGR.val[2], BGR.val[1], BGR.val[0]); + vst1_u8(y + i, Y); + } } - for (; i < width; ++i, bgr += 3) { // left-over + for (; i < width; ++i, bgr += step) { // left-over y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); } } @@ -78,7 +98,8 @@ static void ConvertARGBToY_NEON(const uint32_t* WEBP_RESTRICT argb, int i; for (i = 0; i + 8 <= width; i += 8) { const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]); - const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]); + const uint8x8_t Y = + ConvertRGBToYImpl_NEON(RGB.val[2], RGB.val[1], RGB.val[0]); vst1_u8(y + i, Y); } for (; i < width; ++i) { // left-over @@ -173,8 +194,8 @@ static void ConvertARGBToUV_NEON(const uint32_t* WEBP_RESTRICT argb, extern void WebPInitConvertARGBToYUVNEON(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) { - WebPConvertRGB24ToY = ConvertRGB24ToY_NEON; - WebPConvertBGR24ToY = ConvertBGR24ToY_NEON; + WebPConvertRGBToY = ConvertRGBToY_NEON; + WebPConvertBGRToY = ConvertBGRToY_NEON; WebPConvertARGBToY = ConvertARGBToY_NEON; WebPConvertARGBToUV = ConvertARGBToUV_NEON; WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON; diff --git a/src/dsp/yuv_sse2.c b/src/dsp/yuv_sse2.c index f1abf217..c024ea38 100644 --- a/src/dsp/yuv_sse2.c +++ b/src/dsp/yuv_sse2.c @@ -491,7 +491,7 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2( // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // Similar to PlanarTo24bHelper(), but in reverse order. -static WEBP_INLINE void RGB24PackedToPlanar_SSE2( +static WEBP_INLINE void RGBPackedToPlanar_SSE2( const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) { __m128i tmp[6]; tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0)); @@ -508,8 +508,31 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE2( RGB24PackedToPlanarHelper_SSE2(tmp, out); } -// Convert 8 packed ARGB to r[], g[], b[] -static WEBP_INLINE void RGB32PackedToPlanar_SSE2( +// Unpack the 8b input rgbargbargba... as contiguous registers: +// rrrr... rrrr... gggg... gggg... bbbb... bbbb.... +static WEBP_INLINE void RGBAPackedToRGBPlanar_SSE2( + const uint8_t* WEBP_RESTRICT const rgba, __m128i* const rgb /*in[6]*/) { + __m128i a0 = _mm_loadu_si128((const __m128i*)(rgba + 0)); + __m128i a1 = _mm_loadu_si128((const __m128i*)(rgba + 16)); + __m128i a2 = _mm_loadu_si128((const __m128i*)(rgba + 32)); + __m128i a3 = _mm_loadu_si128((const __m128i*)(rgba + 48)); + __m128i a4 = _mm_loadu_si128((const __m128i*)(rgba + 64)); + __m128i a5 = _mm_loadu_si128((const __m128i*)(rgba + 80)); + __m128i a6 = _mm_loadu_si128((const __m128i*)(rgba + 96)); + __m128i a7 = _mm_loadu_si128((const __m128i*)(rgba + 112)); + VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3); + rgb[0] = a3; + rgb[2] = a2; + rgb[4] = a1; + VP8L32bToPlanar_SSE2(&a4, &a5, &a6, &a7); + rgb[1] = a7; + rgb[3] = a6; + rgb[5] = a5; +} + +// Unpack the 8b input argbargbargb... as contiguous registers: +// 0r0r0r... 0r0r0r... 0g0g0g... 0g0g0g0... 0b0b0b... 0b0b0b.... +static WEBP_INLINE void RGB32PackedToPlanar16_SSE2( const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) { const __m128i zero = _mm_setzero_si128(); __m128i a0 = LOAD_16(argb + 0); @@ -544,10 +567,10 @@ static WEBP_INLINE void RGB32PackedToPlanar_SSE2( } while (0) #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A)) -static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - __m128i* const Y) { +static WEBP_INLINE void ConvertRGBToYImpl_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + __m128i* const Y) { const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384); const __m128i kGB_y = MK_CST_16(16384, 6420); const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF); @@ -583,72 +606,70 @@ static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R, #undef MK_CST_16 #undef TRANSFORM -static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb, - uint8_t* WEBP_RESTRICT y, int width) { +static WEBP_INLINE void ConvertRGBToYHelper_SSE2( + const __m128i* const rgb_plane /*in[6]*/, int swap_rb, int* i, + uint8_t* WEBP_RESTRICT y) { + int j; + + for (j = 0; j < 2; ++j, *i += 16) { + const __m128i zero = _mm_setzero_si128(); + __m128i r, g, b, Y0, Y1; + + // Convert to 16-bit Y. + r = _mm_unpacklo_epi8(rgb_plane[(swap_rb ? 4 : 0) + j], zero); + g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero); + b = _mm_unpacklo_epi8(rgb_plane[(swap_rb ? 0 : 4) + j], zero); + ConvertRGBToYImpl_SSE2(&r, &g, &b, &Y0); + + // Convert to 16-bit Y. + r = _mm_unpackhi_epi8(rgb_plane[(swap_rb ? 4 : 0) + j], zero); + g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero); + b = _mm_unpackhi_epi8(rgb_plane[(swap_rb ? 0 : 4) + j], zero); + ConvertRGBToYImpl_SSE2(&r, &g, &b, &Y1); + + // Cast to 8-bit and store. + STORE_16(_mm_packus_epi16(Y0, Y1), y + *i); + } +} + +static void ConvertRGBToY_SSE2(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width, int step) { const int max_width = width & ~31; int i; - for (i = 0; i < max_width; rgb += 3 * 16 * 2) { - __m128i rgb_plane[6]; - int j; - - RGB24PackedToPlanar_SSE2(rgb, rgb_plane); - - for (j = 0; j < 2; ++j, i += 16) { - const __m128i zero = _mm_setzero_si128(); - __m128i r, g, b, Y0, Y1; - - // Convert to 16-bit Y. - r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero); - g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero); - b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero); - ConvertRGBToY_SSE2(&r, &g, &b, &Y0); - - // Convert to 16-bit Y. - r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero); - g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero); - b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero); - ConvertRGBToY_SSE2(&r, &g, &b, &Y1); - - // Cast to 8-bit and store. - STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + __m128i rgb_plane[6]; + if (step == 3) { + for (i = 0; i < max_width; rgb += 3 * 16 * 2) { + RGBPackedToPlanar_SSE2(rgb, rgb_plane); + ConvertRGBToYHelper_SSE2(rgb_plane, /*swap_rb=*/0, &i, y); + } + } else { + for (i = 0; i < max_width; rgb += 4 * 16 * 2) { + RGBAPackedToRGBPlanar_SSE2(rgb, rgb_plane); + ConvertRGBToYHelper_SSE2(rgb_plane, /*swap_rb=*/0, &i, y); } } - for (; i < width; ++i, rgb += 3) { // left-over + for (; i < width; ++i, rgb += step) { // left-over y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); } } -static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr, - uint8_t* WEBP_RESTRICT y, int width) { +static void ConvertBGRToY_SSE2(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width, int step) { const int max_width = width & ~31; int i; - for (i = 0; i < max_width; bgr += 3 * 16 * 2) { - __m128i bgr_plane[6]; - int j; - - RGB24PackedToPlanar_SSE2(bgr, bgr_plane); - - for (j = 0; j < 2; ++j, i += 16) { - const __m128i zero = _mm_setzero_si128(); - __m128i r, g, b, Y0, Y1; - - // Convert to 16-bit Y. - b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero); - g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero); - r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero); - ConvertRGBToY_SSE2(&r, &g, &b, &Y0); - - // Convert to 16-bit Y. - b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero); - g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero); - r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero); - ConvertRGBToY_SSE2(&r, &g, &b, &Y1); - - // Cast to 8-bit and store. - STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + __m128i bgr_plane[6]; + if (step == 3) { + for (i = 0; i < max_width; bgr += 3 * 16 * 2) { + RGBPackedToPlanar_SSE2(bgr, bgr_plane); + ConvertRGBToYHelper_SSE2(bgr_plane, /*swap_rb=*/1, &i, y); + } + } else { + for (i = 0; i < max_width; bgr += 4 * 16 * 2) { + RGBAPackedToRGBPlanar_SSE2(bgr, bgr_plane); + ConvertRGBToYHelper_SSE2(bgr_plane, /*swap_rb=*/1, &i, y); } } - for (; i < width; ++i, bgr += 3) { // left-over + for (; i < width; ++i, bgr += step) { // left-over y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); } } @@ -659,9 +680,9 @@ static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb, int i; for (i = 0; i < max_width; i += 16) { __m128i Y0, Y1, rgb[6]; - RGB32PackedToPlanar_SSE2(&argb[i], rgb); - ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0); - ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1); + RGB32PackedToPlanar16_SSE2(&argb[i], rgb); + ConvertRGBToYImpl_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0); + ConvertRGBToYImpl_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1); STORE_16(_mm_packus_epi16(Y0, Y1), y + i); } for (; i < width; ++i) { // left-over @@ -690,13 +711,13 @@ static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb, int i; for (i = 0; i < max_width; i += 32, u += 16, v += 16) { __m128i rgb[6], U0, V0, U1, V1; - RGB32PackedToPlanar_SSE2(&argb[i], rgb); + RGB32PackedToPlanar16_SSE2(&argb[i], rgb); HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]); HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]); HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]); ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); - RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb); + RGB32PackedToPlanar16_SSE2(&argb[i + 16], rgb); HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]); HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]); HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]); @@ -770,8 +791,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) { WebPConvertARGBToY = ConvertARGBToY_SSE2; WebPConvertARGBToUV = ConvertARGBToUV_SSE2; - WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2; - WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2; + WebPConvertRGBToY = ConvertRGBToY_SSE2; + WebPConvertBGRToY = ConvertBGRToY_SSE2; WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2; } diff --git a/src/dsp/yuv_sse41.c b/src/dsp/yuv_sse41.c index e1b80846..dfcb2cc0 100644 --- a/src/dsp/yuv_sse41.c +++ b/src/dsp/yuv_sse41.c @@ -300,7 +300,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) { // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // Similar to PlanarTo24bHelper(), but in reverse order. -static WEBP_INLINE void RGB24PackedToPlanar_SSE41( +static WEBP_INLINE void RGBPackedToPlanar_SSE41( const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) { const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0)); const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16)); @@ -343,14 +343,35 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE41( #undef WEBP_SSE41_SHUFF -// Convert 8 packed ARGB to r[], g[], b[] -static WEBP_INLINE void RGB32PackedToPlanar_SSE41( +static WEBP_INLINE void RGBAPackedToRGBPlanar_SSE41( + const uint8_t* WEBP_RESTRICT const rgba, __m128i* const rgb /*in[6]*/) { + __m128i a0 = _mm_loadu_si128((const __m128i*)(rgba + 0)); + __m128i a1 = _mm_loadu_si128((const __m128i*)(rgba + 16)); + __m128i a2 = _mm_loadu_si128((const __m128i*)(rgba + 32)); + __m128i a3 = _mm_loadu_si128((const __m128i*)(rgba + 48)); + __m128i a4 = _mm_loadu_si128((const __m128i*)(rgba + 64)); + __m128i a5 = _mm_loadu_si128((const __m128i*)(rgba + 80)); + __m128i a6 = _mm_loadu_si128((const __m128i*)(rgba + 96)); + __m128i a7 = _mm_loadu_si128((const __m128i*)(rgba + 112)); + VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3); + rgb[0] = a3; + rgb[2] = a2; + rgb[4] = a1; + VP8L32bToPlanar_SSE41(&a4, &a5, &a6, &a7); + rgb[1] = a7; + rgb[3] = a6; + rgb[5] = a5; +} + +// Unpack the 8b input argbargbargb... as contiguous registers: +// 0r0r0r... 0r0r0r... 0g0g0g... 0g0g0g0... 0b0b0b... 0b0b0b.... +static WEBP_INLINE void ARGBPackedToRGBPlanar16_SSE41( const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) { const __m128i zero = _mm_setzero_si128(); - __m128i a0 = LOAD_16(argb + 0); - __m128i a1 = LOAD_16(argb + 4); - __m128i a2 = LOAD_16(argb + 8); - __m128i a3 = LOAD_16(argb + 12); + __m128i a0 = _mm_loadu_si128((const __m128i*)(argb + 0)); + __m128i a1 = _mm_loadu_si128((const __m128i*)(argb + 4)); + __m128i a2 = _mm_loadu_si128((const __m128i*)(argb + 8)); + __m128i a3 = _mm_loadu_si128((const __m128i*)(argb + 12)); VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3); rgb[0] = _mm_unpacklo_epi8(a1, zero); rgb[1] = _mm_unpackhi_epi8(a1, zero); @@ -379,10 +400,10 @@ static WEBP_INLINE void RGB32PackedToPlanar_SSE41( } while (0) #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A)) -static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - __m128i* const Y) { +static WEBP_INLINE void ConvertRGBToYImpl_SSE41(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + __m128i* const Y) { const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384); const __m128i kGB_y = MK_CST_16(16384, 6420); const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF); @@ -418,72 +439,70 @@ static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R, #undef MK_CST_16 #undef TRANSFORM -static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb, - uint8_t* WEBP_RESTRICT y, int width) { +static WEBP_INLINE void ConvertRGBToYHelper_SSE41( + const __m128i* const rgb_plane /*in[6]*/, int swap_rb, int* i, + uint8_t* WEBP_RESTRICT y) { + int j; + + for (j = 0; j < 2; ++j, *i += 16) { + const __m128i zero = _mm_setzero_si128(); + __m128i r, g, b, Y0, Y1; + + // Convert to 16-bit Y. + r = _mm_unpacklo_epi8(rgb_plane[(swap_rb ? 4 : 0) + j], zero); + g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero); + b = _mm_unpacklo_epi8(rgb_plane[(swap_rb ? 0 : 4) + j], zero); + ConvertRGBToYImpl_SSE41(&r, &g, &b, &Y0); + + // Convert to 16-bit Y. + r = _mm_unpackhi_epi8(rgb_plane[(swap_rb ? 4 : 0) + j], zero); + g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero); + b = _mm_unpackhi_epi8(rgb_plane[(swap_rb ? 0 : 4) + j], zero); + ConvertRGBToYImpl_SSE41(&r, &g, &b, &Y1); + + // Cast to 8-bit and store. + STORE_16(_mm_packus_epi16(Y0, Y1), y + *i); + } +} + +static void ConvertRGBToY_SSE41(const uint8_t* WEBP_RESTRICT rgb, + uint8_t* WEBP_RESTRICT y, int width, int step) { const int max_width = width & ~31; int i; - for (i = 0; i < max_width; rgb += 3 * 16 * 2) { - __m128i rgb_plane[6]; - int j; - - RGB24PackedToPlanar_SSE41(rgb, rgb_plane); - - for (j = 0; j < 2; ++j, i += 16) { - const __m128i zero = _mm_setzero_si128(); - __m128i r, g, b, Y0, Y1; - - // Convert to 16-bit Y. - r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero); - g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero); - b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero); - ConvertRGBToY_SSE41(&r, &g, &b, &Y0); - - // Convert to 16-bit Y. - r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero); - g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero); - b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero); - ConvertRGBToY_SSE41(&r, &g, &b, &Y1); - - // Cast to 8-bit and store. - STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + __m128i rgb_plane[6]; + if (step == 3) { + for (i = 0; i < max_width; rgb += 3 * 16 * 2) { + RGBPackedToPlanar_SSE41(rgb, rgb_plane); + ConvertRGBToYHelper_SSE41(rgb_plane, /*swap_rb=*/0, &i, y); + } + } else { + for (i = 0; i < max_width; rgb += 4 * 16 * 2) { + RGBAPackedToRGBPlanar_SSE41(rgb, rgb_plane); + ConvertRGBToYHelper_SSE41(rgb_plane, /*swap_rb=*/0, &i, y); } } - for (; i < width; ++i, rgb += 3) { // left-over + for (; i < width; ++i, rgb += step) { // left-over y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); } } -static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr, - uint8_t* WEBP_RESTRICT y, int width) { +static void ConvertBGRToY_SSE41(const uint8_t* WEBP_RESTRICT bgr, + uint8_t* WEBP_RESTRICT y, int width, int step) { const int max_width = width & ~31; int i; - for (i = 0; i < max_width; bgr += 3 * 16 * 2) { - __m128i bgr_plane[6]; - int j; - - RGB24PackedToPlanar_SSE41(bgr, bgr_plane); - - for (j = 0; j < 2; ++j, i += 16) { - const __m128i zero = _mm_setzero_si128(); - __m128i r, g, b, Y0, Y1; - - // Convert to 16-bit Y. - b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero); - g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero); - r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero); - ConvertRGBToY_SSE41(&r, &g, &b, &Y0); - - // Convert to 16-bit Y. - b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero); - g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero); - r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero); - ConvertRGBToY_SSE41(&r, &g, &b, &Y1); - - // Cast to 8-bit and store. - STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + __m128i bgr_plane[6]; + if (step == 3) { + for (i = 0; i < max_width; bgr += 3 * 16 * 2) { + RGBPackedToPlanar_SSE41(bgr, bgr_plane); + ConvertRGBToYHelper_SSE41(bgr_plane, /*swap_rb=*/1, &i, y); + } + } else { + for (i = 0; i < max_width; bgr += 4 * 16 * 2) { + RGBAPackedToRGBPlanar_SSE41(bgr, bgr_plane); + ConvertRGBToYHelper_SSE41(bgr_plane, /*swap_rb=*/1, &i, y); } } - for (; i < width; ++i, bgr += 3) { // left-over + for (; i < width; ++i, bgr += step) { // left-over y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); } } @@ -494,9 +513,9 @@ static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb, int i; for (i = 0; i < max_width; i += 16) { __m128i Y0, Y1, rgb[6]; - RGB32PackedToPlanar_SSE41(&argb[i], rgb); - ConvertRGBToY_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0); - ConvertRGBToY_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1); + ARGBPackedToRGBPlanar16_SSE41(&argb[i], rgb); + ConvertRGBToYImpl_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0); + ConvertRGBToYImpl_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1); STORE_16(_mm_packus_epi16(Y0, Y1), y + i); } for (; i < width; ++i) { // left-over @@ -525,13 +544,13 @@ static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb, int i; for (i = 0; i < max_width; i += 32, u += 16, v += 16) { __m128i rgb[6], U0, V0, U1, V1; - RGB32PackedToPlanar_SSE41(&argb[i], rgb); + ARGBPackedToRGBPlanar16_SSE41(&argb[i], rgb); HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]); HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]); HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]); ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); - RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb); + ARGBPackedToRGBPlanar16_SSE41(&argb[i + 16], rgb); HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]); HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]); HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]); @@ -615,8 +634,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) { WebPConvertARGBToY = ConvertARGBToY_SSE41; WebPConvertARGBToUV = ConvertARGBToUV_SSE41; - WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41; - WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41; + WebPConvertRGBToY = ConvertRGBToY_SSE41; + WebPConvertBGRToY = ConvertBGRToY_SSE41; WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41; } diff --git a/src/enc/picture_csp_enc.c b/src/enc/picture_csp_enc.c index bf3c2304..3ebf93d3 100644 --- a/src/enc/picture_csp_enc.c +++ b/src/enc/picture_csp_enc.c @@ -519,7 +519,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, } } else { const int uv_width = (width + 1) >> 1; - int use_dsp = (step == 3); // use special function in this case // temporary storage for accumulated R/G/B values during conversion to U/V uint16_t* const tmp_rgb = (uint16_t*)WebPSafeMalloc(4 * uv_width, sizeof(*tmp_rgb)); @@ -533,7 +532,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, if (dithering > 0.) { VP8InitRandom(&base_rg, dithering); rg = &base_rg; - use_dsp = 0; // can't use dsp in this case } WebPInitConvertARGBToYUV(); InitGammaTables(); @@ -545,15 +543,15 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, // Downsample Y/U/V planes, two rows at a time for (y = 0; y < (height >> 1); ++y) { int rows_have_alpha = has_alpha; - if (use_dsp) { + if (rg == NULL) { if (is_rgb) { - WebPConvertRGB24ToY(r_ptr, dst_y, width); - WebPConvertRGB24ToY(r_ptr + rgb_stride, - dst_y + picture->y_stride, width); + WebPConvertRGBToY(r_ptr, dst_y, width, step); + WebPConvertRGBToY(r_ptr + rgb_stride, dst_y + picture->y_stride, + width, step); } else { - WebPConvertBGR24ToY(b_ptr, dst_y, width); - WebPConvertBGR24ToY(b_ptr + rgb_stride, - dst_y + picture->y_stride, width); + WebPConvertBGRToY(b_ptr, dst_y, width, step); + WebPConvertBGRToY(b_ptr + rgb_stride, dst_y + picture->y_stride, + width, step); } } else { ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg); @@ -589,11 +587,11 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, } if (height & 1) { // extra last row int row_has_alpha = has_alpha; - if (use_dsp) { - if (r_ptr < b_ptr) { - WebPConvertRGB24ToY(r_ptr, dst_y, width); + if (rg == NULL) { + if (is_rgb) { + WebPConvertRGBToY(r_ptr, dst_y, width, step); } else { - WebPConvertBGR24ToY(b_ptr, dst_y, width); + WebPConvertBGRToY(b_ptr, dst_y, width, step); } } else { ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);