Add RGBA/BGRA support to WebPConvertRGB24ToY/WebPConvertBGR24ToY

Rename them to WebPConvertRGBToY/WebPConvertBGRToY and add a 'step'
parameter giving the number of bytes per pixel (3 for RGB/BGR, 4 for
RGBA/BGRA; the fourth byte is skipped).

Change-Id: I930a23894e4135a34fff2174e6a5bbee1eac2ba0
This commit is contained in:
Vincent Rabaud
2025-07-24 14:14:20 +02:00
parent a81af56db9
commit 8c815d82d7
6 changed files with 251 additions and 192 deletions

View File

@@ -352,11 +352,11 @@ extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width); uint8_t* WEBP_RESTRICT v, int width);
// Convert RGB or BGR to Y // Convert RGB or BGR to Y. Step is 3 or 4. If step is 4, data is RGBA or BGRA.
extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb, extern void (*WebPConvertRGBToY)(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width); uint8_t* WEBP_RESTRICT y, int width, int step);
extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr, extern void (*WebPConvertBGRToY)(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width); uint8_t* WEBP_RESTRICT y, int width, int step);
// used for plain-C fallback. // used for plain-C fallback.
extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb, extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,

View File

@@ -177,18 +177,18 @@ void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
static void ConvertRGB24ToY_C(const uint8_t* WEBP_RESTRICT rgb, static void ConvertRGBToY_C(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width) { uint8_t* WEBP_RESTRICT y, int width, int step) {
int i; int i;
for (i = 0; i < width; ++i, rgb += 3) { for (i = 0; i < width; ++i, rgb += step) {
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
} }
} }
static void ConvertBGR24ToY_C(const uint8_t* WEBP_RESTRICT bgr, static void ConvertBGRToY_C(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) { uint8_t* WEBP_RESTRICT y, int width, int step) {
int i; int i;
for (i = 0; i < width; ++i, bgr += 3) { for (i = 0; i < width; ++i, bgr += step) {
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
} }
} }
@@ -206,10 +206,10 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb, void (*WebPConvertRGBToY)(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width); uint8_t* WEBP_RESTRICT y, int width, int step);
void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr, void (*WebPConvertBGRToY)(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width); uint8_t* WEBP_RESTRICT y, int width, int step);
void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb, void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT u,
uint8_t* WEBP_RESTRICT v, int width); uint8_t* WEBP_RESTRICT v, int width);
@@ -228,8 +228,8 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
WebPConvertARGBToY = ConvertARGBToY_C; WebPConvertARGBToY = ConvertARGBToY_C;
WebPConvertARGBToUV = WebPConvertARGBToUV_C; WebPConvertARGBToUV = WebPConvertARGBToUV_C;
WebPConvertRGB24ToY = ConvertRGB24ToY_C; WebPConvertRGBToY = ConvertRGBToY_C;
WebPConvertBGR24ToY = ConvertBGR24ToY_C; WebPConvertBGRToY = ConvertBGRToY_C;
WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C; WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
@@ -255,7 +255,7 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
assert(WebPConvertARGBToY != NULL); assert(WebPConvertARGBToY != NULL);
assert(WebPConvertARGBToUV != NULL); assert(WebPConvertARGBToUV != NULL);
assert(WebPConvertRGB24ToY != NULL); assert(WebPConvertRGBToY != NULL);
assert(WebPConvertBGR24ToY != NULL); assert(WebPConvertBGRToY != NULL);
assert(WebPConvertRGBA32ToUV != NULL); assert(WebPConvertRGBA32ToUV != NULL);
} }

View File

@@ -23,9 +23,9 @@
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, static uint8x8_t ConvertRGBToYImpl_NEON(const uint8x8_t R,
const uint8x8_t G, const uint8x8_t G,
const uint8x8_t B) { const uint8x8_t B) {
const uint16x8_t r = vmovl_u8(R); const uint16x8_t r = vmovl_u8(R);
const uint16x8_t g = vmovl_u8(G); const uint16x8_t g = vmovl_u8(G);
const uint16x8_t b = vmovl_u8(B); const uint16x8_t b = vmovl_u8(B);
@@ -47,28 +47,48 @@ static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
return vqmovn_u16(Y2); return vqmovn_u16(Y2);
} }
static void ConvertRGB24ToY_NEON(const uint8_t* WEBP_RESTRICT rgb, static void ConvertRGBToY_NEON(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width) { uint8_t* WEBP_RESTRICT y, int width, int step) {
int i; int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) { if (step == 3) {
const uint8x8x3_t RGB = vld3_u8(rgb); for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]); const uint8x8x3_t RGB = vld3_u8(rgb);
vst1_u8(y + i, Y); const uint8x8_t Y =
ConvertRGBToYImpl_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
vst1_u8(y + i, Y);
}
} else {
for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
const uint8x8x4_t RGB = vld4_u8(rgb);
const uint8x8_t Y =
ConvertRGBToYImpl_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
vst1_u8(y + i, Y);
}
} }
for (; i < width; ++i, rgb += 3) { // left-over for (; i < width; ++i, rgb += step) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
} }
} }
static void ConvertBGR24ToY_NEON(const uint8_t* WEBP_RESTRICT bgr, static void ConvertBGRToY_NEON(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) { uint8_t* WEBP_RESTRICT y, int width, int step) {
int i; int i;
for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) { if (step == 3) {
const uint8x8x3_t BGR = vld3_u8(bgr); for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]); const uint8x8x3_t BGR = vld3_u8(bgr);
vst1_u8(y + i, Y); const uint8x8_t Y =
ConvertRGBToYImpl_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
vst1_u8(y + i, Y);
}
} else {
for (i = 0; i + 8 <= width; i += 8, bgr += 4 * 8) {
const uint8x8x4_t BGR = vld4_u8(bgr);
const uint8x8_t Y =
ConvertRGBToYImpl_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
vst1_u8(y + i, Y);
}
} }
for (; i < width; ++i, bgr += 3) { // left-over for (; i < width; ++i, bgr += step) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
} }
} }
@@ -78,7 +98,8 @@ static void ConvertARGBToY_NEON(const uint32_t* WEBP_RESTRICT argb,
int i; int i;
for (i = 0; i + 8 <= width; i += 8) { for (i = 0; i + 8 <= width; i += 8) {
const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]); const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]); const uint8x8_t Y =
ConvertRGBToYImpl_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
vst1_u8(y + i, Y); vst1_u8(y + i, Y);
} }
for (; i < width; ++i) { // left-over for (; i < width; ++i) { // left-over
@@ -173,8 +194,8 @@ static void ConvertARGBToUV_NEON(const uint32_t* WEBP_RESTRICT argb,
extern void WebPInitConvertARGBToYUVNEON(void); extern void WebPInitConvertARGBToYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) { WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
WebPConvertRGB24ToY = ConvertRGB24ToY_NEON; WebPConvertRGBToY = ConvertRGBToY_NEON;
WebPConvertBGR24ToY = ConvertBGR24ToY_NEON; WebPConvertBGRToY = ConvertBGRToY_NEON;
WebPConvertARGBToY = ConvertARGBToY_NEON; WebPConvertARGBToY = ConvertARGBToY_NEON;
WebPConvertARGBToUV = ConvertARGBToUV_NEON; WebPConvertARGBToUV = ConvertARGBToUV_NEON;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON; WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;

View File

@@ -491,7 +491,7 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
// rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order. // Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar_SSE2( static WEBP_INLINE void RGBPackedToPlanar_SSE2(
const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) { const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
__m128i tmp[6]; __m128i tmp[6];
tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0)); tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
@@ -508,8 +508,31 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
RGB24PackedToPlanarHelper_SSE2(tmp, out); RGB24PackedToPlanarHelper_SSE2(tmp, out);
} }
// Convert 8 packed ARGB to r[], g[], b[] // Unpack the 8b input rgbargbargba... as contiguous registers:
static WEBP_INLINE void RGB32PackedToPlanar_SSE2( // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Reads 128 bytes from 'rgba' (32 pixels of 4 bytes each) and writes six
// 16-byte planes to rgb[0..5]. 'rgb' is an output array: it is only written,
// never read, by this function.
static WEBP_INLINE void RGBAPackedToRGBPlanar_SSE2(
const uint8_t* WEBP_RESTRICT const rgba, __m128i* const rgb /*out[6]*/) {
// Eight unaligned 16-byte loads, i.e. four 4-byte pixels per register.
__m128i a0 = _mm_loadu_si128((const __m128i*)(rgba + 0));
__m128i a1 = _mm_loadu_si128((const __m128i*)(rgba + 16));
__m128i a2 = _mm_loadu_si128((const __m128i*)(rgba + 32));
__m128i a3 = _mm_loadu_si128((const __m128i*)(rgba + 48));
__m128i a4 = _mm_loadu_si128((const __m128i*)(rgba + 64));
__m128i a5 = _mm_loadu_si128((const __m128i*)(rgba + 80));
__m128i a6 = _mm_loadu_si128((const __m128i*)(rgba + 96));
__m128i a7 = _mm_loadu_si128((const __m128i*)(rgba + 112));
// NOTE(review): VP8L32bToPlanar_SSE2() is defined elsewhere. It presumably
// splits its four registers in place into one plane per byte position, with
// the last argument receiving byte 0 of every pixel -- confirm against its
// definition before changing the index mapping below.
VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
// First 16 pixels go to the even slots; a0 (presumably the 4th byte of each
// pixel, i.e. alpha) is intentionally not stored.
rgb[0] = a3;
rgb[2] = a2;
rgb[4] = a1;
VP8L32bToPlanar_SSE2(&a4, &a5, &a6, &a7);
// Next 16 pixels go to the odd slots; a4 is likewise dropped.
rgb[1] = a7;
rgb[3] = a6;
rgb[5] = a5;
}
// Unpack the 8b input argbargbargb... as contiguous registers:
// 0r0r0r... 0r0r0r... 0g0g0g... 0g0g0g... 0b0b0b... 0b0b0b....
static WEBP_INLINE void RGB32PackedToPlanar16_SSE2(
const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) { const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0); __m128i a0 = LOAD_16(argb + 0);
@@ -544,10 +567,10 @@ static WEBP_INLINE void RGB32PackedToPlanar_SSE2(
} while (0) } while (0)
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A)) #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R, static WEBP_INLINE void ConvertRGBToYImpl_SSE2(const __m128i* const R,
const __m128i* const G, const __m128i* const G,
const __m128i* const B, const __m128i* const B,
__m128i* const Y) { __m128i* const Y) {
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384); const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
const __m128i kGB_y = MK_CST_16(16384, 6420); const __m128i kGB_y = MK_CST_16(16384, 6420);
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF); const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
@@ -583,72 +606,70 @@ static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
#undef MK_CST_16 #undef MK_CST_16
#undef TRANSFORM #undef TRANSFORM
static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb, static WEBP_INLINE void ConvertRGBToYHelper_SSE2(
uint8_t* WEBP_RESTRICT y, int width) { const __m128i* const rgb_plane /*in[6]*/, int swap_rb, int* i,
uint8_t* WEBP_RESTRICT y) {
int j;
for (j = 0; j < 2; ++j, *i += 16) {
const __m128i zero = _mm_setzero_si128();
__m128i r, g, b, Y0, Y1;
// Convert to 16-bit Y.
r = _mm_unpacklo_epi8(rgb_plane[(swap_rb ? 4 : 0) + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[(swap_rb ? 0 : 4) + j], zero);
ConvertRGBToYImpl_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[(swap_rb ? 4 : 0) + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[(swap_rb ? 0 : 4) + j], zero);
ConvertRGBToYImpl_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + *i);
}
}
static void ConvertRGBToY_SSE2(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width, int step) {
const int max_width = width & ~31; const int max_width = width & ~31;
int i; int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) { __m128i rgb_plane[6];
__m128i rgb_plane[6]; if (step == 3) {
int j; for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
RGBPackedToPlanar_SSE2(rgb, rgb_plane);
RGB24PackedToPlanar_SSE2(rgb, rgb_plane); ConvertRGBToYHelper_SSE2(rgb_plane, /*swap_rb=*/0, &i, y);
}
for (j = 0; j < 2; ++j, i += 16) { } else {
const __m128i zero = _mm_setzero_si128(); for (i = 0; i < max_width; rgb += 4 * 16 * 2) {
__m128i r, g, b, Y0, Y1; RGBAPackedToRGBPlanar_SSE2(rgb, rgb_plane);
ConvertRGBToYHelper_SSE2(rgb_plane, /*swap_rb=*/0, &i, y);
// Convert to 16-bit Y.
r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
} }
} }
for (; i < width; ++i, rgb += 3) { // left-over for (; i < width; ++i, rgb += step) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
} }
} }
static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr, static void ConvertBGRToY_SSE2(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) { uint8_t* WEBP_RESTRICT y, int width, int step) {
const int max_width = width & ~31; const int max_width = width & ~31;
int i; int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) { __m128i bgr_plane[6];
__m128i bgr_plane[6]; if (step == 3) {
int j; for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
RGBPackedToPlanar_SSE2(bgr, bgr_plane);
RGB24PackedToPlanar_SSE2(bgr, bgr_plane); ConvertRGBToYHelper_SSE2(bgr_plane, /*swap_rb=*/1, &i, y);
}
for (j = 0; j < 2; ++j, i += 16) { } else {
const __m128i zero = _mm_setzero_si128(); for (i = 0; i < max_width; bgr += 4 * 16 * 2) {
__m128i r, g, b, Y0, Y1; RGBAPackedToRGBPlanar_SSE2(bgr, bgr_plane);
ConvertRGBToYHelper_SSE2(bgr_plane, /*swap_rb=*/1, &i, y);
// Convert to 16-bit Y.
b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
} }
} }
for (; i < width; ++i, bgr += 3) { // left-over for (; i < width; ++i, bgr += step) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
} }
} }
@@ -659,9 +680,9 @@ static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb,
int i; int i;
for (i = 0; i < max_width; i += 16) { for (i = 0; i < max_width; i += 16) {
__m128i Y0, Y1, rgb[6]; __m128i Y0, Y1, rgb[6];
RGB32PackedToPlanar_SSE2(&argb[i], rgb); RGB32PackedToPlanar16_SSE2(&argb[i], rgb);
ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0); ConvertRGBToYImpl_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1); ConvertRGBToYImpl_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i); STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
} }
for (; i < width; ++i) { // left-over for (; i < width; ++i) { // left-over
@@ -690,13 +711,13 @@ static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb,
int i; int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) { for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
__m128i rgb[6], U0, V0, U1, V1; __m128i rgb[6], U0, V0, U1, V1;
RGB32PackedToPlanar_SSE2(&argb[i], rgb); RGB32PackedToPlanar16_SSE2(&argb[i], rgb);
HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]); HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]); HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]); HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb); RGB32PackedToPlanar16_SSE2(&argb[i + 16], rgb);
HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]); HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]); HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]); HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
@@ -770,8 +791,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
WebPConvertARGBToY = ConvertARGBToY_SSE2; WebPConvertARGBToY = ConvertARGBToY_SSE2;
WebPConvertARGBToUV = ConvertARGBToUV_SSE2; WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2; WebPConvertRGBToY = ConvertRGBToY_SSE2;
WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2; WebPConvertBGRToY = ConvertBGRToY_SSE2;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2; WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
} }

View File

@@ -300,7 +300,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
// rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order. // Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar_SSE41( static WEBP_INLINE void RGBPackedToPlanar_SSE41(
const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) { const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0)); const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16)); const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
@@ -343,14 +343,35 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
#undef WEBP_SSE41_SHUFF #undef WEBP_SSE41_SHUFF
// Convert 8 packed ARGB to r[], g[], b[] static WEBP_INLINE void RGBAPackedToRGBPlanar_SSE41(
static WEBP_INLINE void RGB32PackedToPlanar_SSE41( const uint8_t* WEBP_RESTRICT const rgba, __m128i* const rgb /*in[6]*/) {
__m128i a0 = _mm_loadu_si128((const __m128i*)(rgba + 0));
__m128i a1 = _mm_loadu_si128((const __m128i*)(rgba + 16));
__m128i a2 = _mm_loadu_si128((const __m128i*)(rgba + 32));
__m128i a3 = _mm_loadu_si128((const __m128i*)(rgba + 48));
__m128i a4 = _mm_loadu_si128((const __m128i*)(rgba + 64));
__m128i a5 = _mm_loadu_si128((const __m128i*)(rgba + 80));
__m128i a6 = _mm_loadu_si128((const __m128i*)(rgba + 96));
__m128i a7 = _mm_loadu_si128((const __m128i*)(rgba + 112));
VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);
rgb[0] = a3;
rgb[2] = a2;
rgb[4] = a1;
VP8L32bToPlanar_SSE41(&a4, &a5, &a6, &a7);
rgb[1] = a7;
rgb[3] = a6;
rgb[5] = a5;
}
// Unpack the 8b input argbargbargb... as contiguous registers:
// 0r0r0r... 0r0r0r... 0g0g0g... 0g0g0g... 0b0b0b... 0b0b0b....
static WEBP_INLINE void ARGBPackedToRGBPlanar16_SSE41(
const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) { const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0); __m128i a0 = _mm_loadu_si128((const __m128i*)(argb + 0));
__m128i a1 = LOAD_16(argb + 4); __m128i a1 = _mm_loadu_si128((const __m128i*)(argb + 4));
__m128i a2 = LOAD_16(argb + 8); __m128i a2 = _mm_loadu_si128((const __m128i*)(argb + 8));
__m128i a3 = LOAD_16(argb + 12); __m128i a3 = _mm_loadu_si128((const __m128i*)(argb + 12));
VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3); VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);
rgb[0] = _mm_unpacklo_epi8(a1, zero); rgb[0] = _mm_unpacklo_epi8(a1, zero);
rgb[1] = _mm_unpackhi_epi8(a1, zero); rgb[1] = _mm_unpackhi_epi8(a1, zero);
@@ -379,10 +400,10 @@ static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
} while (0) } while (0)
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A)) #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R, static WEBP_INLINE void ConvertRGBToYImpl_SSE41(const __m128i* const R,
const __m128i* const G, const __m128i* const G,
const __m128i* const B, const __m128i* const B,
__m128i* const Y) { __m128i* const Y) {
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384); const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
const __m128i kGB_y = MK_CST_16(16384, 6420); const __m128i kGB_y = MK_CST_16(16384, 6420);
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF); const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
@@ -418,72 +439,70 @@ static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
#undef MK_CST_16 #undef MK_CST_16
#undef TRANSFORM #undef TRANSFORM
static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb, static WEBP_INLINE void ConvertRGBToYHelper_SSE41(
uint8_t* WEBP_RESTRICT y, int width) { const __m128i* const rgb_plane /*in[6]*/, int swap_rb, int* i,
uint8_t* WEBP_RESTRICT y) {
int j;
for (j = 0; j < 2; ++j, *i += 16) {
const __m128i zero = _mm_setzero_si128();
__m128i r, g, b, Y0, Y1;
// Convert to 16-bit Y.
r = _mm_unpacklo_epi8(rgb_plane[(swap_rb ? 4 : 0) + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[(swap_rb ? 0 : 4) + j], zero);
ConvertRGBToYImpl_SSE41(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[(swap_rb ? 4 : 0) + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[(swap_rb ? 0 : 4) + j], zero);
ConvertRGBToYImpl_SSE41(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + *i);
}
}
static void ConvertRGBToY_SSE41(const uint8_t* WEBP_RESTRICT rgb,
uint8_t* WEBP_RESTRICT y, int width, int step) {
const int max_width = width & ~31; const int max_width = width & ~31;
int i; int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) { __m128i rgb_plane[6];
__m128i rgb_plane[6]; if (step == 3) {
int j; for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
RGBPackedToPlanar_SSE41(rgb, rgb_plane);
RGB24PackedToPlanar_SSE41(rgb, rgb_plane); ConvertRGBToYHelper_SSE41(rgb_plane, /*swap_rb=*/0, &i, y);
}
for (j = 0; j < 2; ++j, i += 16) { } else {
const __m128i zero = _mm_setzero_si128(); for (i = 0; i < max_width; rgb += 4 * 16 * 2) {
__m128i r, g, b, Y0, Y1; RGBAPackedToRGBPlanar_SSE41(rgb, rgb_plane);
ConvertRGBToYHelper_SSE41(rgb_plane, /*swap_rb=*/0, &i, y);
// Convert to 16-bit Y.
r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
} }
} }
for (; i < width; ++i, rgb += 3) { // left-over for (; i < width; ++i, rgb += step) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
} }
} }
static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr, static void ConvertBGRToY_SSE41(const uint8_t* WEBP_RESTRICT bgr,
uint8_t* WEBP_RESTRICT y, int width) { uint8_t* WEBP_RESTRICT y, int width, int step) {
const int max_width = width & ~31; const int max_width = width & ~31;
int i; int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) { __m128i bgr_plane[6];
__m128i bgr_plane[6]; if (step == 3) {
int j; for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
RGBPackedToPlanar_SSE41(bgr, bgr_plane);
RGB24PackedToPlanar_SSE41(bgr, bgr_plane); ConvertRGBToYHelper_SSE41(bgr_plane, /*swap_rb=*/1, &i, y);
}
for (j = 0; j < 2; ++j, i += 16) { } else {
const __m128i zero = _mm_setzero_si128(); for (i = 0; i < max_width; bgr += 4 * 16 * 2) {
__m128i r, g, b, Y0, Y1; RGBAPackedToRGBPlanar_SSE41(bgr, bgr_plane);
ConvertRGBToYHelper_SSE41(bgr_plane, /*swap_rb=*/1, &i, y);
// Convert to 16-bit Y.
b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
} }
} }
for (; i < width; ++i, bgr += 3) { // left-over for (; i < width; ++i, bgr += step) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
} }
} }
@@ -494,9 +513,9 @@ static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb,
int i; int i;
for (i = 0; i < max_width; i += 16) { for (i = 0; i < max_width; i += 16) {
__m128i Y0, Y1, rgb[6]; __m128i Y0, Y1, rgb[6];
RGB32PackedToPlanar_SSE41(&argb[i], rgb); ARGBPackedToRGBPlanar16_SSE41(&argb[i], rgb);
ConvertRGBToY_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0); ConvertRGBToYImpl_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0);
ConvertRGBToY_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1); ConvertRGBToYImpl_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i); STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
} }
for (; i < width; ++i) { // left-over for (; i < width; ++i) { // left-over
@@ -525,13 +544,13 @@ static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb,
int i; int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) { for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
__m128i rgb[6], U0, V0, U1, V1; __m128i rgb[6], U0, V0, U1, V1;
RGB32PackedToPlanar_SSE41(&argb[i], rgb); ARGBPackedToRGBPlanar16_SSE41(&argb[i], rgb);
HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]); HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]); HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]); HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb); ARGBPackedToRGBPlanar16_SSE41(&argb[i + 16], rgb);
HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]); HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]); HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]); HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
@@ -615,8 +634,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) {
WebPConvertARGBToY = ConvertARGBToY_SSE41; WebPConvertARGBToY = ConvertARGBToY_SSE41;
WebPConvertARGBToUV = ConvertARGBToUV_SSE41; WebPConvertARGBToUV = ConvertARGBToUV_SSE41;
WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41; WebPConvertRGBToY = ConvertRGBToY_SSE41;
WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41; WebPConvertBGRToY = ConvertBGRToY_SSE41;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41; WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41;
} }

View File

@@ -519,7 +519,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
} }
} else { } else {
const int uv_width = (width + 1) >> 1; const int uv_width = (width + 1) >> 1;
int use_dsp = (step == 3); // use special function in this case
// temporary storage for accumulated R/G/B values during conversion to U/V // temporary storage for accumulated R/G/B values during conversion to U/V
uint16_t* const tmp_rgb = uint16_t* const tmp_rgb =
(uint16_t*)WebPSafeMalloc(4 * uv_width, sizeof(*tmp_rgb)); (uint16_t*)WebPSafeMalloc(4 * uv_width, sizeof(*tmp_rgb));
@@ -533,7 +532,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
if (dithering > 0.) { if (dithering > 0.) {
VP8InitRandom(&base_rg, dithering); VP8InitRandom(&base_rg, dithering);
rg = &base_rg; rg = &base_rg;
use_dsp = 0; // can't use dsp in this case
} }
WebPInitConvertARGBToYUV(); WebPInitConvertARGBToYUV();
InitGammaTables(); InitGammaTables();
@@ -545,15 +543,15 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
// Downsample Y/U/V planes, two rows at a time // Downsample Y/U/V planes, two rows at a time
for (y = 0; y < (height >> 1); ++y) { for (y = 0; y < (height >> 1); ++y) {
int rows_have_alpha = has_alpha; int rows_have_alpha = has_alpha;
if (use_dsp) { if (rg == NULL) {
if (is_rgb) { if (is_rgb) {
WebPConvertRGB24ToY(r_ptr, dst_y, width); WebPConvertRGBToY(r_ptr, dst_y, width, step);
WebPConvertRGB24ToY(r_ptr + rgb_stride, WebPConvertRGBToY(r_ptr + rgb_stride, dst_y + picture->y_stride,
dst_y + picture->y_stride, width); width, step);
} else { } else {
WebPConvertBGR24ToY(b_ptr, dst_y, width); WebPConvertBGRToY(b_ptr, dst_y, width, step);
WebPConvertBGR24ToY(b_ptr + rgb_stride, WebPConvertBGRToY(b_ptr + rgb_stride, dst_y + picture->y_stride,
dst_y + picture->y_stride, width); width, step);
} }
} else { } else {
ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg); ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
@@ -589,11 +587,11 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
} }
if (height & 1) { // extra last row if (height & 1) { // extra last row
int row_has_alpha = has_alpha; int row_has_alpha = has_alpha;
if (use_dsp) { if (rg == NULL) {
if (r_ptr < b_ptr) { if (is_rgb) {
WebPConvertRGB24ToY(r_ptr, dst_y, width); WebPConvertRGBToY(r_ptr, dst_y, width, step);
} else { } else {
WebPConvertBGR24ToY(b_ptr, dst_y, width); WebPConvertBGRToY(b_ptr, dst_y, width, step);
} }
} else { } else {
ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg); ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);