yuv_sse2: harmonize function suffixes

BUG=webp:355

Change-Id: I02a66f7446c75a10c3ce4766235e5767617d0dce
This commit is contained in:
James Zern 2017-10-08 13:59:08 -07:00
parent 6921aa6f0c
commit d361a6a733

View File

@ -26,12 +26,12 @@
// R = (19077 * y + 26149 * v - 14234) >> 6 // R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6 // B = (19077 * y + 33050 * u - 17685) >> 6
static void ConvertYUV444ToRGB(const __m128i* const Y0, static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
const __m128i* const U0, const __m128i* const U0,
const __m128i* const V0, const __m128i* const V0,
__m128i* const R, __m128i* const R,
__m128i* const G, __m128i* const G,
__m128i* const B) { __m128i* const B) {
const __m128i k19077 = _mm_set1_epi16(19077); const __m128i k19077 = _mm_set1_epi16(19077);
const __m128i k26149 = _mm_set1_epi16(26149); const __m128i k26149 = _mm_set1_epi16(26149);
const __m128i k14234 = _mm_set1_epi16(14234); const __m128i k14234 = _mm_set1_epi16(14234);
@ -66,13 +66,13 @@ static void ConvertYUV444ToRGB(const __m128i* const Y0,
} }
// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) { static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src)); return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
} }
// Load and replicate the U/V samples // Load and replicate the U/V samples
static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) { static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
@ -80,29 +80,33 @@ static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
} }
// Convert 8 samples of YUV444 to R/G/B // Convert 8 samples of YUV444 to R/G/B
static void YUV444ToRGB(const uint8_t* const y, static void YUV444ToRGB_SSE2(const uint8_t* const y,
const uint8_t* const u, const uint8_t* const u,
const uint8_t* const v, const uint8_t* const v,
__m128i* const R, __m128i* const G, __m128i* const B) { __m128i* const R, __m128i* const G,
const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v); __m128i* const B) {
ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B); const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
V0 = Load_HI_16_SSE2(v);
ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
} }
// Convert 8 samples of YUV420 to R/G/B // Convert 8 samples of YUV420 to R/G/B
static void YUV420ToRGB(const uint8_t* const y, static void YUV420ToRGB_SSE2(const uint8_t* const y,
const uint8_t* const u, const uint8_t* const u,
const uint8_t* const v, const uint8_t* const v,
__m128i* const R, __m128i* const G, __m128i* const B) { __m128i* const R, __m128i* const G,
const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v); __m128i* const B) {
ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B); const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
V0 = Load_UV_HI_8_SSE2(v);
ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
} }
// Pack R/G/B/A results into 32b output. // Pack R/G/B/A results into 32b output.
static WEBP_INLINE void PackAndStore4(const __m128i* const R, static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
const __m128i* const G, const __m128i* const G,
const __m128i* const B, const __m128i* const B,
const __m128i* const A, const __m128i* const A,
uint8_t* const dst) { uint8_t* const dst) {
const __m128i rb = _mm_packus_epi16(*R, *B); const __m128i rb = _mm_packus_epi16(*R, *B);
const __m128i ga = _mm_packus_epi16(*G, *A); const __m128i ga = _mm_packus_epi16(*G, *A);
const __m128i rg = _mm_unpacklo_epi8(rb, ga); const __m128i rg = _mm_unpacklo_epi8(rb, ga);
@ -114,11 +118,11 @@ static WEBP_INLINE void PackAndStore4(const __m128i* const R,
} }
// Pack R/G/B/A results into 16b output. // Pack R/G/B/A results into 16b output.
static WEBP_INLINE void PackAndStore4444(const __m128i* const R, static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
const __m128i* const G, const __m128i* const G,
const __m128i* const B, const __m128i* const B,
const __m128i* const A, const __m128i* const A,
uint8_t* const dst) { uint8_t* const dst) {
#if (WEBP_SWAP_16BIT_CSP == 0) #if (WEBP_SWAP_16BIT_CSP == 0)
const __m128i rg0 = _mm_packus_epi16(*R, *G); const __m128i rg0 = _mm_packus_epi16(*R, *G);
const __m128i ba0 = _mm_packus_epi16(*B, *A); const __m128i ba0 = _mm_packus_epi16(*B, *A);
@ -136,10 +140,10 @@ static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
} }
// Pack R/G/B results into 16b output. // Pack R/G/B results into 16b output.
static WEBP_INLINE void PackAndStore565(const __m128i* const R, static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
const __m128i* const G, const __m128i* const G,
const __m128i* const B, const __m128i* const B,
uint8_t* const dst) { uint8_t* const dst) {
const __m128i r0 = _mm_packus_epi16(*R, *R); const __m128i r0 = _mm_packus_epi16(*R, *R);
const __m128i g0 = _mm_packus_epi16(*G, *G); const __m128i g0 = _mm_packus_epi16(*G, *G);
const __m128i b0 = _mm_packus_epi16(*B, *B); const __m128i b0 = _mm_packus_epi16(*B, *B);
@ -160,10 +164,10 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
// Pack the planar buffers // Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1, static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
__m128i* const in2, __m128i* const in3, __m128i* const in2, __m128i* const in3,
__m128i* const in4, __m128i* const in5, __m128i* const in4, __m128i* const in5,
uint8_t* const rgb) { uint8_t* const rgb) {
// The input is 6 registers of sixteen 8b but for the sake of explanation, // The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values. // let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it // To pack, we will keep taking one every two 8b integer and move it
@ -192,8 +196,8 @@ void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
int n; int n;
for (n = 0; n < 32; n += 8, dst += 32) { for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B; __m128i R, G, B;
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4(&R, &G, &B, &kAlpha, dst); PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
} }
} }
@ -203,8 +207,8 @@ void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
int n; int n;
for (n = 0; n < 32; n += 8, dst += 32) { for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B; __m128i R, G, B;
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4(&B, &G, &R, &kAlpha, dst); PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
} }
} }
@ -214,8 +218,8 @@ void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
int n; int n;
for (n = 0; n < 32; n += 8, dst += 32) { for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B; __m128i R, G, B;
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4(&kAlpha, &R, &G, &B, dst); PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
} }
} }
@ -225,8 +229,8 @@ void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
int n; int n;
for (n = 0; n < 32; n += 8, dst += 16) { for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B; __m128i R, G, B;
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4444(&R, &G, &B, &kAlpha, dst); PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
} }
} }
@ -235,8 +239,8 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
int n; int n;
for (n = 0; n < 32; n += 8, dst += 16) { for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B; __m128i R, G, B;
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore565(&R, &G, &B, dst); PackAndStore565_SSE2(&R, &G, &B, dst);
} }
} }
@ -245,10 +249,10 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB. // Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1); rgb0 = _mm_packus_epi16(R0, R1);
@ -259,7 +263,7 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
rgb5 = _mm_packus_epi16(B2, B3); rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB. // Pack as RGBRGBRGBRGB.
PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
} }
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
@ -267,10 +271,10 @@ void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR. // Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1); bgr0 = _mm_packus_epi16(B0, B1);
@ -281,7 +285,7 @@ void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
bgr5= _mm_packus_epi16(R2, R3); bgr5= _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR. // Pack as BGRBGRBGRBGR.
PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
} }
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
@ -294,8 +298,8 @@ static void YuvToRgbaRow_SSE2(const uint8_t* y,
int n; int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) { for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B; __m128i R, G, B;
YUV420ToRGB(y, u, v, &R, &G, &B); YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
PackAndStore4(&R, &G, &B, &kAlpha, dst); PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
y += 8; y += 8;
u += 4; u += 4;
v += 4; v += 4;
@ -316,8 +320,8 @@ static void YuvToBgraRow_SSE2(const uint8_t* y,
int n; int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) { for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B; __m128i R, G, B;
YUV420ToRGB(y, u, v, &R, &G, &B); YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
PackAndStore4(&B, &G, &R, &kAlpha, dst); PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
y += 8; y += 8;
u += 4; u += 4;
v += 4; v += 4;
@ -338,8 +342,8 @@ static void YuvToArgbRow_SSE2(const uint8_t* y,
int n; int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) { for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B; __m128i R, G, B;
YUV420ToRGB(y, u, v, &R, &G, &B); YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
PackAndStore4(&kAlpha, &R, &G, &B, dst); PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
y += 8; y += 8;
u += 4; u += 4;
v += 4; v += 4;
@ -361,10 +365,10 @@ static void YuvToRgbRow_SSE2(const uint8_t* y,
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB. // Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1); rgb0 = _mm_packus_epi16(R0, R1);
@ -375,7 +379,7 @@ static void YuvToRgbRow_SSE2(const uint8_t* y,
rgb5 = _mm_packus_epi16(B2, B3); rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB. // Pack as RGBRGBRGBRGB.
PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
y += 32; y += 32;
u += 16; u += 16;
@ -398,10 +402,10 @@ static void YuvToBgrRow_SSE2(const uint8_t* y,
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR. // Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1); bgr0 = _mm_packus_epi16(B0, B1);
@ -412,7 +416,7 @@ static void YuvToBgrRow_SSE2(const uint8_t* y,
bgr5 = _mm_packus_epi16(R2, R3); bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR. // Pack as BGRBGRBGRBGR.
PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
y += 32; y += 32;
u += 16; u += 16;
@ -450,7 +454,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
// Function that inserts a value of the second half of the in buffer in between // Function that inserts a value of the second half of the in buffer in between
// every two char of the first half. // every two char of the first half.
static WEBP_INLINE void RGB24PackedToPlanarHelper( static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) { const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
out[0] = _mm_unpacklo_epi8(in[0], in[3]); out[0] = _mm_unpacklo_epi8(in[0], in[3]);
out[1] = _mm_unpackhi_epi8(in[0], in[3]); out[1] = _mm_unpackhi_epi8(in[0], in[3]);
@ -463,8 +467,8 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper(
// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
// rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order. // Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb, static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
__m128i* const out /*out[6]*/) { const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
__m128i tmp[6]; __m128i tmp[6];
tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0)); tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16)); tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
@ -473,16 +477,16 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64)); tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80)); tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
RGB24PackedToPlanarHelper(tmp, out); RGB24PackedToPlanarHelper_SSE2(tmp, out);
RGB24PackedToPlanarHelper(out, tmp); RGB24PackedToPlanarHelper_SSE2(out, tmp);
RGB24PackedToPlanarHelper(tmp, out); RGB24PackedToPlanarHelper_SSE2(tmp, out);
RGB24PackedToPlanarHelper(out, tmp); RGB24PackedToPlanarHelper_SSE2(out, tmp);
RGB24PackedToPlanarHelper(tmp, out); RGB24PackedToPlanarHelper_SSE2(tmp, out);
} }
// Convert 8 packed ARGB to r[], g[], b[] // Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb, static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
__m128i* const rgb /*in[6]*/) { __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0); __m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4); __m128i a1 = LOAD_16(argb + 4);
@ -516,10 +520,10 @@ static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
} while (0) } while (0)
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A)) #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
static WEBP_INLINE void ConvertRGBToY(const __m128i* const R, static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
const __m128i* const G, const __m128i* const G,
const __m128i* const B, const __m128i* const B,
__m128i* const Y) { __m128i* const Y) {
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384); const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
const __m128i kGB_y = MK_CST_16(16384, 6420); const __m128i kGB_y = MK_CST_16(16384, 6420);
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF); const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
@ -531,10 +535,11 @@ static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y); TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
} }
static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R, static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
const __m128i* const G, const __m128i* const G,
const __m128i* const B, const __m128i* const B,
__m128i* const U, __m128i* const V) { __m128i* const U,
__m128i* const V) {
const __m128i kRG_u = MK_CST_16(-9719, -19081); const __m128i kRG_u = MK_CST_16(-9719, -19081);
const __m128i kGB_u = MK_CST_16(0, 28800); const __m128i kGB_u = MK_CST_16(0, 28800);
const __m128i kRG_v = MK_CST_16(28800, 0); const __m128i kRG_v = MK_CST_16(28800, 0);
@ -561,7 +566,7 @@ static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
__m128i rgb_plane[6]; __m128i rgb_plane[6];
int j; int j;
RGB24PackedToPlanar(rgb, rgb_plane); RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
for (j = 0; j < 2; ++j, i += 16) { for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
@ -571,13 +576,13 @@ static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero); r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero); g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero); b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY(&r, &g, &b, &Y0); ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y. // Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero); r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero); g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero); b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY(&r, &g, &b, &Y1); ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store. // Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i); STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@ -595,7 +600,7 @@ static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
__m128i bgr_plane[6]; __m128i bgr_plane[6];
int j; int j;
RGB24PackedToPlanar(bgr, bgr_plane); RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
for (j = 0; j < 2; ++j, i += 16) { for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
@ -605,13 +610,13 @@ static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero); b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero); g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero); r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY(&r, &g, &b, &Y0); ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y. // Convert to 16-bit Y.
b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero); b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero); g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero); r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY(&r, &g, &b, &Y1); ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store. // Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i); STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@ -627,9 +632,9 @@ static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
int i; int i;
for (i = 0; i < max_width; i += 16) { for (i = 0; i < max_width; i += 16) {
__m128i Y0, Y1, rgb[6]; __m128i Y0, Y1, rgb[6];
RGB32PackedToPlanar(&argb[i], rgb); RGB32PackedToPlanar_SSE2(&argb[i], rgb);
ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0); ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1); ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i); STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
} }
for (; i < width; ++i) { // left-over for (; i < width; ++i) { // left-over
@ -641,8 +646,9 @@ static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
// Horizontal add (doubled) of two 16b values, result is 16b. // Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ... // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
static void HorizontalAddPack(const __m128i* const A, const __m128i* const B, static void HorizontalAddPack_SSE2(const __m128i* const A,
__m128i* const out) { const __m128i* const B,
__m128i* const out) {
const __m128i k2 = _mm_set1_epi16(2); const __m128i k2 = _mm_set1_epi16(2);
const __m128i C = _mm_madd_epi16(*A, k2); const __m128i C = _mm_madd_epi16(*A, k2);
const __m128i D = _mm_madd_epi16(*B, k2); const __m128i D = _mm_madd_epi16(*B, k2);
@ -656,17 +662,17 @@ static void ConvertARGBToUV_SSE2(const uint32_t* argb,
int i; int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) { for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
__m128i rgb[6], U0, V0, U1, V1; __m128i rgb[6], U0, V0, U1, V1;
RGB32PackedToPlanar(&argb[i], rgb); RGB32PackedToPlanar_SSE2(&argb[i], rgb);
HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]); HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]); HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]); HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
RGB32PackedToPlanar(&argb[i + 16], rgb); RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]); HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]); HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]); HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1); ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
U0 = _mm_packus_epi16(U0, U1); U0 = _mm_packus_epi16(U0, U1);
V0 = _mm_packus_epi16(V0, V1); V0 = _mm_packus_epi16(V0, V1);
@ -685,10 +691,9 @@ static void ConvertARGBToUV_SSE2(const uint32_t* argb,
} }
// Convert 16 packed ARGB 16b-values to r[], g[], b[] // Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx, static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
__m128i* const r, const uint16_t* const rgbx,
__m128i* const g, __m128i* const r, __m128i* const g, __m128i* const b) {
__m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ... const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
@ -713,10 +718,10 @@ static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
const uint16_t* const last_rgb = rgb + 4 * max_width; const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) { while (rgb < last_rgb) {
__m128i r, g, b, U0, V0, U1, V1; __m128i r, g, b, U0, V0, U1, V1;
RGBA32PackedToPlanar_16b(rgb + 0, &r, &g, &b); RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b);
ConvertRGBToUV(&r, &g, &b, &U0, &V0); ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b); RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
ConvertRGBToUV(&r, &g, &b, &U1, &V1); ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
STORE_16(_mm_packus_epi16(U0, U1), u); STORE_16(_mm_packus_epi16(U0, U1), u);
STORE_16(_mm_packus_epi16(V0, V1), v); STORE_16(_mm_packus_epi16(V0, V1), v);
u += 16; u += 16;