From c90a902effec4ab53e3cbdcbb47c06a6161ebade Mon Sep 17 00:00:00 2001
From: Urvang Joshi
Date: Wed, 2 Apr 2014 12:21:20 -0700
Subject: [PATCH] Add SSE2 version of forward cross-color transform

Encoding speed is roughly the same.

Change-Id: I6b976d0eb24e1847714e719762cb8403768da66c
---
 src/dsp/lossless.c      |  9 +++++----
 src/dsp/lossless.h      |  7 +++++--
 src/dsp/lossless_sse2.c | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c
index 9625d4dd..ed9a815a 100644
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -833,9 +833,8 @@ static WEBP_INLINE uint32_t MultipliersToColorCode(
          m->green_to_red_;
 }
 
-static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
-                                       uint32_t* data,
-                                       int num_pixels) {
+void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
+                          int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
     const uint32_t argb = data[i];
@@ -1078,7 +1077,7 @@ static void CopyTileWithColorTransform(int xsize, int ysize,
   int yscan = GetMin(max_tile_size, ysize - tile_y);
   argb += tile_y * xsize + tile_x;
   while (yscan-- > 0) {
-    TransformColor(&color_transform, argb, xscan);
+    VP8LTransformColor(&color_transform, argb, xscan);
     argb += xsize;
   }
 }
@@ -1465,6 +1464,7 @@ VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
 VP8LPredictorFunc VP8LPredictors[16];
 
+VP8LTransformColorFunc VP8LTransformColor;
 VP8LTransformColorFunc VP8LTransformColorInverse;
 
 VP8LConvertFunc VP8LConvertBGRAToRGB;
@@ -1482,6 +1482,7 @@ void VP8LDspInit(void) {
   VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
   VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
 
+  VP8LTransformColor = VP8LTransformColor_C;
   VP8LTransformColorInverse = VP8LTransformColorInverse_C;
 
   VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h
index 0abb6558..553793c8 100644
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@@ -41,6 +41,7 @@ typedef struct {
 } VP8LMultipliers;
 typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
                                        uint32_t* argb_data, int num_pixels);
+extern VP8LTransformColorFunc VP8LTransformColor;
 extern VP8LTransformColorFunc VP8LTransformColorInverse;
 
 typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
@@ -52,8 +53,10 @@ extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
 extern VP8LConvertFunc VP8LConvertBGRAToBGR;
 
 // Expose some C-only fallback functions
-extern void VP8LTransformColorInverse_C(
-    const VP8LMultipliers* const m, uint32_t* data, int num_pixels);
+extern void VP8LTransformColor_C(const VP8LMultipliers* const m,
+                                 uint32_t* data, int num_pixels);
+extern void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
+                                        uint32_t* data, int num_pixels);
 
 extern void VP8LConvertBGRAToRGB_C(const uint32_t* src,
                                    int num_pixels, uint8_t* dst);
diff --git a/src/dsp/lossless_sse2.c b/src/dsp/lossless_sse2.c
index 3e6ad9c3..0e37cf1f 100644
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@@ -171,6 +171,45 @@ static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
   return _mm_srli_epi32(signed_mult, 5);
 }
 
+static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
+                                       uint32_t* argb_data,
+                                       int num_pixels) {
+  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
+  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
+  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
+
+  int i;
+
+  for (i = 0; i + 4 <= num_pixels; i += 4) {  // 4 pixels per iteration
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
+    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
+    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
+    const __m128i lower_8bit_mask  = _mm_set1_epi32(0x000000ff);
+    const __m128i ag = _mm_and_si128(in, alpha_green_mask);  // kept unchanged
+    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
+    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
+    const __m128i b = in;  // blue = low byte; upper bytes masked off below
+
+    const __m128i r_delta = ColorTransformDelta(g_to_r, g);      // red
+    const __m128i r_new =
+        _mm_and_si128(_mm_sub_epi32(r, r_delta), lower_8bit_mask);  // mod 256
+    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);  // to bits 16..23
+
+    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);    // blue
+    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r);  // original red
+    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
+    const __m128i b_new =
+        _mm_and_si128(_mm_sub_epi32(b, b_delta), lower_8bit_mask);  // mod 256
+
+    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);  // in place
+  }
+
+  // Let the C version handle the pixels the 4-wide loop did not cover.
+  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
+
 static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
                                               uint32_t* argb_data,
                                               int num_pixels) {
@@ -359,6 +398,7 @@ void VP8LDspInitSSE2(void) {
   VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
   VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
 
+  VP8LTransformColor = TransformColor;
   VP8LTransformColorInverse = TransformColorInverse;
 
   VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;