Patch summary (free text; patch tools skip everything before the first
"diff --git" header):
  * src/dsp/lossless.h: the Multipliers struct becomes the shared type
    VP8LMultipliers; a VP8LTransformColorFunc pointer type, the
    VP8LTransformColorInverse pointer and the VP8LTransformColorInverse_C
    fallback are declared.
  * src/dsp/lossless.c: TransformColor and the inverse transform now process a
    run of num_pixels pixels in place instead of one pixel per call;
    ColorSpaceInverseTransform calls through the VP8LTransformColorInverse
    pointer, which VP8LDspInit sets to the C version.
  * src/dsp/lossless_sse2.c: adds a 4-pixels-per-iteration SSE2 inverse color
    transform (left-over pixels go to the C version) and installs it in
    VP8LDspInitSSE2.

diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c
index cde4a8de..9625d4dd 100644
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -807,15 +807,7 @@ void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
   }
 }
 
-typedef struct {
-  // Note: the members are uint8_t, so that any negative values are
-  // automatically converted to "mod 256" values.
-  uint8_t green_to_red_;
-  uint8_t green_to_blue_;
-  uint8_t red_to_blue_;
-} Multipliers;
-
-static WEBP_INLINE void MultipliersClear(Multipliers* m) {
+static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
   m->green_to_red_ = 0;
   m->green_to_blue_ = 0;
   m->red_to_blue_ = 0;
@@ -827,45 +819,55 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
 }
 
 static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
-                                               Multipliers* const m) {
+                                               VP8LMultipliers* const m) {
   m->green_to_red_ = (color_code >> 0) & 0xff;
   m->green_to_blue_ = (color_code >> 8) & 0xff;
   m->red_to_blue_ = (color_code >> 16) & 0xff;
 }
 
-static WEBP_INLINE uint32_t MultipliersToColorCode(const Multipliers* const m) {
+static WEBP_INLINE uint32_t MultipliersToColorCode(
+    const VP8LMultipliers* const m) {
   return 0xff000000u |
          ((uint32_t)(m->red_to_blue_) << 16) |
          ((uint32_t)(m->green_to_blue_) << 8) |
          m->green_to_red_;
 }
 
-static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
-                                           uint32_t argb) {
-  const uint32_t green = argb >> 8;
-  const uint32_t red = argb >> 16;
-  uint32_t new_red = red;
-  uint32_t new_blue = argb;
-  new_red -= ColorTransformDelta(m->green_to_red_, green);
-  new_red &= 0xff;
-  new_blue -= ColorTransformDelta(m->green_to_blue_, green);
-  new_blue -= ColorTransformDelta(m->red_to_blue_, red);
-  new_blue &= 0xff;
-  return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
+                                       uint32_t* data,
+                                       int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const uint32_t argb = data[i];
+    const uint32_t green = argb >> 8;
+    const uint32_t red = argb >> 16;
+    uint32_t new_red = red;
+    uint32_t new_blue = argb;
+    new_red -= ColorTransformDelta(m->green_to_red_, green);
+    new_red &= 0xff;
+    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+    new_blue &= 0xff;
+    data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+  }
 }
 
-static WEBP_INLINE uint32_t TransformColorInverse(const Multipliers* const m,
-                                                  uint32_t argb) {
-  const uint32_t green = argb >> 8;
-  const uint32_t red = argb >> 16;
-  uint32_t new_red = red;
-  uint32_t new_blue = argb;
-  new_red += ColorTransformDelta(m->green_to_red_, green);
-  new_red &= 0xff;
-  new_blue += ColorTransformDelta(m->green_to_blue_, green);
-  new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
-  new_blue &= 0xff;
-  return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
+                                 int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const uint32_t argb = data[i];
+    const uint32_t green = argb >> 8;
+    const uint32_t red = argb >> 16;
+    uint32_t new_red = red;
+    uint32_t new_blue = argb;
+    new_red += ColorTransformDelta(m->green_to_red_, green);
+    new_red &= 0xff;
+    new_blue += ColorTransformDelta(m->green_to_blue_, green);
+    new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
+    new_blue &= 0xff;
+    data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+  }
 }
 
 static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
@@ -898,7 +900,7 @@ static float PredictionCostCrossColor(const int accumulated[256],
 
 static float GetPredictionCostCrossColorRed(
     int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
-    int xsize, Multipliers prev_x, Multipliers prev_y, int green_to_red,
+    int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
     const int accumulated_red_histo[256], const uint32_t* const argb) {
   int all_y;
   int histo[256] = { 0 };
@@ -925,9 +927,9 @@ static float GetPredictionCostCrossColorRed(
 
 static void GetBestGreenToRed(
     int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
-    int xsize, Multipliers prev_x, Multipliers prev_y,
+    int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
     const int accumulated_red_histo[256], const uint32_t* const argb,
-    Multipliers* best_tx) {
+    VP8LMultipliers* const best_tx) {
   int min_green_to_red = -64;
   int max_green_to_red = 64;
   int green_to_red = 0;
@@ -964,8 +966,8 @@ static void GetBestGreenToRed(
 
 static float GetPredictionCostCrossColorBlue(
     int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
-    int xsize, Multipliers prev_x, Multipliers prev_y, int green_to_blue,
-    int red_to_blue, const int accumulated_blue_histo[256],
+    int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+    int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256],
     const uint32_t* const argb) {
   int all_y;
   int histo[256] = { 0 };
@@ -1001,9 +1003,9 @@ static float GetPredictionCostCrossColorBlue(
 
 static void GetBestGreenRedToBlue(
     int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
-    int xsize, Multipliers prev_x, Multipliers prev_y, int quality,
+    int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
     const int accumulated_blue_histo[256], const uint32_t* const argb,
-    Multipliers* best_tx) {
+    VP8LMultipliers* const best_tx) {
   float best_diff = MAX_DIFF_COST;
   float cur_diff;
   const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
@@ -1043,10 +1045,10 @@ static void GetBestGreenRedToBlue(
   }
 }
 
-static Multipliers GetBestColorTransformForTile(
+static VP8LMultipliers GetBestColorTransformForTile(
     int tile_x, int tile_y, int bits,
-    Multipliers prev_x,
-    Multipliers prev_y,
+    VP8LMultipliers prev_x,
+    VP8LMultipliers prev_y,
     int quality, int xsize, int ysize,
     const int accumulated_red_histo[256],
     const int accumulated_blue_histo[256],
@@ -1056,7 +1058,7 @@ static Multipliers GetBestColorTransformForTile(
   const int tile_x_offset = tile_x * max_tile_size;
   const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
   const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
-  Multipliers best_tx;
+  VP8LMultipliers best_tx;
   MultipliersClear(&best_tx);
 
   GetBestGreenToRed(tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
@@ -1070,16 +1072,13 @@ static Multipliers GetBestColorTransformForTile(
 static void CopyTileWithColorTransform(int xsize, int ysize,
                                        int tile_x, int tile_y,
                                        int max_tile_size,
-                                       Multipliers color_transform,
+                                       VP8LMultipliers color_transform,
                                        uint32_t* argb) {
   const int xscan = GetMin(max_tile_size, xsize - tile_x);
   int yscan = GetMin(max_tile_size, ysize - tile_y);
   argb += tile_y * xsize + tile_x;
   while (yscan-- > 0) {
-    int x;
-    for (x = 0; x < xscan; ++x) {
-      argb[x] = TransformColor(&color_transform, argb[x]);
-    }
+    TransformColor(&color_transform, argb, xscan);
     argb += xsize;
   }
 }
@@ -1092,7 +1091,7 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
   int accumulated_red_histo[256] = { 0 };
   int accumulated_blue_histo[256] = { 0 };
   int tile_x, tile_y;
-  Multipliers prev_x, prev_y;
+  VP8LMultipliers prev_x, prev_y;
   MultipliersClear(&prev_y);
   MultipliersClear(&prev_x);
   for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
@@ -1148,6 +1147,7 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
   const int tile_width = 1 << transform->bits_;
   const int mask = tile_width - 1;
   const int safe_width = width & ~mask;
+  const int remaining_width = width - safe_width;
   const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
   int y = y_start;
   const uint32_t* pred_row =
@@ -1155,22 +1155,19 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
 
   while (y < y_end) {
     const uint32_t* pred = pred_row;
-    Multipliers m = { 0, 0, 0 };
-    int x = 0;
-    while (x < safe_width) {
-      int t;
+    VP8LMultipliers m = { 0, 0, 0 };
+    const uint32_t* const data_safe_end = data + safe_width;
+    const uint32_t* const data_end = data + width;
+    while (data < data_safe_end) {
       ColorCodeToMultipliers(*pred++, &m);
-      for (t = 0; t < tile_width; ++t, ++x) {
-        data[x] = TransformColorInverse(&m, data[x]);
-      }
+      VP8LTransformColorInverse(&m, data, tile_width);
+      data += tile_width;
     }
-    if (x < width) {
+    if (data < data_end) {  // Left-overs using C-version.
       ColorCodeToMultipliers(*pred++, &m);
-      for (; x < width; ++x) {
-        data[x] = TransformColorInverse(&m, data[x]);
-      }
+      VP8LTransformColorInverse(&m, data, remaining_width);
+      data += remaining_width;
     }
-    data += width;
     ++y;
     if ((y & mask) == 0) pred_row += tiles_per_row;;
   }
@@ -1468,6 +1465,8 @@ VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
 VP8LPredictorFunc VP8LPredictors[16];
 
+VP8LTransformColorFunc VP8LTransformColorInverse;
+
 VP8LConvertFunc VP8LConvertBGRAToRGB;
 VP8LConvertFunc VP8LConvertBGRAToRGBA;
 VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
@@ -1483,6 +1482,8 @@ void VP8LDspInit(void) {
   VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
   VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
 
+  VP8LTransformColorInverse = VP8LTransformColorInverse_C;
+
   VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
   VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
   VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h
index be644dc0..0abb6558 100644
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@@ -32,6 +32,17 @@ typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
 extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
 
+typedef struct {
+  // Note: the members are uint8_t, so that any negative values are
+  // automatically converted to "mod 256" values.
+  uint8_t green_to_red_;
+  uint8_t green_to_blue_;
+  uint8_t red_to_blue_;
+} VP8LMultipliers;
+typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
+                                       uint32_t* argb_data, int num_pixels);
+extern VP8LTransformColorFunc VP8LTransformColorInverse;
+
 typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
                                 uint8_t* dst);
 extern VP8LConvertFunc VP8LConvertBGRAToRGB;
@@ -41,6 +52,9 @@ extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
 extern VP8LConvertFunc VP8LConvertBGRAToBGR;
 
 // Expose some C-only fallback functions
+extern void VP8LTransformColorInverse_C(
+    const VP8LMultipliers* const m, uint32_t* data, int num_pixels);
+
 extern void VP8LConvertBGRAToRGB_C(const uint32_t* src,
                                    int num_pixels, uint8_t* dst);
 extern void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
diff --git a/src/dsp/lossless_sse2.c b/src/dsp/lossless_sse2.c
index 5481c907..3e6ad9c3 100644
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@@ -13,12 +13,14 @@
 
 #include "./dsp.h"
 
+#include <assert.h>
+
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 #include "./lossless.h"
 
 //------------------------------------------------------------------------------
-// Predictors
+// Predictor Transform
 
 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                                    uint32_t c2) {
@@ -118,7 +120,7 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
 }
 
 //------------------------------------------------------------------------------
-// Colorspace conversion functions
+// Subtract-Green Transform
 
 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
   const __m128i mask = _mm_set1_epi32(0x0000ff00);
@@ -152,6 +154,65 @@ static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
   VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
 }
 
+//------------------------------------------------------------------------------
+// Color Transform
+
+static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
+                                               __m128i color) {
+  // We simulate signed 8-bit multiplication as:
+  // * Left shift the two (8-bit) numbers by 8 bits,
+  // * Perform a 16-bit signed multiplication and retain the higher 16-bits.
+  const __m128i color_pred_shifted = _mm_slli_epi32(color_pred, 8);
+  const __m128i color_shifted = _mm_slli_epi32(color, 8);
+  // Note: This performs multiplication on 8 packed 16-bit numbers, 4 of which
+  // happen to be zeroes.
+  const __m128i signed_mult =
+      _mm_mulhi_epi16(color_pred_shifted, color_shifted);
+  return _mm_srli_epi32(signed_mult, 5);
+}
+
+static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
+                                              uint32_t* argb_data,
+                                              int num_pixels) {
+  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);  // multipliers
+  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
+  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
+
+  int i;
+
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);  // masks
+    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
+    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
+    const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
+    const __m128i ag = _mm_and_si128(in, alpha_green_mask);  // alpha, green
+    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
+    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
+    const __m128i b = in;
+
+    const __m128i r_delta = ColorTransformDelta(g_to_r, g);  // red
+    const __m128i r_new =
+        _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
+    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
+
+    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);  // blue
+    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
+    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
+    const __m128i b_new =
+        _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);
+
+    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+
+  // Fall-back to C-version for left-overs.
+  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Color-space conversion functions
+
 static void ConvertBGRAToRGBA(const uint32_t* src,
                               int num_pixels, uint8_t* dst) {
   const __m128i* in = (const __m128i*)src;
@@ -298,6 +359,8 @@ void VP8LDspInitSSE2(void) {
   VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
   VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
 
+  VP8LTransformColorInverse = TransformColorInverse;
+
   VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
   VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
   VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;