Add SSE2 function for Inverse Cross-color Transform

Lossless decoding is now ~3% faster.

Change-Id: Idafb5c73e5cfb272cc3661d841f79971f9da0743
This commit is contained in:
Urvang Joshi 2014-04-01 15:52:25 -07:00
parent 26029568b7
commit d4813f0cb2
3 changed files with 144 additions and 66 deletions

View File

@ -807,15 +807,7 @@ void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
}
}
// The three per-tile multipliers of the cross-color transform. They can be
// packed into / unpacked from a single 32-bit color code (one byte each).
typedef struct {
// Note: the members are uint8_t, so that any negative values are
// automatically converted to "mod 256" values.
uint8_t green_to_red_;   // scales green to form the delta applied to red
uint8_t green_to_blue_;  // scales green to form a delta applied to blue
uint8_t red_to_blue_;    // scales red to form a delta applied to blue
} Multipliers;
static WEBP_INLINE void MultipliersClear(Multipliers* m) {
static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
m->green_to_red_ = 0;
m->green_to_blue_ = 0;
m->red_to_blue_ = 0;
@ -827,45 +819,55 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
}
// Unpacks a 32-bit color code into the three multipliers of 'm':
//   bits  0..7  -> green_to_red_
//   bits  8..15 -> green_to_blue_
//   bits 16..23 -> red_to_blue_
// Bits 24..31 of 'color_code' are not used.
static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
Multipliers* const m) {
VP8LMultipliers* const m) {
m->green_to_red_ = (color_code >> 0) & 0xff;
m->green_to_blue_ = (color_code >> 8) & 0xff;
m->red_to_blue_ = (color_code >> 16) & 0xff;
}
static WEBP_INLINE uint32_t MultipliersToColorCode(const Multipliers* const m) {
// Packs the three multipliers of 'm' into one 32-bit color code:
// green_to_red_ in bits 0..7, green_to_blue_ in bits 8..15 and red_to_blue_
// in bits 16..23. The top byte is always set to 0xff.
static WEBP_INLINE uint32_t MultipliersToColorCode(
    const VP8LMultipliers* const m) {
  const uint32_t r2b_bits = (uint32_t)(m->red_to_blue_) << 16;
  const uint32_t g2b_bits = (uint32_t)(m->green_to_blue_) << 8;
  const uint32_t g2r_bits = m->green_to_red_;
  return 0xff000000u | r2b_bits | g2b_bits | g2r_bits;
}
static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
uint32_t argb) {
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
uint32_t new_red = red;
uint32_t new_blue = argb;
new_red -= ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
new_blue -= ColorTransformDelta(m->red_to_blue_, red);
new_blue &= 0xff;
return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
// Forward cross-color transform, applied in place to the first 'num_pixels'
// entries of 'data'. For every pixel:
//   red  -= delta(green_to_red_, green)
//   blue -= delta(green_to_blue_, green) + delta(red_to_blue_, red)
// where 'red' on the right-hand side is the original (untransformed) red.
// The alpha and green bytes are left untouched.
static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
                                       uint32_t* data,
                                       int num_pixels) {
  int i = 0;
  while (i < num_pixels) {
    const uint32_t pixel = data[i];
    // Only the low byte of each shifted value matters: ColorTransformDelta()
    // takes its arguments as 8-bit quantities.
    const uint32_t g = pixel >> 8;
    const uint32_t r = pixel >> 16;
    // Unsigned wrap-around is intended; the mask keeps the low 8 bits.
    const uint32_t red_out =
        (r - ColorTransformDelta(m->green_to_red_, g)) & 0xff;
    const uint32_t blue_out =
        (pixel - ColorTransformDelta(m->green_to_blue_, g)
               - ColorTransformDelta(m->red_to_blue_, r)) & 0xff;
    data[i] = (pixel & 0xff00ff00u) | (red_out << 16) | blue_out;
    ++i;
  }
}
static WEBP_INLINE uint32_t TransformColorInverse(const Multipliers* const m,
uint32_t argb) {
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
uint32_t new_red = red;
uint32_t new_blue = argb;
new_red += ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue += ColorTransformDelta(m->green_to_blue_, green);
new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
new_blue &= 0xff;
return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
// Plain-C inverse cross-color transform, applied in place to the first
// 'num_pixels' entries of 'data'. For every pixel:
//   red  += delta(green_to_red_, green)
//   blue += delta(green_to_blue_, green) + delta(red_to_blue_, new red)
// The alpha and green bytes are left untouched.
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
                                 int num_pixels) {
  int i = 0;
  while (i < num_pixels) {
    const uint32_t pixel = data[i];
    const uint32_t g = pixel >> 8;
    // Unsigned wrap-around is intended; the mask keeps the low 8 bits.
    const uint32_t red_out =
        ((pixel >> 16) + ColorTransformDelta(m->green_to_red_, g)) & 0xff;
    // Blue is rebuilt from the *restored* red value, not the stored one.
    const uint32_t blue_out =
        (pixel + ColorTransformDelta(m->green_to_blue_, g)
               + ColorTransformDelta(m->red_to_blue_, red_out)) & 0xff;
    data[i] = (pixel & 0xff00ff00u) | (red_out << 16) | blue_out;
    ++i;
  }
}
static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
@ -898,7 +900,7 @@ static float PredictionCostCrossColor(const int accumulated[256],
static float GetPredictionCostCrossColorRed(
int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
int xsize, Multipliers prev_x, Multipliers prev_y, int green_to_red,
int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
const int accumulated_red_histo[256], const uint32_t* const argb) {
int all_y;
int histo[256] = { 0 };
@ -925,9 +927,9 @@ static float GetPredictionCostCrossColorRed(
static void GetBestGreenToRed(
int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
int xsize, Multipliers prev_x, Multipliers prev_y,
int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
const int accumulated_red_histo[256], const uint32_t* const argb,
Multipliers* best_tx) {
VP8LMultipliers* const best_tx) {
int min_green_to_red = -64;
int max_green_to_red = 64;
int green_to_red = 0;
@ -964,8 +966,8 @@ static void GetBestGreenToRed(
static float GetPredictionCostCrossColorBlue(
int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
int xsize, Multipliers prev_x, Multipliers prev_y, int green_to_blue,
int red_to_blue, const int accumulated_blue_histo[256],
int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256],
const uint32_t* const argb) {
int all_y;
int histo[256] = { 0 };
@ -1001,9 +1003,9 @@ static float GetPredictionCostCrossColorBlue(
static void GetBestGreenRedToBlue(
int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
int xsize, Multipliers prev_x, Multipliers prev_y, int quality,
int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
const int accumulated_blue_histo[256], const uint32_t* const argb,
Multipliers* best_tx) {
VP8LMultipliers* const best_tx) {
float best_diff = MAX_DIFF_COST;
float cur_diff;
const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
@ -1043,10 +1045,10 @@ static void GetBestGreenRedToBlue(
}
}
static Multipliers GetBestColorTransformForTile(
static VP8LMultipliers GetBestColorTransformForTile(
int tile_x, int tile_y, int bits,
Multipliers prev_x,
Multipliers prev_y,
VP8LMultipliers prev_x,
VP8LMultipliers prev_y,
int quality, int xsize, int ysize,
const int accumulated_red_histo[256],
const int accumulated_blue_histo[256],
@ -1056,7 +1058,7 @@ static Multipliers GetBestColorTransformForTile(
const int tile_x_offset = tile_x * max_tile_size;
const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
Multipliers best_tx;
VP8LMultipliers best_tx;
MultipliersClear(&best_tx);
GetBestGreenToRed(tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
@ -1070,16 +1072,13 @@ static Multipliers GetBestColorTransformForTile(
// Applies TransformColor() in place to one tile of 'argb'.
// ('tile_x', 'tile_y') is the pixel position of the tile's first pixel;
// 'xsize' is used as the row stride and, together with 'ysize', to clip the
// tile (via GetMin()) to at most 'max_tile_size' pixels in each direction.
static void CopyTileWithColorTransform(int xsize, int ysize,
int tile_x, int tile_y,
int max_tile_size,
Multipliers color_transform,
VP8LMultipliers color_transform,
uint32_t* argb) {
const int xscan = GetMin(max_tile_size, xsize - tile_x);  // pixels per row
int yscan = GetMin(max_tile_size, ysize - tile_y);  // rows left to process
argb += tile_y * xsize + tile_x;  // move to the first pixel of the tile
while (yscan-- > 0) {
int x;
for (x = 0; x < xscan; ++x) {
argb[x] = TransformColor(&color_transform, argb[x]);
}
TransformColor(&color_transform, argb, xscan);
argb += xsize;  // advance to the same column of the next row
}
}
@ -1092,7 +1091,7 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
int accumulated_red_histo[256] = { 0 };
int accumulated_blue_histo[256] = { 0 };
int tile_x, tile_y;
Multipliers prev_x, prev_y;
VP8LMultipliers prev_x, prev_y;
MultipliersClear(&prev_y);
MultipliersClear(&prev_x);
for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
@ -1148,6 +1147,7 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
const int tile_width = 1 << transform->bits_;
const int mask = tile_width - 1;
const int safe_width = width & ~mask;
const int remaining_width = width - safe_width;
const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
int y = y_start;
const uint32_t* pred_row =
@ -1155,22 +1155,19 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
while (y < y_end) {
const uint32_t* pred = pred_row;
Multipliers m = { 0, 0, 0 };
int x = 0;
while (x < safe_width) {
int t;
VP8LMultipliers m = { 0, 0, 0 };
const uint32_t* const data_safe_end = data + safe_width;
const uint32_t* const data_end = data + width;
while (data < data_safe_end) {
ColorCodeToMultipliers(*pred++, &m);
for (t = 0; t < tile_width; ++t, ++x) {
data[x] = TransformColorInverse(&m, data[x]);
}
VP8LTransformColorInverse(&m, data, tile_width);
data += tile_width;
}
if (x < width) {
if (data < data_end) { // Left-overs using C-version.
ColorCodeToMultipliers(*pred++, &m);
for (; x < width; ++x) {
data[x] = TransformColorInverse(&m, data[x]);
}
VP8LTransformColorInverse(&m, data, remaining_width);
data += remaining_width;
}
data += width;
++y;
if ((y & mask) == 0) pred_row += tiles_per_row;;
}
@ -1468,6 +1465,8 @@ VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
VP8LPredictorFunc VP8LPredictors[16];
VP8LTransformColorFunc VP8LTransformColorInverse;
VP8LConvertFunc VP8LConvertBGRAToRGB;
VP8LConvertFunc VP8LConvertBGRAToRGBA;
VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
@ -1483,6 +1482,8 @@ void VP8LDspInit(void) {
VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
VP8LTransformColorInverse = VP8LTransformColorInverse_C;
VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;

View File

@ -32,6 +32,17 @@ typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
// The three multipliers of the cross-color transform, shared between the C
// and the SSE2 implementations.
typedef struct {
// Note: the members are uint8_t, so that any negative values are
// automatically converted to "mod 256" values.
uint8_t green_to_red_;   // scales green to form the delta applied to red
uint8_t green_to_blue_;  // scales green to form a delta applied to blue
uint8_t red_to_blue_;    // scales red to form a delta applied to blue
} VP8LMultipliers;
// Signature of a color-transform routine: processes 'num_pixels' pixels of
// 'argb_data' in place, using the multipliers 'm'.
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels);
// Inverse color transform entry point. VP8LDspInit() sets it to
// VP8LTransformColorInverse_C; VP8LDspInitSSE2() replaces it with the SSE2
// implementation.
extern VP8LTransformColorFunc VP8LTransformColorInverse;
typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
uint8_t* dst);
extern VP8LConvertFunc VP8LConvertBGRAToRGB;
@ -41,6 +52,9 @@ extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
extern VP8LConvertFunc VP8LConvertBGRAToBGR;
// Expose some C-only fallback functions
extern void VP8LTransformColorInverse_C(
const VP8LMultipliers* const m, uint32_t* data, int num_pixels);
extern void VP8LConvertBGRAToRGB_C(const uint32_t* src,
int num_pixels, uint8_t* dst);
extern void VP8LConvertBGRAToRGBA_C(const uint32_t* src,

View File

@ -13,12 +13,14 @@
#include "./dsp.h"
#include <assert.h>
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
#include "./lossless.h"
//------------------------------------------------------------------------------
// Predictors
// Predictor Transform
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
uint32_t c2) {
@ -118,7 +120,7 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
}
//------------------------------------------------------------------------------
// Colorspace conversion functions
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
const __m128i mask = _mm_set1_epi32(0x0000ff00);
@ -152,6 +154,65 @@ static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
}
//------------------------------------------------------------------------------
// Color Transform
// Per-lane equivalent of the scalar ColorTransformDelta(): every 32-bit lane
// of 'color_pred' and 'color' is expected to carry an 8-bit value in its low
// byte. The returned lanes hold the signed 8x8-bit product shifted right by
// 5; callers keep only the low 8 bits of the result.
static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
                                               __m128i color) {
  // SSE2 has no signed 8-bit multiply, so it is emulated:
  //  1. move both operands into the upper byte of a 16-bit word (<< 8), which
  //     turns them into correctly signed 16-bit values scaled by 256;
  //  2. take the high 16 bits of the signed 16x16 product, which cancels the
  //     two scalings and leaves pred * color.
  const __m128i pred_hi = _mm_slli_epi32(color_pred, 8);
  const __m128i color_hi = _mm_slli_epi32(color, 8);
  // The multiply works on 8 packed 16-bit words; the upper word of each
  // 32-bit lane is zero on both sides and therefore stays zero.
  const __m128i product = _mm_mulhi_epi16(pred_hi, color_hi);
  return _mm_srli_epi32(product, 5);
}
// SSE2 inverse cross-color transform, applied in place to 'num_pixels'
// pixels of 'argb_data'. Pixels are processed four at a time with unaligned
// loads/stores; the remaining (num_pixels % 4) pixels are handed to
// VP8LTransformColorInverse_C(). For every pixel:
//   red  += delta(green_to_red_, green)
//   blue += delta(green_to_blue_, green) + delta(red_to_blue_, new red)
// Alpha and green are passed through unchanged.
static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
                                              uint32_t* argb_data,
                                              int num_pixels) {
  // Multipliers, replicated into the low byte of every 32-bit lane.
  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);
  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
  // Channel masks. They do not depend on the pixel data, so they are built
  // once here instead of on every loop iteration. The explicit (int) cast
  // avoids an implicit unsigned-to-signed conversion of 0xff00ff00.
  const __m128i alpha_green_mask = _mm_set1_epi32((int)0xff00ff00u);
  const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
  const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
  const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&argb_data[i]);
    const __m128i ag = _mm_and_si128(in, alpha_green_mask);  // alpha, green
    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
    // Blue is taken as the whole pixel: the bytes above it are removed by
    // 'lower_8bit_mask' after the addition.
    const __m128i b = in;
    // Red.
    const __m128i r_delta = ColorTransformDelta(g_to_r, g);
    const __m128i r_new =
        _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
    // Blue: uses the restored red (r_new), as the C version does.
    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);
    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
    const __m128i b_new =
        _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);
    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // Fall-back to C-version for left-overs.
  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}
//------------------------------------------------------------------------------
// Color-space conversion functions
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i* in = (const __m128i*)src;
@ -298,6 +359,8 @@ void VP8LDspInitSSE2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;