Remove memcpy in lossless decoding.

Change-Id: Iba694b306486d67764e2fc5576c98a974c9b886c
This commit is contained in:
Vincent Rabaud 2016-11-24 17:45:22 +01:00
parent 7474d46e45
commit 71e2f5cadf
8 changed files with 151 additions and 112 deletions

View File

@ -712,13 +712,15 @@ static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
uint32_t* const rows_out = dec->argb_cache_; uint32_t* const rows_out = dec->argb_cache_;
// Inverse transforms. // Inverse transforms.
// TODO: most transforms only need to operate on the cropped region only.
memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
while (n-- > 0) { while (n-- > 0) {
VP8LTransform* const transform = &dec->transforms_[n]; VP8LTransform* const transform = &dec->transforms_[n];
VP8LInverseTransform(transform, start_row, end_row, rows_in, rows_out); VP8LInverseTransform(transform, start_row, end_row, rows_in, rows_out);
rows_in = rows_out; rows_in = rows_out;
} }
if (rows_in != rows_out) {
// No transform called, hence just copy.
memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
}
} }
// Processes (transforms, scales & color-converts) the rows decoded after the // Processes (transforms, scales & color-converts) the rows decoded after the

View File

@ -234,15 +234,16 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
//   src        - input pixels, one 32-bit value per pixel; green is taken
//                from bits 8..15, red from bits 16..23, blue from bits 0..7.
//   num_pixels - number of pixels to process; nothing is written when <= 0.
//   dst        - output pixels; bits 24..31 and 8..15 are copied unchanged.
// Element i of 'src' is read before element i of 'dst' is written, so the
// call also works in place (dst == src).
void VP8LAddGreenToBlueAndRed_C(const uint32_t* const src, int num_pixels,
                                uint32_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t argb = src[i];
    const uint32_t green = ((argb >> 8) & 0xff);
    uint32_t red_blue = (argb & 0x00ff00ffu);
    // Both 8-bit lanes are updated with a single addition; the mask that
    // follows drops whatever carried out of either lane (modulo-256 add).
    red_blue += (green << 16) | green;
    red_blue &= 0x00ff00ffu;
    dst[i] = (argb & 0xff00ff00u) | red_blue;
  }
}
@ -258,11 +259,12 @@ static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
m->red_to_blue_ = (color_code >> 16) & 0xff; m->red_to_blue_ = (color_code >> 16) & 0xff;
} }
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data, void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
int num_pixels) { const uint32_t* const src, int num_pixels,
uint32_t* const dst) {
int i; int i;
for (i = 0; i < num_pixels; ++i) { for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = data[i]; const uint32_t argb = src[i];
const uint32_t green = argb >> 8; const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16; const uint32_t red = argb >> 16;
int new_red = red; int new_red = red;
@ -272,13 +274,14 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
new_blue += ColorTransformDelta(m->green_to_blue_, green); new_blue += ColorTransformDelta(m->green_to_blue_, green);
new_blue += ColorTransformDelta(m->red_to_blue_, new_red); new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
new_blue &= 0xff; new_blue &= 0xff;
data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue); dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
} }
} }
// Color space inverse transform. // Color space inverse transform.
static void ColorSpaceInverseTransform(const VP8LTransform* const transform, static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
int y_start, int y_end, uint32_t* data) { int y_start, int y_end,
const uint32_t* src, uint32_t* dst) {
const int width = transform->xsize_; const int width = transform->xsize_;
const int tile_width = 1 << transform->bits_; const int tile_width = 1 << transform->bits_;
const int mask = tile_width - 1; const int mask = tile_width - 1;
@ -292,17 +295,19 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
while (y < y_end) { while (y < y_end) {
const uint32_t* pred = pred_row; const uint32_t* pred = pred_row;
VP8LMultipliers m = { 0, 0, 0 }; VP8LMultipliers m = { 0, 0, 0 };
const uint32_t* const data_safe_end = data + safe_width; const uint32_t* const src_safe_end = src + safe_width;
const uint32_t* const data_end = data + width; const uint32_t* const src_end = src + width;
while (data < data_safe_end) { while (src < src_safe_end) {
ColorCodeToMultipliers(*pred++, &m); ColorCodeToMultipliers(*pred++, &m);
VP8LTransformColorInverse(&m, data, tile_width); VP8LTransformColorInverse(&m, src, tile_width, dst);
data += tile_width; src += tile_width;
dst += tile_width;
} }
if (data < data_end) { // Left-overs using C-version. if (src < src_end) { // Left-overs using C-version.
ColorCodeToMultipliers(*pred++, &m); ColorCodeToMultipliers(*pred++, &m);
VP8LTransformColorInverse(&m, data, remaining_width); VP8LTransformColorInverse(&m, src, remaining_width, dst);
data += remaining_width; src += remaining_width;
dst += remaining_width;
} }
++y; ++y;
if ((y & mask) == 0) pred_row += tiles_per_row; if ((y & mask) == 0) pred_row += tiles_per_row;
@ -367,9 +372,13 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
assert(row_end <= transform->ysize_); assert(row_end <= transform->ysize_);
switch (transform->type_) { switch (transform->type_) {
case SUBTRACT_GREEN: case SUBTRACT_GREEN:
VP8LAddGreenToBlueAndRed(out, (row_end - row_start) * width); VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
break; break;
case PREDICTOR_TRANSFORM: case PREDICTOR_TRANSFORM:
// TODO(vrabaud): parallelize transform predictors.
if (in != out) {
memcpy(out, in, (row_end - row_start) * width * sizeof(*out));
}
PredictorInverseTransform(transform, row_start, row_end, out); PredictorInverseTransform(transform, row_start, row_end, out);
if (row_end != transform->ysize_) { if (row_end != transform->ysize_) {
// The last predicted row in this iteration will be the top-pred row // The last predicted row in this iteration will be the top-pred row
@ -379,7 +388,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
} }
break; break;
case CROSS_COLOR_TRANSFORM: case CROSS_COLOR_TRANSFORM:
ColorSpaceInverseTransform(transform, row_start, row_end, out); ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
break; break;
case COLOR_INDEXING_TRANSFORM: case COLOR_INDEXING_TRANSFORM:
if (in == out && transform->bits_ > 0) { if (in == out && transform->bits_ > 0) {
@ -556,10 +565,10 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed; VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
VP8LPredictorFunc VP8LPredictors[16]; VP8LPredictorFunc VP8LPredictors[16];
VP8LTransformColorFunc VP8LTransformColorInverse; VP8LTransformColorInverseFunc VP8LTransformColorInverse;
VP8LConvertFunc VP8LConvertBGRAToRGB; VP8LConvertFunc VP8LConvertBGRAToRGB;
VP8LConvertFunc VP8LConvertBGRAToRGBA; VP8LConvertFunc VP8LConvertBGRAToRGBA;

View File

@ -35,8 +35,9 @@ extern "C" {
typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top); typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
extern VP8LPredictorFunc VP8LPredictors[16]; extern VP8LPredictorFunc VP8LPredictors[16];
// Decoder-side 'add green to blue and red' entry point: reads 'num_pixels'
// pixels from 'src' and writes the transformed pixels to 'dst'.
typedef void (*VP8LProcessDecBlueAndRedFunc)(const uint32_t* src,
                                             int num_pixels, uint32_t* dst);
extern VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
typedef struct { typedef struct {
// Note: the members are uint8_t, so that any negative values are // Note: the members are uint8_t, so that any negative values are
@ -45,9 +46,10 @@ typedef struct {
uint8_t green_to_blue_; uint8_t green_to_blue_;
uint8_t red_to_blue_; uint8_t red_to_blue_;
} VP8LMultipliers; } VP8LMultipliers;
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m, typedef void (*VP8LTransformColorInverseFunc)(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels); const uint32_t* src,
extern VP8LTransformColorFunc VP8LTransformColorInverse; int num_pixels, uint32_t* dst);
extern VP8LTransformColorInverseFunc VP8LTransformColorInverse;
struct VP8LTransform; // Defined in dec/vp8li.h. struct VP8LTransform; // Defined in dec/vp8li.h.
@ -93,7 +95,8 @@ void VP8LColorIndexInverseTransformAlpha(
// Expose some C-only fallback functions // Expose some C-only fallback functions
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels); const uint32_t* src, int num_pixels,
uint32_t* dst);
void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst); void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst); void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
@ -102,7 +105,8 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
void VP8LConvertBGRAToRGB565_C(const uint32_t* src, void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
int num_pixels, uint8_t* dst); int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst); void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels); void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
uint32_t* dst);
// Must be called before calling any of the above methods. // Must be called before calling any of the above methods.
void VP8LDspInit(void); void VP8LDspInit(void);
@ -110,7 +114,10 @@ void VP8LDspInit(void);
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Encoding // Encoding
extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed; typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
uint32_t* const dst, int num_pixels);
extern VP8LTransformColorFunc VP8LTransformColor; extern VP8LTransformColorFunc VP8LTransformColor;
typedef void (*VP8LCollectColorBlueTransformsFunc)( typedef void (*VP8LCollectColorBlueTransformsFunc)(
const uint32_t* argb, int stride, const uint32_t* argb, int stride,

View File

@ -665,7 +665,7 @@ static void HistogramAdd(const VP8LHistogram* const a,
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed; VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
VP8LTransformColorFunc VP8LTransformColor; VP8LTransformColorFunc VP8LTransformColor;

View File

@ -228,25 +228,27 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
// Add green to blue and red channels (i.e. perform the inverse transform of // Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green'). // 'subtract green').
static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) { static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
uint32_t* dst) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
uint32_t* const p_loop1_end = data + (num_pixels & ~3); const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
uint32_t* const p_loop2_end = data + num_pixels; const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile ( __asm__ volatile (
".set push \n\t" ".set push \n\t"
".set noreorder \n\t" ".set noreorder \n\t"
"beq %[data], %[p_loop1_end], 3f \n\t" "beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t" " nop \n\t"
"0: \n\t" "0: \n\t"
"lw %[temp0], 0(%[data]) \n\t" "lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[data]) \n\t" "lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[data]) \n\t" "lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[data]) \n\t" "lw %[temp3], 12(%[src]) \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t" "ext %[temp4], %[temp0], 8, 8 \n\t"
"ext %[temp5], %[temp1], 8, 8 \n\t" "ext %[temp5], %[temp1], 8, 8 \n\t"
"ext %[temp6], %[temp2], 8, 8 \n\t" "ext %[temp6], %[temp2], 8, 8 \n\t"
"ext %[temp7], %[temp3], 8, 8 \n\t" "ext %[temp7], %[temp3], 8, 8 \n\t"
"addiu %[data], %[data], 16 \n\t" "addiu %[src], %[src], 16 \n\t"
"addiu %[dst], %[dst], 16 \n\t"
"replv.ph %[temp4], %[temp4] \n\t" "replv.ph %[temp4], %[temp4] \n\t"
"replv.ph %[temp5], %[temp5] \n\t" "replv.ph %[temp5], %[temp5] \n\t"
"replv.ph %[temp6], %[temp6] \n\t" "replv.ph %[temp6], %[temp6] \n\t"
@ -255,44 +257,47 @@ static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
"addu.qb %[temp1], %[temp1], %[temp5] \n\t" "addu.qb %[temp1], %[temp1], %[temp5] \n\t"
"addu.qb %[temp2], %[temp2], %[temp6] \n\t" "addu.qb %[temp2], %[temp2], %[temp6] \n\t"
"addu.qb %[temp3], %[temp3], %[temp7] \n\t" "addu.qb %[temp3], %[temp3], %[temp7] \n\t"
"sw %[temp0], -16(%[data]) \n\t" "sw %[temp0], -16(%[dst]) \n\t"
"sw %[temp1], -12(%[data]) \n\t" "sw %[temp1], -12(%[dst]) \n\t"
"sw %[temp2], -8(%[data]) \n\t" "sw %[temp2], -8(%[dst]) \n\t"
"bne %[data], %[p_loop1_end], 0b \n\t" "bne %[src], %[p_loop1_end], 0b \n\t"
" sw %[temp3], -4(%[data]) \n\t" " sw %[temp3], -4(%[dst]) \n\t"
"3: \n\t" "3: \n\t"
"beq %[data], %[p_loop2_end], 2f \n\t" "beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t" " nop \n\t"
"1: \n\t" "1: \n\t"
"lw %[temp0], 0(%[data]) \n\t" "lw %[temp0], 0(%[src]) \n\t"
"addiu %[data], %[data], 4 \n\t" "addiu %[src], %[src], 4 \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t" "ext %[temp4], %[temp0], 8, 8 \n\t"
"replv.ph %[temp4], %[temp4] \n\t" "replv.ph %[temp4], %[temp4] \n\t"
"addu.qb %[temp0], %[temp0], %[temp4] \n\t" "addu.qb %[temp0], %[temp0], %[temp4] \n\t"
"bne %[data], %[p_loop2_end], 1b \n\t" "bne %[src], %[p_loop2_end], 1b \n\t"
" sw %[temp0], -4(%[data]) \n\t" " sw %[temp0], -4(%[dst]) \n\t"
"2: \n\t" "2: \n\t"
".set pop \n\t" ".set pop \n\t"
: [data]"+&r"(data), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), : [dst]"+&r"(dst), [src]"+&r"(src), [temp0]"=&r"(temp0),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp5]"=&r"(temp5), [temp6]"=&r"(temp6), [temp7]"=&r"(temp7) [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
[temp7]"=&r"(temp7)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end) : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory" : "memory"
); );
} }
static void TransformColorInverse(const VP8LMultipliers* const m, static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels) { const uint32_t* src, int num_pixels,
uint32_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5; int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red; uint32_t argb, argb1, new_red;
const uint32_t G_to_R = m->green_to_red_; const uint32_t G_to_R = m->green_to_red_;
const uint32_t G_to_B = m->green_to_blue_; const uint32_t G_to_B = m->green_to_blue_;
const uint32_t R_to_B = m->red_to_blue_; const uint32_t R_to_B = m->red_to_blue_;
uint32_t* const p_loop_end = data + (num_pixels & ~1); const uint32_t* const p_loop_end = src + (num_pixels & ~1);
__asm__ volatile ( __asm__ volatile (
".set push \n\t" ".set push \n\t"
".set noreorder \n\t" ".set noreorder \n\t"
"beq %[data], %[p_loop_end], 1f \n\t" "beq %[src], %[p_loop_end], 1f \n\t"
" nop \n\t" " nop \n\t"
"replv.ph %[temp0], %[G_to_R] \n\t" "replv.ph %[temp0], %[G_to_R] \n\t"
"replv.ph %[temp1], %[G_to_B] \n\t" "replv.ph %[temp1], %[G_to_B] \n\t"
@ -304,9 +309,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
"shra.ph %[temp1], %[temp1], 8 \n\t" "shra.ph %[temp1], %[temp1], 8 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t" "shra.ph %[temp2], %[temp2], 8 \n\t"
"0: \n\t" "0: \n\t"
"lw %[argb], 0(%[data]) \n\t" "lw %[argb], 0(%[src]) \n\t"
"lw %[argb1], 4(%[data]) \n\t" "lw %[argb1], 4(%[src]) \n\t"
"addiu %[data], %[data], 8 \n\t" "sw %[argb], 0(%[dst]) \n\t"
"sw %[argb1], 4(%[dst]) \n\t"
"addiu %[src], %[src], 8 \n\t"
"addiu %[dst], %[dst], 8 \n\t"
"precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t" "precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
"preceu.ph.qbra %[temp3], %[temp3] \n\t" "preceu.ph.qbra %[temp3], %[temp3] \n\t"
"shll.ph %[temp3], %[temp3], 8 \n\t" "shll.ph %[temp3], %[temp3], 8 \n\t"
@ -323,29 +331,29 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
"shll.ph %[temp4], %[temp5], 8 \n\t" "shll.ph %[temp4], %[temp5], 8 \n\t"
"shra.ph %[temp4], %[temp4], 8 \n\t" "shra.ph %[temp4], %[temp4], 8 \n\t"
"mul.ph %[temp4], %[temp4], %[temp2] \n\t" "mul.ph %[temp4], %[temp4], %[temp2] \n\t"
"sb %[temp5], -2(%[data]) \n\t" "sb %[temp5], -2(%[dst]) \n\t"
"sra %[temp5], %[temp5], 16 \n\t" "sra %[temp5], %[temp5], 16 \n\t"
"shra.ph %[temp4], %[temp4], 5 \n\t" "shra.ph %[temp4], %[temp4], 5 \n\t"
"addu.ph %[argb1], %[argb1], %[temp4] \n\t" "addu.ph %[argb1], %[argb1], %[temp4] \n\t"
"preceu.ph.qbra %[temp3], %[argb1] \n\t" "preceu.ph.qbra %[temp3], %[argb1] \n\t"
"sb %[temp5], -6(%[data]) \n\t" "sb %[temp5], -6(%[dst]) \n\t"
"sb %[temp3], -4(%[data]) \n\t" "sb %[temp3], -4(%[dst]) \n\t"
"sra %[temp3], %[temp3], 16 \n\t" "sra %[temp3], %[temp3], 16 \n\t"
"bne %[data], %[p_loop_end], 0b \n\t" "bne %[src], %[p_loop_end], 0b \n\t"
" sb %[temp3], -8(%[data]) \n\t" " sb %[temp3], -8(%[dst]) \n\t"
"1: \n\t" "1: \n\t"
".set pop \n\t" ".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[new_red]"=&r"(new_red), [argb]"=&r"(argb), [new_red]"=&r"(new_red), [argb]"=&r"(argb),
[argb1]"=&r"(argb1), [data]"+&r"(data) [argb1]"=&r"(argb1), [dst]"+&r"(dst), [src]"+&r"(src)
: [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B), : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
[G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end) [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
: "memory", "hi", "lo" : "memory", "hi", "lo"
); );
// Fall-back to C-version for left-overs. // Fall-back to C-version for left-overs.
if (num_pixels & 1) VP8LTransformColorInverse_C(m, data, 1); if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
} }
static void ConvertBGRAToRGB(const uint32_t* src, static void ConvertBGRAToRGB(const uint32_t* src,

View File

@ -244,44 +244,51 @@ static void ConvertBGRAToRGB(const uint32_t* src,
} }
} }
// Adds the green byte of every pixel to its blue and red bytes, reading from
// 'src' and writing to 'dst' ('num_pixels' pixels, 4 bytes each).
// Pixels are handled 8 at a time, then 4 at a time, with the vector helper
// macros (LD_UB2, VSHF_B2_UB, ADD2, ST_UB2, ...; defined outside this block),
// and the last 0..3 pixels are finished byte by byte.
// NOTE(review): the shuffle mask indices (1, 5, 9, 13) match the green byte
// offset used by the scalar loop below; the exact semantics of the VSHF
// macros are not visible here - confirm against their definition.
static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
                                 uint32_t* dst) {
  int i;
  const uint8_t* in = (const uint8_t*)src;
  uint8_t* out = (uint8_t*)dst;
  v16u8 src0, dst0, tmp0;
  const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
                       13, 255, 13, 255 };

  while (num_pixels >= 8) {
    v16u8 src1, dst1, tmp1;
    LD_UB2(in, 16, src0, src1);
    VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
    ADD2(src0, tmp0, src1, tmp1, dst0, dst1);
    ST_UB2(dst0, dst1, out, 16);
    in += 32;
    out += 32;
    num_pixels -= 8;
  }
  if (num_pixels > 0) {
    if (num_pixels >= 4) {
      src0 = LD_UB(in);
      tmp0 = VSHF_UB(src0, src0, mask);
      dst0 = src0 + tmp0;
      ST_UB(dst0, out);
      in += 16;
      out += 16;
      num_pixels -= 4;
    }
    for (i = 0; i < num_pixels; i++) {
      // Read the whole pixel first so the loop stays correct when dst == src.
      const uint8_t b = in[0];
      const uint8_t g = in[1];
      const uint8_t r = in[2];
      const uint8_t a = in[3];
      out[0] = (b + g) & 0xff;
      out[1] = g;
      out[2] = (r + g) & 0xff;
      // Byte 3 is the remaining byte of this pixel. Index 4 would be the
      // first byte of the next pixel (out of bounds on the last one).
      out[3] = a;
      // Both cursors must move; advancing only 'out' would re-read the same
      // source pixel for every leftover.
      in += 4;
      out += 4;
    }
  }
}
static void TransformColorInverse(const VP8LMultipliers* const m, static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels) { const uint32_t* src, int num_pixels,
uint32_t* dst) {
v16u8 src0, dst0; v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ | const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16)); (m->green_to_red_ << 16));
@ -293,34 +300,36 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
while (num_pixels >= 8) { while (num_pixels >= 8) {
v16u8 src1, dst1; v16u8 src1, dst1;
LD_UB2(data, 4, src0, src1); LD_UB2(src, 4, src0, src1);
TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1); TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
ST_UB2(dst0, dst1, data, 4); ST_UB2(dst0, dst1, dst, 4);
data += 8; src += 8;
dst += 8;
num_pixels -= 8; num_pixels -= 8;
} }
if (num_pixels > 0) { if (num_pixels > 0) {
if (num_pixels >= 4) { if (num_pixels >= 4) {
src0 = LD_UB(data); src0 = LD_UB(src);
TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1); TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
ST_UB(dst0, data); ST_UB(dst0, dst);
data += 4; src += 4;
dst += 4;
num_pixels -= 4; num_pixels -= 4;
} }
if (num_pixels > 0) { if (num_pixels > 0) {
src0 = LD_UB(data); src0 = LD_UB(src);
TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1); TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
if (num_pixels == 3) { if (num_pixels == 3) {
const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2); const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
SD(pix_d, data + 0); SD(pix_d, dst + 0);
SW(pix_w, data + 2); SW(pix_w, dst + 2);
} else if (num_pixels == 2) { } else if (num_pixels == 2) {
const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
SD(pix_d, data); SD(pix_d, dst);
} else { } else {
const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0); const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
SW(pix_w, data); SW(pix_w, dst);
} }
} }
} }

View File

@ -171,28 +171,30 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
} }
#endif // USE_VTBLQ #endif // USE_VTBLQ
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
const uint32_t* const end = argb_data + (num_pixels & ~3); uint32_t* dst) {
const uint32_t* const end = src + (num_pixels & ~3);
#ifdef USE_VTBLQ #ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else #else
const uint8x8_t shuffle = vld1_u8(kGreenShuffle); const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif #endif
for (; argb_data < end; argb_data += 4) { for (; src < end; src += 4, dst += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
const uint8x16_t greens = DoGreenShuffle(argb, shuffle); const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
} }
// fallthrough and finish off with plain-C // fallthrough and finish off with plain-C
VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); VP8LAddGreenToBlueAndRed_C(src, num_pixels & 3, dst);
} }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Color Transform // Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m, static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) { const uint32_t* const src, int num_pixels,
// sign-extended multiplying constants, pre-shifted by 6. uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6) #define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = { const int16_t rb[8] = {
CST(green_to_blue_), CST(green_to_red_), CST(green_to_blue_), CST(green_to_red_),
@ -219,7 +221,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u); const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);
int i; int i;
for (i = 0; i + 4 <= num_pixels; i += 4) { for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i)); const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag); const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
// 0 g 0 g // 0 g 0 g
const uint8x16_t greens = DoGreenShuffle(in, shuffle); const uint8x16_t greens = DoGreenShuffle(in, shuffle);
@ -240,10 +242,10 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
// 0 r' 0 b'' // 0 r' 0 b''
const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8); const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0); const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
vst1q_u32(argb_data + i, out); vst1q_u32(dst + i, out);
} }
// Fall-back to C-version for left-overs. // Fall-back to C-version for left-overs.
VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i); VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
} }
#undef USE_VTBLQ #undef USE_VTBLQ

View File

@ -157,26 +157,28 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Subtract-Green Transform // Subtract-Green Transform
// Adds green to blue and red for 'num_pixels' pixels, reading from 'src' and
// writing to 'dst'. Four pixels are processed per iteration with unaligned
// 128-bit loads/stores; the leftover pixels (fewer than 4) are delegated to
// VP8LAddGreenToBlueAndRed_C.
static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
                                 uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    // Byte-wise add: green is added to the red and blue bytes with
    // wrap-around, while the alpha and green bytes receive a zero addend.
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
}
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Color Transform // Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m, static void TransformColorInverse(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) { const uint32_t* const src, int num_pixels,
// sign-extended multiplying constants, pre-shifted by 5. uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
const __m128i mults_rb = _mm_set_epi16( const __m128i mults_rb = _mm_set_epi16(
CST(green_to_red_), CST(green_to_blue_), CST(green_to_red_), CST(green_to_blue_),
@ -190,7 +192,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
int i; int i;
for (i = 0; i + 4 <= num_pixels; i += 4) { for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0 const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
@ -202,10 +204,10 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0 const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0
const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b'' const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b''
const __m128i out = _mm_or_si128(J, A); const __m128i out = _mm_or_si128(J, A);
_mm_storeu_si128((__m128i*)&argb_data[i], out); _mm_storeu_si128((__m128i*)&dst[i], out);
} }
// Fall-back to C-version for left-overs. // Fall-back to C-version for left-overs.
VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i); VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
} }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------