add WebPDispatchAlphaToGreen() to dsp

SSE2 version is 2.1x faster

This is used to transfer the alpha plane to the green channel before lossless compression.

Change-Id: I01d9df0051c183b1ff5d6eb69961d4f43e33141a
This commit is contained in:
Pascal Massimino 2014-10-06 23:15:44 +02:00
parent 1bd4c2ad23
commit 2d9b0a4472
4 changed files with 51 additions and 13 deletions

View File

@ -303,6 +303,19 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xff); return (alpha_mask != 0xff);
} }
// Plain-C version: copies each 8-bit alpha[] sample into bits 8..15 (the green
// channel) of the matching 32-bit dst[] sample; all other bits are written as
// zero. 'alpha_stride' advances alpha[] (in bytes) and 'dst_stride' advances
// dst[] (in uint32_t units) from one row to the next.
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
                                 int width, int height,
                                 uint32_t* dst, int dst_stride) {
  int rows_left;
  for (rows_left = height; rows_left > 0; --rows_left) {
    int x;
    for (x = 0; x < width; ++x) {
      // Only the green byte is populated; A/R/B stay zero.
      dst[x] = (uint32_t)alpha[x] << 8;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
}
static int ExtractAlpha(const uint8_t* argb, int argb_stride, static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height, int width, int height,
uint8_t* alpha, int alpha_stride) { uint8_t* alpha, int alpha_stride) {
@ -324,6 +337,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int); void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int); void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int); int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
// Alpha-to-green transfer entry point; WebPInitAlphaProcessing() points it at
// an implementation.
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int); int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
@ -339,6 +353,7 @@ void WebPInitAlphaProcessing(void) {
WebPApplyAlphaMultiply = ApplyAlphaMultiply; WebPApplyAlphaMultiply = ApplyAlphaMultiply;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b; WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
WebPDispatchAlpha = DispatchAlpha; WebPDispatchAlpha = DispatchAlpha;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
WebPExtractAlpha = ExtractAlpha; WebPExtractAlpha = ExtractAlpha;
// If defined, use CPUInfo() to overwrite some pointers with faster versions. // If defined, use CPUInfo() to overwrite some pointers with faster versions.

View File

@ -72,6 +72,32 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
return (alpha_and != 0xff); return (alpha_and != 0xff);
} }
// SSE2 version: copies each 8-bit alpha[] sample into bits 8..15 (the green
// channel) of the matching 32-bit dst[] sample, with all other bits zero.
// Rows are handled 16 samples at a time; the remaining (width % 16) samples
// of each row go through a scalar loop. 'alpha_stride' is in bytes,
// 'dst_stride' in uint32_t units.
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
                                 int width, int height,
                                 uint32_t* dst, int dst_stride) {
  const __m128i kZero = _mm_setzero_si128();
  const int simd_width = width & ~15;  // width rounded down to a multiple of 16
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < simd_width; x += 16) {
      // Load 16 consecutive alpha bytes (no alignment requirement).
      const __m128i in = _mm_loadu_si128((const __m128i*)(alpha + x));
      // Interleave with 'kZero' as the FIRST operand, so that each alpha byte
      // ends up in the high byte of a 16-bit lane, i.e. lane = alpha << 8.
      const __m128i lo16 = _mm_unpacklo_epi8(kZero, in);   // samples 0..7
      const __m128i hi16 = _mm_unpackhi_epi8(kZero, in);   // samples 8..15
      // Widen the 16-bit lanes to 32 bits by interleaving zeros on top.
      const __m128i out0 = _mm_unpacklo_epi16(lo16, kZero);  // samples 0..3
      const __m128i out1 = _mm_unpackhi_epi16(lo16, kZero);  // samples 4..7
      const __m128i out2 = _mm_unpacklo_epi16(hi16, kZero);  // samples 8..11
      const __m128i out3 = _mm_unpackhi_epi16(hi16, kZero);  // samples 12..15
      _mm_storeu_si128((__m128i*)(dst + x +  0), out0);
      _mm_storeu_si128((__m128i*)(dst + x +  4), out1);
      _mm_storeu_si128((__m128i*)(dst + x +  8), out2);
      _mm_storeu_si128((__m128i*)(dst + x + 12), out3);
    }
    // Scalar tail for the samples that don't fill a whole 16-wide group.
    while (x < width) {
      dst[x] = (uint32_t)alpha[x] << 8;
      ++x;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
}
static int ExtractAlpha(const uint8_t* argb, int argb_stride, static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height, int width, int height,
uint8_t* alpha, int alpha_stride) { uint8_t* alpha, int alpha_stride) {
@ -264,6 +290,7 @@ void WebPInitAlphaProcessingSSE2(void) {
WebPMultRow = MultRow; WebPMultRow = MultRow;
WebPApplyAlphaMultiply = ApplyAlphaMultiply; WebPApplyAlphaMultiply = ApplyAlphaMultiply;
WebPDispatchAlpha = DispatchAlpha; WebPDispatchAlpha = DispatchAlpha;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
WebPExtractAlpha = ExtractAlpha; WebPExtractAlpha = ExtractAlpha;
#endif #endif
} }

View File

@ -274,6 +274,12 @@ extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
int width, int height, int width, int height,
uint8_t* dst, int dst_stride); uint8_t* dst, int dst_stride);
// Transfers the packed 8-bit alpha[] values to the green channel of the 32-bit
// dst[] samples, zeroing the A/R/B channels. The area covered is 'width' x
// 'height' samples. 'alpha_stride' is the stride for alpha[] in bytes and
// 'dst_stride' is the stride for dst[] in uint32_t units.
extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride);
// Extract the alpha values from 32b values in argb[] and pack them into alpha[] // Extract the alpha values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlpha). // (this is the opposite of WebPDispatchAlpha).
// Returns true if there's only trivial 0xff alpha values. // Returns true if there's only trivial 0xff alpha values.

View File

@ -61,18 +61,8 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
if (!WebPPictureAlloc(&picture)) return 0; if (!WebPPictureAlloc(&picture)) return 0;
// Transfer the alpha values to the green channel. // Transfer the alpha values to the green channel.
{ WebPDispatchAlphaToGreen(data, width, picture.width, picture.height,
int i, j; picture.argb, picture.argb_stride);
uint32_t* dst = picture.argb;
const uint8_t* src = data;
for (j = 0; j < picture.height; ++j) {
for (i = 0; i < picture.width; ++i) {
dst[i] = src[i] << 8; // we leave A/R/B channels zero'd.
}
src += width;
dst += picture.argb_stride;
}
}
WebPConfigInit(&config); WebPConfigInit(&config);
config.lossless = 1; config.lossless = 1;
@ -376,6 +366,7 @@ static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
} }
void VP8EncInitAlpha(VP8Encoder* const enc) { void VP8EncInitAlpha(VP8Encoder* const enc) {
WebPInitAlphaProcessing();
enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_); enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
enc->alpha_data_ = NULL; enc->alpha_data_ = NULL;
enc->alpha_data_size_ = 0; enc->alpha_data_size_ = 0;
@ -430,4 +421,3 @@ int VP8EncDeleteAlpha(VP8Encoder* const enc) {
enc->has_alpha_ = 0; enc->has_alpha_ = 0;
return ok; return ok;
} }