From 2d9b0a4472b8d08d2ce0fa93ae408d65548298e5 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Mon, 6 Oct 2014 23:15:44 +0200 Subject: [PATCH] add WebPDispatchAlphaToGreen() to dsp SSE2 version is 2.1x faster This is used to transfer the alpha plane to green channel before lossless compression. Change-Id: I01d9df0051c183b1ff5d6eb69961d4f43e33141a --- src/dsp/alpha_processing.c | 15 +++++++++++++++ src/dsp/alpha_processing_sse2.c | 27 +++++++++++++++++++++++++++ src/dsp/dsp.h | 6 ++++++ src/enc/alpha.c | 16 +++------------- 4 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/dsp/alpha_processing.c b/src/dsp/alpha_processing.c index 85a7e620..138fdb56 100644 --- a/src/dsp/alpha_processing.c +++ b/src/dsp/alpha_processing.c @@ -303,6 +303,19 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride, return (alpha_mask != 0xff); } +static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride, + int width, int height, + uint32_t* dst, int dst_stride) { + int i, j; + for (j = 0; j < height; ++j) { + for (i = 0; i < width; ++i) { + dst[i] = alpha[i] << 8; // alpha goes to bits 8..15 (green); A/R/B channels stay zero'd. 
+ } + alpha += alpha_stride; + dst += dst_stride; + } +} + static int ExtractAlpha(const uint8_t* argb, int argb_stride, int width, int height, uint8_t* alpha, int alpha_stride) { @@ -324,6 +337,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride, void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int); void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int); int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int); +void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int); int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int); //------------------------------------------------------------------------------ @@ -339,6 +353,7 @@ void WebPInitAlphaProcessing(void) { WebPApplyAlphaMultiply = ApplyAlphaMultiply; WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b; WebPDispatchAlpha = DispatchAlpha; + WebPDispatchAlphaToGreen = DispatchAlphaToGreen; WebPExtractAlpha = ExtractAlpha; // If defined, use CPUInfo() to overwrite some pointers with faster versions. diff --git a/src/dsp/alpha_processing_sse2.c b/src/dsp/alpha_processing_sse2.c index d34852c5..9e0d2d69 100644 --- a/src/dsp/alpha_processing_sse2.c +++ b/src/dsp/alpha_processing_sse2.c @@ -72,6 +72,32 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride, return (alpha_and != 0xff); } +static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride, + int width, int height, + uint32_t* dst, int dst_stride) { + int i, j; + const __m128i zero = _mm_setzero_si128(); + const int limit = width & ~15; + for (j = 0; j < height; ++j) { + for (i = 0; i < limit; i += 16) { // 16 alpha bytes per pass; leftovers handled by the scalar loop + const __m128i a0 = _mm_loadu_si128((__m128i*)&alpha[i]); // NOTE(review): cast drops const; prefer (const __m128i*) + const __m128i a1 = _mm_unpacklo_epi8(zero, a0); // 'zero' first: alpha lands in the high byte of each 16b lane 
+ const __m128i b1 = _mm_unpackhi_epi8(zero, a0); + const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero); + const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero); + const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero); + const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero); + _mm_storeu_si128((__m128i*)&dst[i + 0], a2_lo); + _mm_storeu_si128((__m128i*)&dst[i + 4], a2_hi); + _mm_storeu_si128((__m128i*)&dst[i + 8], b2_lo); + _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi); + } + for (; i < width; ++i) dst[i] = alpha[i] << 8; + alpha += alpha_stride; + dst += dst_stride; + } +} + static int ExtractAlpha(const uint8_t* argb, int argb_stride, int width, int height, uint8_t* alpha, int alpha_stride) { @@ -264,6 +290,7 @@ void WebPInitAlphaProcessingSSE2(void) { WebPMultRow = MultRow; WebPApplyAlphaMultiply = ApplyAlphaMultiply; WebPDispatchAlpha = DispatchAlpha; + WebPDispatchAlphaToGreen = DispatchAlphaToGreen; WebPExtractAlpha = ExtractAlpha; #endif } diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index c75cf04a..d94a7b7d 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -274,6 +274,12 @@ extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride, int width, int height, uint8_t* dst, int dst_stride); +// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the +// A/R/B values. 'alpha_stride' is in bytes, 'dst_stride' in uint32_t units. +extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride, + int width, int height, + uint32_t* dst, int dst_stride); + // Extract the alpha values from 32b values in argb[] and pack them into alpha[] // (this is the opposite of WebPDispatchAlpha). // Returns true if there's only trivial 0xff alpha values. 
diff --git a/src/enc/alpha.c b/src/enc/alpha.c index 79cb94db..ba6e9da7 100644 --- a/src/enc/alpha.c +++ b/src/enc/alpha.c @@ -61,18 +61,8 @@ static int EncodeLossless(const uint8_t* const data, int width, int height, if (!WebPPictureAlloc(&picture)) return 0; // Transfer the alpha values to the green channel. - { - int i, j; - uint32_t* dst = picture.argb; - const uint8_t* src = data; - for (j = 0; j < picture.height; ++j) { - for (i = 0; i < picture.width; ++i) { - dst[i] = src[i] << 8; // we leave A/R/B channels zero'd. - } - src += width; - dst += picture.argb_stride; - } - } + WebPDispatchAlphaToGreen(data, width, picture.width, picture.height, + picture.argb, picture.argb_stride); WebPConfigInit(&config); config.lossless = 1; @@ -376,6 +366,7 @@ static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) { } void VP8EncInitAlpha(VP8Encoder* const enc) { + WebPInitAlphaProcessing(); // installs the alpha dsp function pointers (incl. WebPDispatchAlphaToGreen) enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_); enc->alpha_data_ = NULL; enc->alpha_data_size_ = 0; @@ -430,4 +421,3 @@ int VP8EncDeleteAlpha(VP8Encoder* const enc) { enc->has_alpha_ = 0; return ok; } -