diff --git a/src/dsp/alpha_processing.c b/src/dsp/alpha_processing.c index 176135ab..85a7e620 100644 --- a/src/dsp/alpha_processing.c +++ b/src/dsp/alpha_processing.c @@ -303,9 +303,28 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride, return (alpha_mask != 0xff); } +static int ExtractAlpha(const uint8_t* argb, int argb_stride, + int width, int height, + uint8_t* alpha, int alpha_stride) { + uint8_t alpha_mask = 0xff; + int i, j; + + for (j = 0; j < height; ++j) { + for (i = 0; i < width; ++i) { + const uint8_t alpha_value = argb[4 * i]; + alpha[i] = alpha_value; + alpha_mask &= alpha_value; + } + argb += argb_stride; + alpha += alpha_stride; + } + return (alpha_mask == 0xff); +} + void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int); void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int); int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int); +int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int); //------------------------------------------------------------------------------ // Init function @@ -320,6 +339,7 @@ void WebPInitAlphaProcessing(void) { WebPApplyAlphaMultiply = ApplyAlphaMultiply; WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b; WebPDispatchAlpha = DispatchAlpha; + WebPExtractAlpha = ExtractAlpha; // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { diff --git a/src/dsp/alpha_processing_sse2.c b/src/dsp/alpha_processing_sse2.c index 5d6dcaf4..d34852c5 100644 --- a/src/dsp/alpha_processing_sse2.c +++ b/src/dsp/alpha_processing_sse2.c @@ -72,6 +72,51 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride, return (alpha_and != 0xff); } +static int ExtractAlpha(const uint8_t* argb, int argb_stride, + int width, int height, + uint8_t* alpha, int alpha_stride) { + // alpha_and stores an 'and' operation of all the alpha[] values. The final + // value is not 0xff if any of the alpha[] is not equal to 0xff. 
+ uint32_t alpha_and = 0xff; + int i, j; + const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha + const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); + __m128i all_alphas = all_0xff; + + // We must be able to access 3 extra bytes after the last read byte + // 'src[4 * width - 4]', because we don't know if alpha is the first or the + // last byte of the quadruplet. + const int limit = (width - 1) & ~7; + + for (j = 0; j < height; ++j) { + const __m128i* src = (const __m128i*)argb; + for (i = 0; i < limit; i += 8) { + // load 32 argb bytes + const __m128i a0 = _mm_loadu_si128(src + 0); + const __m128i a1 = _mm_loadu_si128(src + 1); + const __m128i b0 = _mm_and_si128(a0, a_mask); + const __m128i b1 = _mm_and_si128(a1, a_mask); + const __m128i c0 = _mm_packs_epi32(b0, b1); + const __m128i d0 = _mm_packus_epi16(c0, c0); + // store the eight packed alpha bytes + _mm_storel_epi64((__m128i*)&alpha[i], d0); + // accumulate eight alpha 'and' in parallel + all_alphas = _mm_and_si128(all_alphas, d0); + src += 2; + } + for (; i < width; ++i) { + const uint32_t alpha_value = argb[4 * i]; + alpha[i] = alpha_value; + alpha_and &= alpha_value; + } + argb += argb_stride; + alpha += alpha_stride; + } + // Combine the eight alpha 'and' into an 8-bit mask. 
+ alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff)); + return (alpha_and == 0xff); +} + //------------------------------------------------------------------------------ // Non-dither premultiplied modes @@ -219,5 +264,6 @@ void WebPInitAlphaProcessingSSE2(void) { WebPMultRow = MultRow; WebPApplyAlphaMultiply = ApplyAlphaMultiply; WebPDispatchAlpha = DispatchAlpha; + WebPExtractAlpha = ExtractAlpha; #endif } diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index a50f0539..c75cf04a 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -268,13 +268,19 @@ extern void (*WebPApplyAlphaMultiply)( extern void (*WebPApplyAlphaMultiply4444)( uint8_t* rgba4444, int w, int h, int stride); - // Dispatch the values from alpha[] plane to the ARGB destination 'dst'. // Returns true if alpha[] plane has non-trivial values different from 0xff. extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride, int width, int height, uint8_t* dst, int dst_stride); +// Extract the alpha values from 32b values in argb[] and pack them into alpha[] +// (this is the opposite of WebPDispatchAlpha). +// Returns true if there are only trivial 0xff alpha values. +extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride, + int width, int height, + uint8_t* alpha, int alpha_stride); + // Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B). // Un-Multiply operation transforms x into x * 255 / A.