fix loop bug in DispatchAlpha()

* We were re-doing most of the work in plain C as 'left-over' pixels, because the SSE2 loop index counted 8-pixel blocks rather than pixels.
* We were always returning has_alpha = true because of a bad 'all_0xff' mask.

These bugs were conservative and silent, in the sense that we were 'just' doing
more work than necessary.

Now, the SSE2 version is really 2x faster than the C version.

Change-Id: I6c8132a267fe3c7a3d1fa70e7a5fcd10719543fa
This commit is contained in:
Pascal Massimino 2014-09-11 22:35:08 +02:00
parent 3101f53720
commit 690b491af1
2 changed files with 5 additions and 7 deletions

View File

@ -210,7 +210,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w, const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
num_rows, dst, buf->stride); num_rows, dst, buf->stride);
// alpha_mask is < 0xff if there's non-trivial alpha to premultiply with. // has_alpha is true if there's non-trivial alpha to premultiply with.
if (has_alpha && WebPIsPremultipliedMode(colorspace)) { if (has_alpha && WebPIsPremultipliedMode(colorspace)) {
WebPApplyAlphaMultiply(base_rgba, alpha_first, WebPApplyAlphaMultiply(base_rgba, alpha_first,
mb_w, num_rows, buf->stride); mb_w, num_rows, buf->stride);

View File

@ -27,20 +27,19 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
int i, j; int i, j;
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB
const __m128i all_0xff = _mm_set_epi32(~0u, ~0u, 0, 0); const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
__m128i all_alphas = all_0xff; __m128i all_alphas = all_0xff;
// We must be able to access 3 extra bytes after the last written byte // We must be able to access 3 extra bytes after the last written byte
// 'dst[4 * width - 4]', because we don't know if alpha is the first or the // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
// last byte of the quadruplet. // last byte of the quadruplet.
const int limit = (width - 1) >> 3; const int limit = (width - 1) & ~7;
for (j = 0; j < height; ++j) { for (j = 0; j < height; ++j) {
const uint8_t* in = alpha;
__m128i* out = (__m128i*)dst; __m128i* out = (__m128i*)dst;
for (i = 0; i < limit; ++i) { for (i = 0; i < limit; i += 8) {
// load 8 alpha bytes // load 8 alpha bytes
const __m128i a0 = _mm_loadl_epi64((__m128i*)in); // zeroes upper bytes const __m128i a0 = _mm_loadl_epi64((__m128i*)&alpha[i]);
const __m128i a1 = _mm_unpacklo_epi8(a0, zero); const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero); const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero); const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
@ -59,7 +58,6 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
// accumulate eight alpha 'and' in parallel // accumulate eight alpha 'and' in parallel
all_alphas = _mm_and_si128(all_alphas, a0); all_alphas = _mm_and_si128(all_alphas, a0);
out += 2; out += 2;
in += 8;
} }
for (; i < width; ++i) { for (; i < width; ++i) {
const uint32_t alpha_value = alpha[i]; const uint32_t alpha_value = alpha[i];