From a59562283fcbeedbb040bc9e3961d38965066540 Mon Sep 17 00:00:00 2001 From: Djordje Pesut Date: Thu, 28 Aug 2014 16:53:33 +0200 Subject: [PATCH] added C-level optimization for DecodeAlphaData function Copies with short distances of 1,2 and 4 are specialized. up to 10-14% faster alpha decoding. Change-Id: I9708e98193910bfaf8ef43091f3fdea73b63896d --- src/dec/vp8l.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/src/dec/vp8l.c b/src/dec/vp8l.c index 88d5b07d..e97f2383 100644 --- a/src/dec/vp8l.c +++ b/src/dec/vp8l.c @@ -745,6 +745,39 @@ static void ExtractPalettedAlphaRows(VP8LDecoder* const dec, int row) { dec->last_row_ = dec->last_out_row_ = row; } +// cyclic rotation of pattern word +#if defined(WORDS_BIGENDIAN) +#define ROTATE8b(V) do { \ + (V) = (((V) & 0xff000000u) >> 24) | ((V) << 8); \ +} while (0) +#else +#define ROTATE8b(V) do { \ + (V) = (((V) & 0xffu) << 24) | ((V) >> 8); \ +} while (0) +#endif + +// copy 1, 2 or 4-bytes pattern +#define COPY_SMALL_PATTERN() do { \ + int ilength = length; \ + uint32_t* pdata; \ + int j = 0; \ + while ((uintptr_t)pdata1 & 3) { \ + *pdata1++ = pdata2[j]; \ + ROTATE8b(temp1); \ + ++j; \ + } \ + ilength -= j; \ + pdata = (uint32_t*)pdata1; \ + for (i = 0; i < (ilength >> 2); ++i) { \ + pdata[i] = temp1; \ + } \ + pdata1 = (uint8_t*)pdata; \ + pdata2 += j; \ + for (i <<= 2; i < ilength; ++i) { \ + pdata1[i] = pdata2[i]; \ + } \ +} while (0) + static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data, int width, int height, int last_row) { int ok = 1; @@ -791,8 +824,41 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data, dist_code = GetCopyDistance(dist_symbol, br); dist = PlaneCodeToDistance(width, dist_code); if (pos >= dist && end - pos >= length) { + uint8_t* pdata1 = data + pos; + const uint8_t* pdata2 = pdata1 - dist; int i; - for (i = 0; i < length; ++i) data[pos + i] = data[pos + i - dist]; + if (length >= 8) { + uint32_t temp1; + switch (dist) { + case 1: + temp1 = pdata1[-1]; +#if defined(__arm__) || defined(_M_ARM) // arm doesn't like multiply that much + temp1 |= temp1 << 8; + temp1 |= temp1 << 16; +#else + temp1 = 0x01010101u * temp1; +#endif + break; + case 2: + temp1 = ((uint16_t*)pdata1)[-1]; +#if defined(__arm__) || defined(_M_ARM) + temp1 |= temp1 << 16; +#else + temp1 = 0x00010001u * temp1; +#endif + break; + case 4: + temp1 = ((uint32_t*)pdata1)[-1]; + break; + default: + goto Copy; + break; + } + COPY_SMALL_PATTERN(); + } else { + Copy: + for (i = 0; i < length; ++i) pdata1[i] = pdata2[i]; + } } else { ok = 0; goto End; @@ -831,6 +897,9 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data, return ok; } +#undef COPY_PATTERN +#undef ROTATE8b + static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data, int width, int height, int last_row, ProcessRowsFunc process_func) {