diff --git a/src/dsp/dec.c b/src/dsp/dec.c
index 9ae7b6fa..758c6a57 100644
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -426,11 +426,16 @@ static void HE8uv(uint8_t *dst) { // horizontal
 }
 
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint64_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
   int j;
+#ifndef WEBP_REFERENCE_IMPLEMENTATION
+  const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
   for (j = 0; j < 8; ++j) {
     *(uint64_t*)(dst + j * BPS) = v;
   }
+#else
+  for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
+#endif
 }
 
 static void DC8uv(uint8_t *dst) { // DC
@@ -439,7 +444,7 @@ static void DC8uv(uint8_t *dst) { // DC
   for (i = 0; i < 8; ++i) {
     dc0 += dst[i - BPS] + dst[-1 + i * BPS];
   }
-  Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 4, dst);
 }
 
 static void DC8uvNoLeft(uint8_t *dst) { // DC with no left samples
@@ -448,7 +453,7 @@ static void DC8uvNoLeft(uint8_t *dst) { // DC with no left samples
   for (i = 0; i < 8; ++i) {
     dc0 += dst[i - BPS];
   }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }
 
 static void DC8uvNoTop(uint8_t *dst) { // DC with no top samples
@@ -457,11 +462,11 @@ static void DC8uvNoTop(uint8_t *dst) { // DC with no top samples
   for (i = 0; i < 8; ++i) {
     dc0 += dst[-1 + i * BPS];
   }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }
 
 static void DC8uvNoTopLeft(uint8_t *dst) { // DC with nothing
-  Put8x8uv(0x8080808080808080ULL, dst);
+  Put8x8uv(0x80, dst);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c
index f951b897..db8f7584 100644
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -1079,20 +1079,27 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
     const uint32_t* const src_end = src + num_pixels;
     while (src < src_end) {
       uint32_t argb = *src++;
+
+#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
 #if !defined(__BIG_ENDIAN__) && (defined(__i386__) || defined(__x86_64__))
       __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
       *(uint32_t*)dst = argb;
-      dst += sizeof(argb);
 #elif !defined(__BIG_ENDIAN__) && defined(_MSC_VER)
       argb = _byteswap_ulong(argb);
       *(uint32_t*)dst = argb;
-      dst += sizeof(argb);
 #else
-      *dst++ = (argb >> 24) & 0xff;
-      *dst++ = (argb >> 16) & 0xff;
-      *dst++ = (argb >> 8) & 0xff;
-      *dst++ = (argb >> 0) & 0xff;
+      dst[0] = (argb >> 24) & 0xff;
+      dst[1] = (argb >> 16) & 0xff;
+      dst[2] = (argb >> 8) & 0xff;
+      dst[3] = (argb >> 0) & 0xff;
 #endif
+#else // WEBP_REFERENCE_IMPLEMENTATION
+      dst[0] = (argb >> 24) & 0xff;
+      dst[1] = (argb >> 16) & 0xff;
+      dst[2] = (argb >> 8) & 0xff;
+      dst[3] = (argb >> 0) & 0xff;
+#endif
+      dst += sizeof(argb);
     }
   } else {
     memcpy(dst, src, num_pixels * sizeof(*src));
diff --git a/src/utils/bit_reader.h b/src/utils/bit_reader.h
index daf271ef..f1ded6f3 100644
--- a/src/utils/bit_reader.h
+++ b/src/utils/bit_reader.h
@@ -56,20 +56,29 @@ extern "C" {
 // -> we're back to height active 'value_' bits (marked 'v') and BITS cached
 // bits (marked 'B')
 //
-// The right-justify strategy tends to use less shifts, so let's use it:
-
-#define USE_RIGHT_JUSTIFY
+// The right-justify strategy tends to use less shifts and is often faster.
 
 //------------------------------------------------------------------------------
 // BITS can be either 32, 24, 16 or 8.
 // Pick values that fit natural register size.
 
+#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
+
+#define USE_RIGHT_JUSTIFY
+
 #if defined(__i386__) || defined(_M_IX86) // x86 32bit
 #define BITS 16
 #elif defined(__arm__) || defined(_M_ARM) // ARM
-#define BITS 8
+#define BITS 24
 #else // reasonable default
-#define BITS 32
+#define BITS 24
+#endif
+
+#else // reference choices
+
+#define USE_RIGHT_JUSTIFY
+#define BITS 8
+
 #endif
 
 //------------------------------------------------------------------------------