introduce WEBP_REFERENCE_IMPLEMENTATION compile option

This flag makes the code use no uint64, no asm, and no fancy tricks, aiming instead to be as simple and straightforward as possible. Its main use is to help emscripten generate proper JS code. More code needs to be simplified later.

Also: tune the BITS values to be 24 and make use of WEBP_RIGHT_JUSTIFY.

Here are the typical timings for decoding a large image:

ARM7-a:
dwebp_justify_32_neon Time to decode picture: 3.280s
dwebp_justify_24_neon Time to decode picture: 2.640s
dwebp_justify_16_neon Time to decode picture: 2.723s
dwebp_justify_8_neon Time to decode picture: 2.802s
dwebp_justify_32 Time to decode picture: 4.264s
dwebp_justify_24 Time to decode picture: 3.696s
dwebp_justify_16 Time to decode picture: 3.779s
dwebp_justify_8 Time to decode picture: 3.834s
dwebp_32_neon Time to decode picture: 4.010s
dwebp_24_neon Time to decode picture: 2.725s
dwebp_16_neon Time to decode picture: 2.852s
dwebp_8_neon Time to decode picture: 2.778s
dwebp_32 Time to decode picture: 4.587s
dwebp_24 Time to decode picture: 3.800s
dwebp_16 Time to decode picture: 3.902s
dwebp_8 Time to decode picture: 3.815s
REFERENCE (HEAD) Time to decode picture: 3.818s

x86_64:
dwebp_justify_32 Time to decode picture: 0.473s
dwebp_justify_24 Time to decode picture: 0.434s
dwebp_justify_16 Time to decode picture: 0.450s
dwebp_justify_8 Time to decode picture: 0.467s
dwebp_32 Time to decode picture: 0.474s
dwebp_24 Time to decode picture: 0.468s
dwebp_16 Time to decode picture: 0.468s
dwebp_8 Time to decode picture: 0.481s
REFERENCE (HEAD) Time to decode picture: 0.436s

i386:
dwebp_justify_32 Time to decode picture: 0.723s
dwebp_justify_24 Time to decode picture: 0.618s
dwebp_justify_16 Time to decode picture: 0.626s
dwebp_justify_8 Time to decode picture: 0.651s
dwebp_32 Time to decode picture: 0.744s
dwebp_24 Time to decode picture: 0.627s
dwebp_16 Time to decode picture: 0.642s
dwebp_8 Time to decode picture: 0.642s

Change-Id: Ie56c7235733a24f94fbfc2e4351aae36ec39c225
2025-10-28 09:03:02 +01:00 · 2013-02-14 15:46:12 +01:00
parent 3383885799
commit b7490f8553
3 changed files with 37 additions and 16 deletions
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -426,11 +426,16 @@ static void HE8uv(uint8_t *dst) {    // horizontal
 }
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint64_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
  int j;
 #ifndef WEBP_REFERENCE_IMPLEMENTATION
  const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
  for (j = 0; j < 8; ++j) {
    *(uint64_t*)(dst + j * BPS) = v;
  }
 #else
  for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
 #endif
 }
 static void DC8uv(uint8_t *dst) {     // DC
@@ -439,7 +444,7 @@ static void DC8uv(uint8_t *dst) {     // DC
  for (i = 0; i < 8; ++i) {
    dc0 += dst[i - BPS] + dst[-1 + i * BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 4, dst);
 }
 static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
@@ -448,7 +453,7 @@ static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
  for (i = 0; i < 8; ++i) {
    dc0 += dst[i - BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }
 static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
@@ -457,11 +462,11 @@ static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
  for (i = 0; i < 8; ++i) {
    dc0 += dst[-1 + i * BPS];
  }
-  Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst);
+  Put8x8uv(dc0 >> 3, dst);
 }
 static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
-  Put8x8uv(0x8080808080808080ULL, dst);
+  Put8x8uv(0x80, dst);
 }
 //------------------------------------------------------------------------------
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -1079,20 +1079,27 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
    const uint32_t* const src_end = src + num_pixels;
    while (src < src_end) {
      uint32_t argb = *src++;
 #if !defined(WEBP_REFERENCE_IMPLEMENTATION)
 #if !defined(__BIG_ENDIAN__) && (defined(__i386__) || defined(__x86_64__))
      __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
      *(uint32_t*)dst = argb;
      dst += sizeof(argb);
 #elif !defined(__BIG_ENDIAN__) && defined(_MSC_VER)
      argb = _byteswap_ulong(argb);
      *(uint32_t*)dst = argb;
      dst += sizeof(argb);
 #else
-      *dst++ = (argb >> 24) & 0xff;
+      dst[0] = (argb >> 24) & 0xff;
-      *dst++ = (argb >> 16) & 0xff;
+      dst[1] = (argb >> 16) & 0xff;
-      *dst++ = (argb >>  8) & 0xff;
+      dst[2] = (argb >>  8) & 0xff;
-      *dst++ = (argb >>  0) & 0xff;
+      dst[3] = (argb >>  0) & 0xff;
 #endif
 #else   // WEBP_REFERENCE_IMPLEMENTATION
      dst[0] = (argb >> 24) & 0xff;
      dst[1] = (argb >> 16) & 0xff;
      dst[2] = (argb >>  8) & 0xff;
      dst[3] = (argb >>  0) & 0xff;
 #endif
      dst += sizeof(argb);
    }
  } else {
    memcpy(dst, src, num_pixels * sizeof(*src));
--- a/src/utils/bit_reader.h
+++ b/src/utils/bit_reader.h
@@ -56,20 +56,29 @@ extern "C" {
 // -> we're back to eight active 'value_' bits (marked 'v') and BITS cached
 // bits (marked 'B')
 //
-// The right-justify strategy tends to use less shifts, so let's use it:
+// The right-justify strategy tends to use fewer shifts and is often faster.
 #define USE_RIGHT_JUSTIFY
 //------------------------------------------------------------------------------
 // BITS can be either 32, 24, 16 or 8.
 // Pick values that fit natural register size.
 #if !defined(WEBP_REFERENCE_IMPLEMENTATION)
 #define USE_RIGHT_JUSTIFY
 #if defined(__i386__) || defined(_M_IX86)      // x86 32bit
 #define BITS 16
 #elif defined(__arm__) || defined(_M_ARM)     // ARM
-#define BITS 8
+#define BITS 24
 #else                      // reasonable default
-#define BITS 32
+#define BITS 24
 #endif
 #else     // reference choices
 #define USE_RIGHT_JUSTIFY
 #define BITS 8
 #endif
 //------------------------------------------------------------------------------