mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-26 16:13:41 +01:00 
			
		
		
		
	introduce WEBP_REFERENCE_IMPLEMENTATION compile option
This flag makes the code use no uint64, no asm, and no fancy
tricks; instead it aims to be as simple and straightforward as
possible.
Main use is to help emscripten generate proper JS code.
More code needs to be simplified later.
Also: tune the BITS values to be 24 and make use of USE_RIGHT_JUSTIFY
Here are the typical timings for decoding a large image:
        ARM7-a:
        dwebp_justify_32_neon Time to decode picture: 3.280s
        dwebp_justify_24_neon Time to decode picture: 2.640s
        dwebp_justify_16_neon Time to decode picture: 2.723s
        dwebp_justify_8_neon Time to decode picture: 2.802s
        dwebp_justify_32 Time to decode picture: 4.264s
        dwebp_justify_24 Time to decode picture: 3.696s
        dwebp_justify_16 Time to decode picture: 3.779s
        dwebp_justify_8 Time to decode picture: 3.834s
        dwebp_32_neon Time to decode picture: 4.010s
        dwebp_24_neon Time to decode picture: 2.725s
        dwebp_16_neon Time to decode picture: 2.852s
        dwebp_8_neon Time to decode picture: 2.778s
        dwebp_32 Time to decode picture: 4.587s
        dwebp_24 Time to decode picture: 3.800s
        dwebp_16 Time to decode picture: 3.902s
        dwebp_8 Time to decode picture: 3.815s
        REFERENCE (HEAD) Time to decode picture: 3.818s
        x86_64:
        dwebp_justify_32 Time to decode picture: 0.473s
        dwebp_justify_24 Time to decode picture: 0.434s
        dwebp_justify_16 Time to decode picture: 0.450s
        dwebp_justify_8 Time to decode picture: 0.467s
        dwebp_32 Time to decode picture: 0.474s
        dwebp_24 Time to decode picture: 0.468s
        dwebp_16 Time to decode picture: 0.468s
        dwebp_8 Time to decode picture: 0.481s
        REFERENCE (HEAD) Time to decode picture: 0.436s
        i386:
        dwebp_justify_32 Time to decode picture: 0.723s
        dwebp_justify_24 Time to decode picture: 0.618s
        dwebp_justify_16 Time to decode picture: 0.626s
        dwebp_justify_8 Time to decode picture: 0.651s
        dwebp_32 Time to decode picture: 0.744s
        dwebp_24 Time to decode picture: 0.627s
        dwebp_16 Time to decode picture: 0.642s
        dwebp_8 Time to decode picture: 0.642s
Change-Id: Ie56c7235733a24f94fbfc2e4351aae36ec39c225
			
			
This commit is contained in:
		| @@ -426,11 +426,16 @@ static void HE8uv(uint8_t *dst) {    // horizontal | ||||
| } | ||||
|  | ||||
| // helper for chroma-DC predictions | ||||
| static WEBP_INLINE void Put8x8uv(uint64_t v, uint8_t* dst) { | ||||
| static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) { | ||||
|   int j; | ||||
| #ifndef WEBP_REFERENCE_IMPLEMENTATION | ||||
|   const uint64_t v = (uint64_t)value * 0x0101010101010101ULL; | ||||
|   for (j = 0; j < 8; ++j) { | ||||
|     *(uint64_t*)(dst + j * BPS) = v; | ||||
|   } | ||||
| #else | ||||
|   for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static void DC8uv(uint8_t *dst) {     // DC | ||||
| @@ -439,7 +444,7 @@ static void DC8uv(uint8_t *dst) {     // DC | ||||
|   for (i = 0; i < 8; ++i) { | ||||
|     dc0 += dst[i - BPS] + dst[-1 + i * BPS]; | ||||
|   } | ||||
|   Put8x8uv((uint64_t)((dc0 >> 4) * 0x0101010101010101ULL), dst); | ||||
|   Put8x8uv(dc0 >> 4, dst); | ||||
| } | ||||
|  | ||||
| static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples | ||||
| @@ -448,7 +453,7 @@ static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples | ||||
|   for (i = 0; i < 8; ++i) { | ||||
|     dc0 += dst[i - BPS]; | ||||
|   } | ||||
|   Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst); | ||||
|   Put8x8uv(dc0 >> 3, dst); | ||||
| } | ||||
|  | ||||
| static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples | ||||
| @@ -457,11 +462,11 @@ static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples | ||||
|   for (i = 0; i < 8; ++i) { | ||||
|     dc0 += dst[-1 + i * BPS]; | ||||
|   } | ||||
|   Put8x8uv((uint64_t)((dc0 >> 3) * 0x0101010101010101ULL), dst); | ||||
|   Put8x8uv(dc0 >> 3, dst); | ||||
| } | ||||
|  | ||||
| static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing | ||||
|   Put8x8uv(0x8080808080808080ULL, dst); | ||||
|   Put8x8uv(0x80, dst); | ||||
| } | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
|   | ||||
| @@ -1079,20 +1079,27 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst, | ||||
|     const uint32_t* const src_end = src + num_pixels; | ||||
|     while (src < src_end) { | ||||
|       uint32_t argb = *src++; | ||||
|  | ||||
| #if !defined(WEBP_REFERENCE_IMPLEMENTATION) | ||||
| #if !defined(__BIG_ENDIAN__) && (defined(__i386__) || defined(__x86_64__)) | ||||
|       __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb)); | ||||
|       *(uint32_t*)dst = argb; | ||||
|       dst += sizeof(argb); | ||||
| #elif !defined(__BIG_ENDIAN__) && defined(_MSC_VER) | ||||
|       argb = _byteswap_ulong(argb); | ||||
|       *(uint32_t*)dst = argb; | ||||
|       dst += sizeof(argb); | ||||
| #else | ||||
|       *dst++ = (argb >> 24) & 0xff; | ||||
|       *dst++ = (argb >> 16) & 0xff; | ||||
|       *dst++ = (argb >>  8) & 0xff; | ||||
|       *dst++ = (argb >>  0) & 0xff; | ||||
|       dst[0] = (argb >> 24) & 0xff; | ||||
|       dst[1] = (argb >> 16) & 0xff; | ||||
|       dst[2] = (argb >>  8) & 0xff; | ||||
|       dst[3] = (argb >>  0) & 0xff; | ||||
| #endif | ||||
| #else   // WEBP_REFERENCE_IMPLEMENTATION | ||||
|       dst[0] = (argb >> 24) & 0xff; | ||||
|       dst[1] = (argb >> 16) & 0xff; | ||||
|       dst[2] = (argb >>  8) & 0xff; | ||||
|       dst[3] = (argb >>  0) & 0xff; | ||||
| #endif | ||||
|       dst += sizeof(argb); | ||||
|     } | ||||
|   } else { | ||||
|     memcpy(dst, src, num_pixels * sizeof(*src)); | ||||
|   | ||||
| @@ -56,20 +56,29 @@ extern "C" { | ||||
| // -> we're back to eight active 'value_' bits (marked 'v') and BITS cached | ||||
| // bits (marked 'B') | ||||
| // | ||||
| // The right-justify strategy tends to use less shifts, so let's use it: | ||||
|  | ||||
| #define USE_RIGHT_JUSTIFY | ||||
| // The right-justify strategy tends to use fewer shifts and is often faster. | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
| // BITS can be either 32, 24, 16 or 8. | ||||
| // Pick values that fit natural register size. | ||||
|  | ||||
| #if !defined(WEBP_REFERENCE_IMPLEMENTATION) | ||||
|  | ||||
| #define USE_RIGHT_JUSTIFY | ||||
|  | ||||
| #if defined(__i386__) || defined(_M_IX86)      // x86 32bit | ||||
| #define BITS 16 | ||||
| #elif defined(__arm__) || defined(_M_ARM)     // ARM | ||||
| #define BITS 8 | ||||
| #define BITS 24 | ||||
| #else                      // reasonable default | ||||
| #define BITS 32 | ||||
| #define BITS 24 | ||||
| #endif | ||||
|  | ||||
| #else     // reference choices | ||||
|  | ||||
| #define USE_RIGHT_JUSTIFY | ||||
| #define BITS 8 | ||||
|  | ||||
| #endif | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user