mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 02:15:42 +01:00 
			
		
		
		
	SSE optimization for vector mismatch.
Change-Id: I564b822033b59d86635230f29ed6197e306a2c4f
This commit is contained in:
		| @@ -262,6 +262,11 @@ extern VP8LHistogramAddFunc VP8LHistogramAdd; | ||||
| // ----------------------------------------------------------------------------- | ||||
| // PrefixEncode() | ||||
|  | ||||
| typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1, | ||||
|                                       const uint32_t* const array2, int length); | ||||
| // Returns the first index where array1 and array2 differ, or length if none. | ||||
| extern VP8LVectorMismatchFunc VP8LVectorMismatch; | ||||
|  | ||||
| static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) { | ||||
|   const int log_floor = BitsLog2Floor(n); | ||||
|   if (n == (n & ~(n - 1)))  // zero or a power of two. | ||||
|   | ||||
| @@ -1053,6 +1053,17 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality, | ||||
| } | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
| /* Plain-C version, installed as the default VP8LVectorMismatch: counts how | ||||
|  * many leading elements of array1 and array2 are equal, i.e. returns the index | ||||
|  * of the first mismatch, or length if all 'length' elements match. */ | ||||
| static int VectorMismatch(const uint32_t* const array1, | ||||
|                           const uint32_t* const array2, int length) { | ||||
|   int match_len = 0; | ||||
|  | ||||
|   while (match_len < length && array1[match_len] == array2[match_len]) { | ||||
|     ++match_len; | ||||
|   } | ||||
|   return match_len; | ||||
| } | ||||
|  | ||||
| // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. | ||||
| void VP8LBundleColorMap(const uint8_t* const row, int width, | ||||
|                         int xbits, uint32_t* const dst) { | ||||
| @@ -1149,6 +1160,8 @@ GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper; | ||||
|  | ||||
| VP8LHistogramAddFunc VP8LHistogramAdd; | ||||
|  | ||||
| VP8LVectorMismatchFunc VP8LVectorMismatch; | ||||
|  | ||||
| extern void VP8LEncDspInitSSE2(void); | ||||
| extern void VP8LEncDspInitSSE41(void); | ||||
| extern void VP8LEncDspInitNEON(void); | ||||
| @@ -1181,6 +1194,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) { | ||||
|  | ||||
|   VP8LHistogramAdd = HistogramAdd; | ||||
|  | ||||
|   VP8LVectorMismatch = VectorMismatch; | ||||
|  | ||||
|   // If defined, use CPUInfo() to overwrite some pointers with faster versions. | ||||
|   if (VP8GetCPUInfo != NULL) { | ||||
| #if defined(WEBP_USE_SSE2) | ||||
|   | ||||
| @@ -324,6 +324,57 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) { | ||||
| #undef ANALYZE_X_OR_Y | ||||
| #undef ANALYZE_XY | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
| /* SSE2 version of VectorMismatch: returns the index of the first element at | ||||
|  * which array1 and array2 differ, or length if they match throughout. Compares | ||||
|  * four 32-bit values per unaligned 128-bit load, then finishes with the scalar | ||||
|  * loop, which pins down the exact index inside the last block examined. Every | ||||
|  * vector load stays within the first 'length' elements: one pass of the | ||||
|  * unrolled loop reads at most up to index match_len + 11 and only starts while | ||||
|  * match_len + 12 <= length; the short path checks 4 <= length and 8 <= length | ||||
|  * before each of its loads. */ | ||||
| static int VectorMismatch(const uint32_t* const array1, | ||||
|                           const uint32_t* const array2, int length) { | ||||
|   int match_len; | ||||
|  | ||||
|   if (12 <= length) { | ||||
|     __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]); | ||||
|     __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]); | ||||
|     match_len = 0; | ||||
|     do { | ||||
|       // Loop unrolling and early load both provide a speedup of 10% for the | ||||
|       // current function. Also, length can be MAX_LENGTH=4096 at most. | ||||
|       const __m128i cmpA = _mm_cmpeq_epi32(A0, A1); | ||||
|       const __m128i B0 = | ||||
|           _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); | ||||
|       const __m128i B1 = | ||||
|           _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); | ||||
|       if (_mm_movemask_epi8(cmpA) != 0xffff) break; | ||||
|       match_len += 4; | ||||
|  | ||||
|       { | ||||
|         const __m128i cmpB = _mm_cmpeq_epi32(B0, B1); | ||||
|         A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); | ||||
|         A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); | ||||
|         if (_mm_movemask_epi8(cmpB) != 0xffff) break; | ||||
|         match_len += 4; | ||||
|       } | ||||
|     } while (match_len + 12 < length); | ||||
|   } else { | ||||
|     match_len = 0; | ||||
|     // Unroll the potential first two loops. | ||||
|     if (4 <= length && | ||||
|         _mm_movemask_epi8(_mm_cmpeq_epi32( | ||||
|             _mm_loadu_si128((const __m128i*)&array1[0]), | ||||
|             _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) { | ||||
|       match_len = 4; | ||||
|       if (8 <= length && | ||||
|           _mm_movemask_epi8(_mm_cmpeq_epi32( | ||||
|               _mm_loadu_si128((const __m128i*)&array1[4]), | ||||
|               _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) | ||||
|         match_len = 8; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   while (match_len < length && array1[match_len] == array2[match_len]) { | ||||
|     ++match_len; | ||||
|   } | ||||
|   return match_len; | ||||
| } | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
| // Entry point | ||||
|  | ||||
| @@ -336,6 +387,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { | ||||
|   VP8LCollectColorRedTransforms = CollectColorRedTransforms; | ||||
|   VP8LHistogramAdd = HistogramAdd; | ||||
|   VP8LCombinedShannonEntropy = CombinedShannonEntropy; | ||||
|   VP8LVectorMismatch = VectorMismatch; | ||||
| } | ||||
|  | ||||
| #else  // !WEBP_USE_SSE2 | ||||
|   | ||||
| @@ -57,32 +57,19 @@ static int DistanceToPlaneCode(int xsize, int dist) { | ||||
|   return dist + 120; | ||||
| } | ||||
|  | ||||
| // Returns the exact index where array1 and array2 are different if this | ||||
| // index is strictly superior to best_len_match. Otherwise, it returns 0. | ||||
| // Returns the exact index where array1 and array2 are different. For an index | ||||
| // less than or equal to best_len_match, the return value just has to be | ||||
| // strictly less than best_len_match. The current behavior is to return 0 if | ||||
| // this index is best_len_match, and the index itself otherwise. | ||||
| // If no two elements differ, it returns max_limit. | ||||
| static WEBP_INLINE int FindMatchLength(const uint32_t* const array1, | ||||
|                                        const uint32_t* const array2, | ||||
|                                        int best_len_match, | ||||
|                                        int max_limit) { | ||||
|   int match_len; | ||||
|  | ||||
|                                        int best_len_match, int max_limit) { | ||||
|   // Before 'expensive' linear match, check if the two arrays match at the | ||||
|   // current best length index. | ||||
|   if (array1[best_len_match] != array2[best_len_match]) return 0; | ||||
|  | ||||
| #if defined(WEBP_USE_SSE2) | ||||
|   // Check if anything is different up to best_len_match excluded. | ||||
|   // memcmp seems to be slower on ARM so it is disabled for now. | ||||
|   if (memcmp(array1, array2, best_len_match * sizeof(*array1))) return 0; | ||||
|   match_len = best_len_match + 1; | ||||
| #else | ||||
|   match_len = 0; | ||||
| #endif | ||||
|  | ||||
|   while (match_len < max_limit && array1[match_len] == array2[match_len]) { | ||||
|     ++match_len; | ||||
|   } | ||||
|   return match_len; | ||||
|   return VP8LVectorMismatch(array1, array2, max_limit); | ||||
| } | ||||
|  | ||||
| // ----------------------------------------------------------------------------- | ||||
|   | ||||
		Reference in New Issue
	
	Block a user