mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 02:15:42 +01:00 
			
		
		
		
	rework AddVectorEq_SSE2
Take advantage of the known sizes used by VP8LHistogramAdd() and remove the loop for the remainder. The loop was being auto-vectorized, making the code larger and slower than the vectorized C code.

For larger sizes the new code is ~3-4.5% faster than the old code, with about the same improvement against the vectorized C code. For the minimal size (40), the new code is ~30% faster than both the C code and the old SSE2 code.

The LINE_SIZE==8 option is removed with this change. It had been set to 16 for its entire life, and clang-16 was unrolling the LINE_SIZE==8 case by 2 in any case; they both profile similarly.

Change-Id: I6dfedfd57474f44d15e2ce510a48e5252221077a
This commit is contained in:
		| @@ -224,35 +224,53 @@ static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a, | ||||
|   } | ||||
| } | ||||
|  | ||||
| #define LINE_SIZE 16    // 8 or 16 | ||||
| static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a, | ||||
|                              uint32_t* WEBP_RESTRICT out, int size) { | ||||
|   int i; | ||||
|   for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { | ||||
|   int i = 0; | ||||
|   int aligned_size = size & ~15; | ||||
|   // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as | ||||
|   // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of | ||||
|   // 2). See the usage in VP8LHistogramAdd(). | ||||
|   assert(size >= 16); | ||||
|   assert(size % 2 == 0); | ||||
|  | ||||
|   do { | ||||
|     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]); | ||||
|     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]); | ||||
| #if (LINE_SIZE == 16) | ||||
|     const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]); | ||||
|     const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]); | ||||
| #endif | ||||
|     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]); | ||||
|     const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]); | ||||
| #if (LINE_SIZE == 16) | ||||
|     const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]); | ||||
|     const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]); | ||||
| #endif | ||||
|     _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0)); | ||||
|     _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1)); | ||||
| #if (LINE_SIZE == 16) | ||||
|     _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2)); | ||||
|     _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); | ||||
| #endif | ||||
|     i += 16; | ||||
|   } while (i != aligned_size); | ||||
|  | ||||
|   if ((size & 8) != 0) { | ||||
|     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); | ||||
|     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); | ||||
|     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]); | ||||
|     const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]); | ||||
|     _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); | ||||
|     _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); | ||||
|     i += 8; | ||||
|   } | ||||
|   for (; i < size; ++i) { | ||||
|     out[i] += a[i]; | ||||
|  | ||||
|   size &= 7; | ||||
|   if (size == 4) { | ||||
|     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); | ||||
|     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]); | ||||
|     _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0)); | ||||
|   } else if (size == 2) { | ||||
|     const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]); | ||||
|     const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]); | ||||
|     _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0)); | ||||
|   } | ||||
| } | ||||
| #undef LINE_SIZE | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
| // Entropy | ||||
|   | ||||
		Reference in New Issue
	
	Block a user