mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 18:35:41 +01:00 
			
		
		
		
	rework AddVectorEq_SSE2
Take advantage of the known sizes used by VP8LHistogramAdd() and remove loop for the remainder. The loop was being auto-vectorized making the code larger and slower than the vectorized C code.

For larger sizes the new code is ~3-4.5% faster than the old code with about the same improvement against the vectorized C code. For the minimal size (40), the new code is ~30% faster than the C and old SSE2 code.

The LINE_SIZE==8 option is removed with this change. It had been set to 16 for its entire life and clang-16 was unrolling the LINE_SIZE==8 case by 2 in any case; they both profile similarly.

Change-Id: I6dfedfd57474f44d15e2ce510a48e5252221077a
This commit is contained in:
		| @@ -224,35 +224,53 @@ static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a, | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| #define LINE_SIZE 16    // 8 or 16 |  | ||||||
| static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a, | static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a, | ||||||
|                              uint32_t* WEBP_RESTRICT out, int size) { |                              uint32_t* WEBP_RESTRICT out, int size) { | ||||||
|   int i; |   int i = 0; | ||||||
|   for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { |   int aligned_size = size & ~15; | ||||||
|  |   // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as | ||||||
|  |   // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of | ||||||
|  |   // 2). See the usage in VP8LHistogramAdd(). | ||||||
|  |   assert(size >= 16); | ||||||
|  |   assert(size % 2 == 0); | ||||||
|  |  | ||||||
|  |   do { | ||||||
|     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]); |     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]); | ||||||
|     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]); |     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]); | ||||||
| #if (LINE_SIZE == 16) |  | ||||||
|     const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]); |     const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]); | ||||||
|     const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]); |     const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]); | ||||||
| #endif |  | ||||||
|     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]); |     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]); | ||||||
|     const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]); |     const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]); | ||||||
| #if (LINE_SIZE == 16) |  | ||||||
|     const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]); |     const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]); | ||||||
|     const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]); |     const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]); | ||||||
| #endif |  | ||||||
|     _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0)); |     _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0)); | ||||||
|     _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1)); |     _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1)); | ||||||
| #if (LINE_SIZE == 16) |  | ||||||
|     _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2)); |     _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2)); | ||||||
|     _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); |     _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); | ||||||
| #endif |     i += 16; | ||||||
|  |   } while (i != aligned_size); | ||||||
|  |  | ||||||
|  |   if ((size & 8) != 0) { | ||||||
|  |     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); | ||||||
|  |     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); | ||||||
|  |     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]); | ||||||
|  |     const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]); | ||||||
|  |     _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); | ||||||
|  |     _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); | ||||||
|  |     i += 8; | ||||||
|   } |   } | ||||||
|   for (; i < size; ++i) { |  | ||||||
|     out[i] += a[i]; |   size &= 7; | ||||||
|  |   if (size == 4) { | ||||||
|  |     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); | ||||||
|  |     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]); | ||||||
|  |     _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0)); | ||||||
|  |   } else if (size == 2) { | ||||||
|  |     const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]); | ||||||
|  |     const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]); | ||||||
|  |     _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0)); | ||||||
|   } |   } | ||||||
| } | } | ||||||
| #undef LINE_SIZE |  | ||||||
|  |  | ||||||
| //------------------------------------------------------------------------------ | //------------------------------------------------------------------------------ | ||||||
| // Entropy | // Entropy | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user