mirror of
https://github.com/webmproject/libwebp.git
synced 2024-11-19 20:08:28 +01:00
rework AddVector_SSE2
Take advantage of the known sizes used by VP8LHistogramAdd() and remove loop for the remainder. The loop was being auto-vectorized making the code larger and slower than the vectorized C code. For larger sizes the new code is ~4-7% faster than the old code with about the same improvement against the vectorized C code. For the minimal size (40), the new code is ~30% faster than the C and old SSE2 code. The LINE_SIZE==8 option is removed with this change. It had been set to 16 for its entire life and clang-16 was unrolling the LINE_SIZE==8 case by 2 in any case; they both profile similarly. Change-Id: I2376e2dca3bffa38477b4a432f4c533419e3be0e
This commit is contained in:
parent
2ddaaf0aa5
commit
7bda3deb89
@ -175,36 +175,56 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
|
||||
|
||||
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
|
||||
// that's ok since the histogram values are less than 1<<28 (max picture size).
|
||||
#define LINE_SIZE 16 // 8 or 16
|
||||
static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
|
||||
const uint32_t* WEBP_RESTRICT b,
|
||||
uint32_t* WEBP_RESTRICT out, int size) {
|
||||
int i;
|
||||
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
|
||||
int i = 0;
|
||||
int aligned_size = size & ~15;
|
||||
// Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
|
||||
// NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
|
||||
// 2). See the usage in VP8LHistogramAdd().
|
||||
assert(size >= 16);
|
||||
assert(size % 2 == 0);
|
||||
|
||||
do {
|
||||
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
|
||||
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
|
||||
#if (LINE_SIZE == 16)
|
||||
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
|
||||
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
|
||||
#endif
|
||||
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
|
||||
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
|
||||
#if (LINE_SIZE == 16)
|
||||
const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
|
||||
const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
|
||||
#endif
|
||||
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
|
||||
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
|
||||
#if (LINE_SIZE == 16)
|
||||
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
|
||||
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
|
||||
#endif
|
||||
i += 16;
|
||||
} while (i != aligned_size);
|
||||
|
||||
if ((size & 8) != 0) {
|
||||
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
|
||||
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
|
||||
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
|
||||
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
|
||||
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
|
||||
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
|
||||
i += 8;
|
||||
}
|
||||
for (; i < size; ++i) {
|
||||
out[i] = a[i] + b[i];
|
||||
|
||||
size &= 7;
|
||||
if (size == 4) {
|
||||
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
|
||||
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
|
||||
_mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
|
||||
} else if (size == 2) {
|
||||
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
|
||||
const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[i]);
|
||||
_mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
|
||||
}
|
||||
}
|
||||
|
||||
#define LINE_SIZE 16 // 8 or 16
|
||||
static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
|
||||
uint32_t* WEBP_RESTRICT out, int size) {
|
||||
int i;
|
||||
|
Loading…
Reference in New Issue
Block a user