Mirror of https://github.com/webmproject/libwebp.git (synced 2024-12-26 13:48:21 +01:00)
dsp/dec_sse2: DC16 / DC16NoLeft speedup
Use psadbw to perform the top-row summation; the left column remains in C, as repacking it into a vector to apply the same operation is too costly.

DC16: ~20% faster
DC16NoLeft: ~14% faster

Change-Id: I7ec3f8a6e5923f88a530f79fceb88d5001bef691
This commit is contained in:
parent
8e515dfeda
commit
7df2049785
@ -1139,12 +1139,20 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void DC16(uint8_t* dst) { // DC
|
static void DC16(uint8_t* dst) { // DC
|
||||||
int DC = 16;
|
const __m128i zero = _mm_setzero_si128();
|
||||||
|
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
|
||||||
|
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
|
||||||
|
// sum the two sads: sad8x2[0:1] + sad8x2[8:9]
|
||||||
|
const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
|
||||||
|
int left = 0;
|
||||||
int j;
|
int j;
|
||||||
for (j = 0; j < 16; ++j) {
|
for (j = 0; j < 16; ++j) {
|
||||||
DC += dst[-1 + j * BPS] + dst[j - BPS];
|
left += dst[-1 + j * BPS];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
const int DC = _mm_cvtsi128_si32(sum) + left + 16;
|
||||||
|
Put16(DC >> 5, dst);
|
||||||
}
|
}
|
||||||
Put16(DC >> 5, dst);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
|
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
|
||||||
@ -1157,11 +1165,12 @@ static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
|
static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
|
||||||
int DC = 8;
|
const __m128i zero = _mm_setzero_si128();
|
||||||
int i;
|
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
|
||||||
for (i = 0; i < 16; ++i) {
|
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
|
||||||
DC += dst[i - BPS];
|
// sum the two sads: sad8x2[0:1] + sad8x2[8:9]
|
||||||
}
|
const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
|
||||||
|
const int DC = _mm_cvtsi128_si32(sum) + 8;
|
||||||
Put16(DC >> 4, dst);
|
Put16(DC >> 4, dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user