mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-26 13:48:21 +01:00
dsp/dec_sse2: DC8uv / DC8uvNoLeft speedup
Use psadbw to perform the top-row summation; the left-column summation remains in C, as repacking it into a vector to apply the same operation is too costly. DC8uv: ~19% faster. DC8uvNoLeft: ~12% faster. Change-Id: I707c4f6177a65b5d1f2d3deeca87d2bb740185e2
This commit is contained in:
parent
7df2049785
commit
9904e365a8
@@ -1208,21 +1208,26 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void DC8uv(uint8_t* dst) { // DC
|
static void DC8uv(uint8_t* dst) { // DC
|
||||||
int dc0 = 8;
|
const __m128i zero = _mm_setzero_si128();
|
||||||
int i;
|
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
|
||||||
for (i = 0; i < 8; ++i) {
|
const __m128i sum = _mm_sad_epu8(top, zero);
|
||||||
dc0 += dst[i - BPS] + dst[-1 + i * BPS];
|
int left = 0;
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < 8; ++j) {
|
||||||
|
left += dst[-1 + j * BPS];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
const int DC = _mm_cvtsi128_si32(sum) + left + 8;
|
||||||
|
Put8x8uv(DC >> 4, dst);
|
||||||
}
|
}
|
||||||
Put8x8uv(dc0 >> 4, dst);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
|
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
|
||||||
int dc0 = 4;
|
const __m128i zero = _mm_setzero_si128();
|
||||||
int i;
|
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
|
||||||
for (i = 0; i < 8; ++i) {
|
const __m128i sum = _mm_sad_epu8(top, zero);
|
||||||
dc0 += dst[i - BPS];
|
const int DC = _mm_cvtsi128_si32(sum) + 4;
|
||||||
}
|
Put8x8uv(DC >> 3, dst);
|
||||||
Put8x8uv(dc0 >> 3, dst);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
|
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
|
||||||
|
Loading…
Reference in New Issue
Block a user