dsp/dec_sse2: DC8uv / DC8uvNoLeft speedup

Use psadbw (_mm_sad_epu8) to sum the top row. The left column is still
summed in C, because repacking it into a vector to apply the same
operation is too costly.

DC8uv: ~19% faster
DC8uvNoLeft: ~12% faster

Change-Id: I707c4f6177a65b5d1f2d3deeca87d2bb740185e2
This commit is contained in:
James Zern 2015-04-08 18:43:46 -07:00 committed by Gerrit Code Review
parent 7df2049785
commit 9904e365a8

View File

@@ -1208,21 +1208,26 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
 }
 
 static void DC8uv(uint8_t* dst) {  // DC
-  int dc0 = 8;
-  int i;
-  for (i = 0; i < 8; ++i) {
-    dc0 += dst[i - BPS] + dst[-1 + i * BPS];
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  const __m128i sum = _mm_sad_epu8(top, zero);
+  int left = 0;
+  int j;
+  for (j = 0; j < 8; ++j) {
+    left += dst[-1 + j * BPS];
+  }
+  {
+    const int DC = _mm_cvtsi128_si32(sum) + left + 8;
+    Put8x8uv(DC >> 4, dst);
   }
-  Put8x8uv(dc0 >> 4, dst);
 }
 
 static void DC8uvNoLeft(uint8_t* dst) {  // DC with no left samples
-  int dc0 = 4;
-  int i;
-  for (i = 0; i < 8; ++i) {
-    dc0 += dst[i - BPS];
-  }
-  Put8x8uv(dc0 >> 3, dst);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+  const __m128i sum = _mm_sad_epu8(top, zero);
+  const int DC = _mm_cvtsi128_si32(sum) + 4;
+  Put8x8uv(DC >> 3, dst);
 }
 
 static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples