1-3% faster encoding optimizing SSE_NxN functions

got rid of the |a-b| method (clip0(a-b) | clip0(b-a) in saturated 8-bit
arithmetic, then squared) and went back to just (a-b)^2 instead.

quality | size(bytes) after/before | time (ms) after/before

Change-Id: Ia3e0e6507b3f903deb1e182f78dad6df07380fd0
commit fabc65da32
parent 5f81391263
Author: Pascal Massimino
Date:   2014-10-09 07:20:00 -07:00


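For context before the diff: every SSE_NxN variant below computes a plain sum of squared differences over a block of 8-bit pixels whose rows sit BPS bytes apart. A minimal scalar sketch of that metric follows; the helper name is ours, and BPS = 32 is assumed here only to make the sketch self-contained:

    #include <stdint.h>

    #define BPS 32  /* libwebp's row stride constant; value assumed for this sketch */

    /* Scalar reference for the SSE_NxN metric family (illustrative only):
       sum((a - b)^2) over a w x h block, rows BPS bytes apart. */
    static int SSE_wxh_ref(const uint8_t* a, const uint8_t* b, int w, int h) {
      int sum = 0;
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          const int diff = a[x] - b[x];  /* in [-255, 255] for 8-bit pixels */
          sum += diff * diff;            /* (a-b)^2, which the new code vectorizes */
        }
        a += BPS;
        b += BPS;
      }
      return sum;
    }

SSE16x16 corresponds to w = h = 16, SSE16x8 to w = 16 and h = 8, and SSE8x8 and SSE4x4 to the square sizes their names suggest.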
@@ -488,95 +488,82 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
 //------------------------------------------------------------------------------
 // Metric
 
-static int SSE_Nx4(const uint8_t* a, const uint8_t* b,
-                   int num_quads, int do_16) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum1 = zero;
-  __m128i sum2 = zero;
-
-  while (num_quads-- > 0) {
-    // Note: for the !do_16 case, we read 16 pixels instead of 8 but that's ok,
-    // thanks to buffer over-allocation to that effect.
-    const __m128i a0 = _mm_loadu_si128((__m128i*)&a[BPS * 0]);
-    const __m128i a1 = _mm_loadu_si128((__m128i*)&a[BPS * 1]);
-    const __m128i a2 = _mm_loadu_si128((__m128i*)&a[BPS * 2]);
-    const __m128i a3 = _mm_loadu_si128((__m128i*)&a[BPS * 3]);
-    const __m128i b0 = _mm_loadu_si128((__m128i*)&b[BPS * 0]);
-    const __m128i b1 = _mm_loadu_si128((__m128i*)&b[BPS * 1]);
-    const __m128i b2 = _mm_loadu_si128((__m128i*)&b[BPS * 2]);
-    const __m128i b3 = _mm_loadu_si128((__m128i*)&b[BPS * 3]);
-    // compute clip0(a-b) and clip0(b-a)
-    const __m128i a0p = _mm_subs_epu8(a0, b0);
-    const __m128i a0m = _mm_subs_epu8(b0, a0);
-    const __m128i a1p = _mm_subs_epu8(a1, b1);
-    const __m128i a1m = _mm_subs_epu8(b1, a1);
-    const __m128i a2p = _mm_subs_epu8(a2, b2);
-    const __m128i a2m = _mm_subs_epu8(b2, a2);
-    const __m128i a3p = _mm_subs_epu8(a3, b3);
-    const __m128i a3m = _mm_subs_epu8(b3, a3);
-    // compute |a-b| with 8b arithmetic as clip0(a-b) | clip0(b-a)
-    const __m128i diff0 = _mm_or_si128(a0p, a0m);
-    const __m128i diff1 = _mm_or_si128(a1p, a1m);
-    const __m128i diff2 = _mm_or_si128(a2p, a2m);
-    const __m128i diff3 = _mm_or_si128(a3p, a3m);
-    // unpack (only four operations, instead of eight)
-    const __m128i low0 = _mm_unpacklo_epi8(diff0, zero);
-    const __m128i low1 = _mm_unpacklo_epi8(diff1, zero);
-    const __m128i low2 = _mm_unpacklo_epi8(diff2, zero);
-    const __m128i low3 = _mm_unpacklo_epi8(diff3, zero);
-    // multiply with self
-    const __m128i low_madd0 = _mm_madd_epi16(low0, low0);
-    const __m128i low_madd1 = _mm_madd_epi16(low1, low1);
-    const __m128i low_madd2 = _mm_madd_epi16(low2, low2);
-    const __m128i low_madd3 = _mm_madd_epi16(low3, low3);
-    // collect in a cascading way
-    const __m128i low_sum0 = _mm_add_epi32(low_madd0, low_madd1);
-    const __m128i low_sum1 = _mm_add_epi32(low_madd2, low_madd3);
-    sum1 = _mm_add_epi32(sum1, low_sum0);
-    sum2 = _mm_add_epi32(sum2, low_sum1);
-    if (do_16) {  // if necessary, process the higher 8 bytes similarly
-      const __m128i hi0 = _mm_unpackhi_epi8(diff0, zero);
-      const __m128i hi1 = _mm_unpackhi_epi8(diff1, zero);
-      const __m128i hi2 = _mm_unpackhi_epi8(diff2, zero);
-      const __m128i hi3 = _mm_unpackhi_epi8(diff3, zero);
-      const __m128i hi_madd0 = _mm_madd_epi16(hi0, hi0);
-      const __m128i hi_madd1 = _mm_madd_epi16(hi1, hi1);
-      const __m128i hi_madd2 = _mm_madd_epi16(hi2, hi2);
-      const __m128i hi_madd3 = _mm_madd_epi16(hi3, hi3);
-      const __m128i hi_sum0 = _mm_add_epi32(hi_madd0, hi_madd1);
-      const __m128i hi_sum1 = _mm_add_epi32(hi_madd2, hi_madd3);
-      sum1 = _mm_add_epi32(sum1, hi_sum0);
-      sum2 = _mm_add_epi32(sum2, hi_sum1);
-    }
-    a += 4 * BPS;
-    b += 4 * BPS;
-  }
-  {
-    int32_t tmp[4];
-    const __m128i sum = _mm_add_epi32(sum1, sum2);
-    _mm_storeu_si128((__m128i*)tmp, sum);
-    return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
-  }
-}
+static WEBP_INLINE __m128i SubtractAndAccumulate(const __m128i a,
+                                                 const __m128i b) {
+  const __m128i zero = _mm_setzero_si128();
+  // convert to 16b
+  const __m128i A0 = _mm_unpacklo_epi8(a, zero);
+  const __m128i B0 = _mm_unpacklo_epi8(b, zero);
+  const __m128i A1 = _mm_unpackhi_epi8(a, zero);
+  const __m128i B1 = _mm_unpackhi_epi8(b, zero);
+  // subtract
+  const __m128i C0 = _mm_subs_epi16(A0, B0);
+  const __m128i C1 = _mm_subs_epi16(A1, B1);
+  // multiply with self
+  const __m128i D0 = _mm_madd_epi16(C0, C0);
+  const __m128i D1 = _mm_madd_epi16(C1, C1);
+  // accumulate
+  const __m128i sum = _mm_add_epi32(D0, D1);
+  return sum;
+}
+
+static int SSE_16xN(const uint8_t* a, const uint8_t* b, int num_pairs) {
+  __m128i sum = _mm_setzero_si128();
+  int32_t tmp[4];
+  while (num_pairs-- > 0) {
+    const __m128i a0 = _mm_loadu_si128((__m128i*)&a[BPS * 0]);
+    const __m128i a1 = _mm_loadu_si128((__m128i*)&a[BPS * 1]);
+    const __m128i b0 = _mm_loadu_si128((__m128i*)&b[BPS * 0]);
+    const __m128i b1 = _mm_loadu_si128((__m128i*)&b[BPS * 1]);
+    const __m128i sum1 = SubtractAndAccumulate(a0, b0);
+    const __m128i sum2 = SubtractAndAccumulate(a1, b1);
+    const __m128i sum12 = _mm_add_epi32(sum1, sum2);
+    sum = _mm_add_epi32(sum, sum12);
+    a += 2 * BPS;
+    b += 2 * BPS;
+  }
+  _mm_storeu_si128((__m128i*)tmp, sum);
+  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
 
 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
-  return SSE_Nx4(a, b, 4, 1);
+  return SSE_16xN(a, b, 8);
 }
 
 static int SSE16x8(const uint8_t* a, const uint8_t* b) {
-  return SSE_Nx4(a, b, 2, 1);
+  return SSE_16xN(a, b, 4);
 }
 
+#define LOAD_8x16b(ptr) \
+    _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(ptr)), zero)
+
 static int SSE8x8(const uint8_t* a, const uint8_t* b) {
-  return SSE_Nx4(a, b, 2, 0);
+  const __m128i zero = _mm_setzero_si128();
+  int num_pairs = 4;
+  __m128i sum = zero;
+  int32_t tmp[4];
+  while (num_pairs-- > 0) {
+    const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
+    const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
+    const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
+    const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
+    // subtract
+    const __m128i c0 = _mm_subs_epi16(a0, b0);
+    const __m128i c1 = _mm_subs_epi16(a1, b1);
+    // multiply/accumulate with self
+    const __m128i d0 = _mm_madd_epi16(c0, c0);
+    const __m128i d1 = _mm_madd_epi16(c1, c1);
+    // collect
+    const __m128i sum01 = _mm_add_epi32(d0, d1);
+    sum = _mm_add_epi32(sum, sum01);
+    a += 2 * BPS;
+    b += 2 * BPS;
+  }
+  _mm_storeu_si128((__m128i*)tmp, sum);
+  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }
+#undef LOAD_8x16b
 
 static int SSE4x4(const uint8_t* a, const uint8_t* b) {
   const __m128i zero = _mm_setzero_si128();
@@ -591,38 +578,25 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
   const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
   const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
   const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);
-  // Combine pair of lines.
+  // Combine pair of lines and convert to 16b.
   const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
   const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
   const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
   const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
-  // Convert to 16b.
   const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
   const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
   const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
   const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
-  // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
-  // TODO(cduvivier): Dissassemble and figure out why this is fastest. We don't
-  //                  need absolute values, there is no need to do calculation
-  //                  in 8bit as we are already in 16bit, ... Yet this is what
-  //                  benchmarks the fastest!
-  const __m128i d0 = _mm_subs_epu8(a01s, b01s);
-  const __m128i d1 = _mm_subs_epu8(b01s, a01s);
-  const __m128i d2 = _mm_subs_epu8(a23s, b23s);
-  const __m128i d3 = _mm_subs_epu8(b23s, a23s);
-  // Square and add them all together.
-  const __m128i madd0 = _mm_madd_epi16(d0, d0);
-  const __m128i madd1 = _mm_madd_epi16(d1, d1);
-  const __m128i madd2 = _mm_madd_epi16(d2, d2);
-  const __m128i madd3 = _mm_madd_epi16(d3, d3);
-  const __m128i sum0 = _mm_add_epi32(madd0, madd1);
-  const __m128i sum1 = _mm_add_epi32(madd2, madd3);
-  const __m128i sum2 = _mm_add_epi32(sum0, sum1);
+  // subtract, square and accumulate
+  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
+  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
+  const __m128i e0 = _mm_madd_epi16(d0, d0);
+  const __m128i e1 = _mm_madd_epi16(d1, d1);
+  const __m128i sum = _mm_add_epi32(e0, e1);
   int32_t tmp[4];
-  _mm_storeu_si128((__m128i*)tmp, sum2);
+  _mm_storeu_si128((__m128i*)tmp, sum);
   return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }
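
Two properties, not spelled out in the diff, make the new 16-bit path safe. First, after zero-extending bytes to 16 bits every difference lies in [-255, 255], so _mm_subs_epi16 never actually saturates. Second, _mm_madd_epi16 multiplies adjacent 16-bit lanes and adds each pair into one 32-bit lane, so a lane holds at most 2 * 255^2 = 130050 per call, far below what the 32-bit accumulators can absorb over these block sizes. The standalone check below is an illustration (not part of the commit); it verifies that the discarded |a-b| formulation, built like the old code's clip0(a-b) | clip0(b-a), and the restored (a-b)^2 contribute identical squared terms for every byte pair:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One lane of _mm_subs_epu8: unsigned subtract, clipped at zero. */
    static uint8_t clip0_sub(uint8_t x, uint8_t y) {
      return (x > y) ? (uint8_t)(x - y) : 0;
    }

    int main(void) {
      int a, b;
      for (a = 0; a < 256; ++a) {
        for (b = 0; b < 256; ++b) {
          /* old method: |a-b| via saturated subtractions OR'd together */
          const int abs_diff = clip0_sub((uint8_t)a, (uint8_t)b) |
                               clip0_sub((uint8_t)b, (uint8_t)a);
          /* new method: plain signed difference, always in [-255, 255] */
          const int diff = a - b;
          assert(abs_diff * abs_diff == diff * diff);
        }
      }
      printf("old |a-b| and new (a-b) formulations agree on all byte pairs\n");
      return 0;
    }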