mirror of
https://github.com/webmproject/libwebp.git
synced 2025-02-13 15:32:53 +01:00
dsp/enc_sse2: add luma4 intra predictors
VP8EncPredLuma4 improvement over ~20M pixels: ~39% Change-Id: I9cd841250771276d2d1bef3991215a56e83f7f20
This commit is contained in:
parent
040b11bdf6
commit
f274a96ce9
@ -509,7 +509,10 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
|
||||
|
||||
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
|
||||
if (size == 4) {
|
||||
// TODO
|
||||
int j;
|
||||
for (j = 0; j < 4; ++j) {
|
||||
memset(dst + j * BPS, value, 4);
|
||||
}
|
||||
} else if (size == 8) {
|
||||
Put8x8uv(value, dst);
|
||||
} else {
|
||||
@ -719,6 +722,213 @@ static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// 4x4 predictions
|
||||
|
||||
// Pixel at column 'x', row 'y' of the block pointed to by the local 'dst';
// consecutive rows are BPS bytes apart.
#define DST(x, y) dst[(y) * BPS + (x)]
// Rounded 3-tap average with weights 1-2-1.
#define AVG3(a, b, c) (((a) + (c) + 2 * (b) + 2) >> 2)
// Rounded average of two samples.
#define AVG2(a, b) (((b) + (a) + 1) >> 1)
|
||||
|
||||
// We use the following 8b-arithmetic tricks:
|
||||
// (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
|
||||
// where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
|
||||
// and:
|
||||
// (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
|
||||
//   where: AB = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
//     and ab = a ^ b, bc = b ^ c, lsb = (AB^BC)&1
|
||||
|
||||
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
|
||||
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
|
||||
const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
|
||||
const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
|
||||
const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
|
||||
const __m128i b = _mm_subs_epu8(a, lsb);
|
||||
const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
|
||||
const uint32_t vals = _mm_cvtsi128_si32(avg);
|
||||
int i;
|
||||
for (i = 0; i < 4; ++i) {
|
||||
*(uint32_t*)(dst + i * BPS) = vals;
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
|
||||
const int X = top[-1];
|
||||
const int I = top[-2];
|
||||
const int J = top[-3];
|
||||
const int K = top[-4];
|
||||
const int L = top[-5];
|
||||
*(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
|
||||
*(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
|
||||
*(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
|
||||
*(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
|
||||
}
|
||||
|
||||
// DC prediction: fills the 4x4 block with the rounded mean of the eight
// samples top[0..3] and top[-5..-2].
static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
  uint32_t sum = 4;   // rounding bias for the division by 8 below
  int k = 0;
  while (k < 4) {
    sum += top[k] + top[k - 5];
    ++k;
  }
  Fill(dst, sum >> 3, 4);
}
|
||||
|
||||
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
|
||||
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
|
||||
const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
|
||||
const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, top[7], 3);
|
||||
const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
|
||||
const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
|
||||
const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
|
||||
const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
|
||||
*(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( abcdefg );
|
||||
*(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1));
|
||||
*(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2));
|
||||
*(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3));
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VR4(uint8_t* dst,
|
||||
const uint8_t* top) { // Vertical-Right
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const int I = top[-2];
|
||||
const int J = top[-3];
|
||||
const int K = top[-4];
|
||||
const int X = top[-1];
|
||||
const __m128i XABCD = _mm_loadl_epi64((const __m128i*)(top - 1));
|
||||
const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
|
||||
const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
|
||||
const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
|
||||
const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
|
||||
const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
|
||||
const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
|
||||
const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
|
||||
const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
|
||||
*(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( abcd );
|
||||
*(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32( efgh );
|
||||
*(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1));
|
||||
*(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1));
|
||||
|
||||
// these two are hard to implement in SSE2, so we keep the C-version:
|
||||
DST(0, 2) = AVG3(J, I, X);
|
||||
DST(0, 3) = AVG3(K, J, I);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VL4(uint8_t* dst,
|
||||
const uint8_t* top) { // Vertical-Left
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
|
||||
const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
|
||||
const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
|
||||
const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
|
||||
const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
|
||||
const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
|
||||
const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
|
||||
const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
|
||||
const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
|
||||
const __m128i abbc = _mm_or_si128(ab, bc);
|
||||
const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
|
||||
const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
|
||||
const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
|
||||
*(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32( avg1 );
|
||||
*(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32( avg4 );
|
||||
*(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1));
|
||||
*(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1));
|
||||
|
||||
// these two are hard to get and irregular
|
||||
DST(3, 2) = (extra_out >> 0) & 0xff;
|
||||
DST(3, 3) = (extra_out >> 8) & 0xff;
|
||||
}
|
||||
|
||||
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
|
||||
const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
|
||||
const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
|
||||
const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
|
||||
const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
|
||||
const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
|
||||
const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
|
||||
const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
|
||||
*(uint32_t*)(dst + 3 * BPS) = _mm_cvtsi128_si32( abcdefg );
|
||||
*(uint32_t*)(dst + 2 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1));
|
||||
*(uint32_t*)(dst + 1 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2));
|
||||
*(uint32_t*)(dst + 0 * BPS) = _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3));
|
||||
}
|
||||
|
||||
static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
|
||||
const int I = top[-2];
|
||||
const int J = top[-3];
|
||||
const int K = top[-4];
|
||||
const int L = top[-5];
|
||||
DST(0, 0) = AVG2(I, J);
|
||||
DST(2, 0) = DST(0, 1) = AVG2(J, K);
|
||||
DST(2, 1) = DST(0, 2) = AVG2(K, L);
|
||||
DST(1, 0) = AVG3(I, J, K);
|
||||
DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
|
||||
DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
|
||||
DST(3, 2) = DST(2, 2) =
|
||||
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
|
||||
}
|
||||
|
||||
static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
|
||||
const int X = top[-1];
|
||||
const int I = top[-2];
|
||||
const int J = top[-3];
|
||||
const int K = top[-4];
|
||||
const int L = top[-5];
|
||||
const int A = top[0];
|
||||
const int B = top[1];
|
||||
const int C = top[2];
|
||||
|
||||
DST(0, 0) = DST(2, 1) = AVG2(I, X);
|
||||
DST(0, 1) = DST(2, 2) = AVG2(J, I);
|
||||
DST(0, 2) = DST(2, 3) = AVG2(K, J);
|
||||
DST(0, 3) = AVG2(L, K);
|
||||
|
||||
DST(3, 0) = AVG3(A, B, C);
|
||||
DST(2, 0) = AVG3(X, A, B);
|
||||
DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
|
||||
DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
|
||||
DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
|
||||
DST(1, 3) = AVG3(L, K, J);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i top_values = _mm_cvtsi32_si128(*(const int*)top);
|
||||
const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
|
||||
int y;
|
||||
for (y = 0; y < 4; ++y, dst += BPS) {
|
||||
const int val = top[-2 - y] - top[-1];
|
||||
const __m128i base = _mm_set1_epi16(val);
|
||||
const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
|
||||
*(int*)dst = _mm_cvtsi128_si32(out);
|
||||
}
|
||||
}
|
||||
|
||||
#undef DST
|
||||
#undef AVG3
|
||||
#undef AVG2
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// luma 4x4 prediction
|
||||
|
||||
// Left samples are top[-5 .. -2], top_left is top[-1], top are
|
||||
// located at top[0..3], and top right is top[4..7]
|
||||
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
|
||||
DC4(I4DC4 + dst, top);
|
||||
TM4(I4TM4 + dst, top);
|
||||
VE4(I4VE4 + dst, top);
|
||||
HE4(I4HE4 + dst, top);
|
||||
RD4(I4RD4 + dst, top);
|
||||
VR4(I4VR4 + dst, top);
|
||||
LD4(I4LD4 + dst, top);
|
||||
VL4(I4VL4 + dst, top);
|
||||
HD4(I4HD4 + dst, top);
|
||||
HU4(I4HU4 + dst, top);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Chroma 8x8 prediction (paragraph 12.2)
|
||||
|
||||
@ -1186,6 +1396,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
|
||||
VP8CollectHistogram = CollectHistogram;
|
||||
VP8EncPredLuma16 = Intra16Preds;
|
||||
VP8EncPredChroma8 = IntraChromaPreds;
|
||||
VP8EncPredLuma4 = Intra4Preds;
|
||||
VP8EncQuantizeBlock = QuantizeBlock;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks;
|
||||
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
|
||||
|
Loading…
x
Reference in New Issue
Block a user