mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-26 13:48:21 +01:00
Make Forward WHT transform use 32bit fixed-point calculation
This is in preparation for a future change where the input will be 16-bit instead of 12-bit. No speed difference was observed. Note that the NEON implementation was already using 32-bit calculation. Change-Id: If06935db5c56a77fc9cefcb2dec617483f5f62b4
This commit is contained in:
parent
a3359f5d2c
commit
41c0cc4b9a
@ -190,7 +190,7 @@ static void ITransformWHT(const int16_t* in, int16_t* out) {
|
|||||||
|
|
||||||
static void FTransformWHT(const int16_t* in, int16_t* out) {
|
static void FTransformWHT(const int16_t* in, int16_t* out) {
|
||||||
// input is 12b signed
|
// input is 12b signed
|
||||||
int16_t tmp[16];
|
int32_t tmp[16];
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < 4; ++i, in += 64) {
|
for (i = 0; i < 4; ++i, in += 64) {
|
||||||
const int a0 = (in[0 * 16] + in[2 * 16]); // 13b
|
const int a0 = (in[0 * 16] + in[2 * 16]); // 13b
|
||||||
|
@ -456,7 +456,7 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
|
static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
|
||||||
int16_t tmp[16];
|
int32_t tmp[16];
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < 4; ++i, in += 64) {
|
for (i = 0; i < 4; ++i, in += 64) {
|
||||||
const int a0 = (in[0 * 16] + in[2 * 16]);
|
const int a0 = (in[0 * 16] + in[2 * 16]);
|
||||||
@ -469,22 +469,22 @@ static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
|
|||||||
tmp[3 + i * 4] = a0 - a1;
|
tmp[3 + i * 4] = a0 - a1;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
const __m128i src0 = _mm_loadl_epi64((__m128i*)&tmp[0]);
|
const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
|
||||||
const __m128i src1 = _mm_loadl_epi64((__m128i*)&tmp[4]);
|
const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
|
||||||
const __m128i src2 = _mm_loadl_epi64((__m128i*)&tmp[8]);
|
const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
|
||||||
const __m128i src3 = _mm_loadl_epi64((__m128i*)&tmp[12]);
|
const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
|
||||||
const __m128i a0 = _mm_add_epi16(src0, src2);
|
const __m128i a0 = _mm_add_epi32(src0, src2);
|
||||||
const __m128i a1 = _mm_add_epi16(src1, src3);
|
const __m128i a1 = _mm_add_epi32(src1, src3);
|
||||||
const __m128i a2 = _mm_sub_epi16(src1, src3);
|
const __m128i a2 = _mm_sub_epi32(src1, src3);
|
||||||
const __m128i a3 = _mm_sub_epi16(src0, src2);
|
const __m128i a3 = _mm_sub_epi32(src0, src2);
|
||||||
const __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(a0, a1), 1);
|
const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
|
||||||
const __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(a3, a2), 1);
|
const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
|
||||||
const __m128i b2 = _mm_srai_epi16(_mm_subs_epi16(a3, a2), 1);
|
const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
|
||||||
const __m128i b3 = _mm_srai_epi16(_mm_subs_epi16(a0, a1), 1);
|
const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
|
||||||
_mm_storel_epi64((__m128i*)&out[ 0], b0);
|
const __m128i out0 = _mm_packs_epi32(b0, b1);
|
||||||
_mm_storel_epi64((__m128i*)&out[ 4], b1);
|
const __m128i out1 = _mm_packs_epi32(b2, b3);
|
||||||
_mm_storel_epi64((__m128i*)&out[ 8], b2);
|
_mm_storeu_si128((__m128i*)&out[0], out0);
|
||||||
_mm_storel_epi64((__m128i*)&out[12], b3);
|
_mm_storeu_si128((__m128i*)&out[8], out1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user