Make the forward WHT transform use 32-bit fixed-point calculation

This is in preparation for a future change where the input will
be 16-bit instead of 12-bit.

No speed difference observed.

Note that the NEON implementation was already using 32-bit calculations.

Change-Id: If06935db5c56a77fc9cefcb2dec617483f5f62b4
This commit is contained in:
skal 2013-12-10 06:10:52 +01:00
parent a3359f5d2c
commit 41c0cc4b9a
2 changed files with 18 additions and 18 deletions

View File

@ -190,7 +190,7 @@ static void ITransformWHT(const int16_t* in, int16_t* out) {
static void FTransformWHT(const int16_t* in, int16_t* out) { static void FTransformWHT(const int16_t* in, int16_t* out) {
// input is 12b signed // input is 12b signed
int16_t tmp[16]; int32_t tmp[16];
int i; int i;
for (i = 0; i < 4; ++i, in += 64) { for (i = 0; i < 4; ++i, in += 64) {
const int a0 = (in[0 * 16] + in[2 * 16]); // 13b const int a0 = (in[0 * 16] + in[2 * 16]); // 13b

View File

@ -456,7 +456,7 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
} }
static void FTransformWHTSSE2(const int16_t* in, int16_t* out) { static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
int16_t tmp[16]; int32_t tmp[16];
int i; int i;
for (i = 0; i < 4; ++i, in += 64) { for (i = 0; i < 4; ++i, in += 64) {
const int a0 = (in[0 * 16] + in[2 * 16]); const int a0 = (in[0 * 16] + in[2 * 16]);
@ -469,22 +469,22 @@ static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
tmp[3 + i * 4] = a0 - a1; tmp[3 + i * 4] = a0 - a1;
} }
{ {
const __m128i src0 = _mm_loadl_epi64((__m128i*)&tmp[0]); const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
const __m128i src1 = _mm_loadl_epi64((__m128i*)&tmp[4]); const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
const __m128i src2 = _mm_loadl_epi64((__m128i*)&tmp[8]); const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
const __m128i src3 = _mm_loadl_epi64((__m128i*)&tmp[12]); const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
const __m128i a0 = _mm_add_epi16(src0, src2); const __m128i a0 = _mm_add_epi32(src0, src2);
const __m128i a1 = _mm_add_epi16(src1, src3); const __m128i a1 = _mm_add_epi32(src1, src3);
const __m128i a2 = _mm_sub_epi16(src1, src3); const __m128i a2 = _mm_sub_epi32(src1, src3);
const __m128i a3 = _mm_sub_epi16(src0, src2); const __m128i a3 = _mm_sub_epi32(src0, src2);
const __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(a0, a1), 1); const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
const __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(a3, a2), 1); const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
const __m128i b2 = _mm_srai_epi16(_mm_subs_epi16(a3, a2), 1); const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
const __m128i b3 = _mm_srai_epi16(_mm_subs_epi16(a0, a1), 1); const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
_mm_storel_epi64((__m128i*)&out[ 0], b0); const __m128i out0 = _mm_packs_epi32(b0, b1);
_mm_storel_epi64((__m128i*)&out[ 4], b1); const __m128i out1 = _mm_packs_epi32(b2, b3);
_mm_storel_epi64((__m128i*)&out[ 8], b2); _mm_storeu_si128((__m128i*)&out[0], out0);
_mm_storel_epi64((__m128i*)&out[12], b3); _mm_storeu_si128((__m128i*)&out[8], out1);
} }
} }