Simplify the texture evaluation Disto4x4()

We don't need to use the exact forward transform,
since it's only a rough evaluation.
-> Removed some shifts and rounding constants.

Change-Id: I3fdf8b4fe9720473894155e1ad0345f4d1fd9a33
This commit is contained in:
skal 2012-11-14 07:49:31 +01:00
parent a7305c2ef0
commit e5c3b3f554
2 changed files with 21 additions and 39 deletions

View File

@ -569,30 +569,30 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
int i; int i;
// horizontal pass // horizontal pass
for (i = 0; i < 4; ++i, in += BPS) { for (i = 0; i < 4; ++i, in += BPS) {
const int a0 = (in[0] + in[2]) << 2; const int a0 = in[0] + in[2];
const int a1 = (in[1] + in[3]) << 2; const int a1 = in[1] + in[3];
const int a2 = (in[1] - in[3]) << 2; const int a2 = in[1] - in[3];
const int a3 = (in[0] - in[2]) << 2; const int a3 = in[0] - in[2];
tmp[0 + i * 4] = a0 + a1 + (a0 != 0); tmp[0 + i * 4] = a0 + a1;
tmp[1 + i * 4] = a3 + a2; tmp[1 + i * 4] = a3 + a2;
tmp[2 + i * 4] = a3 - a2; tmp[2 + i * 4] = a3 - a2;
tmp[3 + i * 4] = a0 - a1; tmp[3 + i * 4] = a0 - a1;
} }
// vertical pass // vertical pass
for (i = 0; i < 4; ++i, ++w) { for (i = 0; i < 4; ++i, ++w) {
const int a0 = (tmp[0 + i] + tmp[8 + i]); const int a0 = tmp[0 + i] + tmp[8 + i];
const int a1 = (tmp[4 + i] + tmp[12+ i]); const int a1 = tmp[4 + i] + tmp[12+ i];
const int a2 = (tmp[4 + i] - tmp[12+ i]); const int a2 = tmp[4 + i] - tmp[12+ i];
const int a3 = (tmp[0 + i] - tmp[8 + i]); const int a3 = tmp[0 + i] - tmp[8 + i];
const int b0 = a0 + a1; const int b0 = a0 + a1;
const int b1 = a3 + a2; const int b1 = a3 + a2;
const int b2 = a3 - a2; const int b2 = a3 - a2;
const int b3 = a0 - a1; const int b3 = a0 - a1;
// abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
sum += w[ 0] * ((abs(b0) + 3) >> 3); sum += w[ 0] * abs(b0);
sum += w[ 4] * ((abs(b1) + 3) >> 3); sum += w[ 4] * abs(b1);
sum += w[ 8] * ((abs(b2) + 3) >> 3); sum += w[ 8] * abs(b2);
sum += w[12] * ((abs(b3) + 3) >> 3); sum += w[12] * abs(b3);
} }
return sum; return sum;
} }
@ -601,7 +601,7 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) { const uint16_t* const w) {
const int sum1 = TTransform(a, w); const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w); const int sum2 = TTransform(b, w);
return (abs(sum2 - sum1) + 8) >> 4; return abs(sum2 - sum1) >> 5;
} }
static int Disto16x16(const uint8_t* const a, const uint8_t* const b, static int Disto16x16(const uint8_t* const a, const uint8_t* const b,

View File

@ -502,8 +502,6 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
int32_t sum[4]; int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3; __m128i tmp_0, tmp_1, tmp_2, tmp_3;
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
const __m128i three = _mm_set1_epi16(3);
// Load, combine and transpose inputs. // Load, combine and transpose inputs.
{ {
@ -550,17 +548,14 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
// Horizontal pass and subsequent transpose. // Horizontal pass and subsequent transpose.
{ {
// Calculate a and b (two 4x4 at once). // Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2); const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
// b0_extra = (a0 != 0); const __m128i b0 = _mm_add_epi16(a0, a1);
const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
const __m128i b0_base = _mm_add_epi16(a0, a1);
const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b1 = _mm_add_epi16(a3, a2);
const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2);
const __m128i b3 = _mm_sub_epi16(a0, a1); const __m128i b3 = _mm_sub_epi16(a0, a1);
const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
// a00 a01 a02 a03 b00 b01 b02 b03 // a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13 // a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23 // a20 a21 a22 a23 b20 b21 b22 b23
@ -635,19 +630,6 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
B_b2 = _mm_sub_epi16(B_b2, sign_B_b2); B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
} }
// b = abs(b) + 3
A_b0 = _mm_add_epi16(A_b0, three);
A_b2 = _mm_add_epi16(A_b2, three);
B_b0 = _mm_add_epi16(B_b0, three);
B_b2 = _mm_add_epi16(B_b2, three);
// abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
// b = (abs(b) + 3) >> 3
A_b0 = _mm_srai_epi16(A_b0, 3);
A_b2 = _mm_srai_epi16(A_b2, 3);
B_b0 = _mm_srai_epi16(B_b0, 3);
B_b2 = _mm_srai_epi16(B_b2, 3);
// weighted sums // weighted sums
A_b0 = _mm_madd_epi16(A_b0, w_0); A_b0 = _mm_madd_epi16(A_b0, w_0);
A_b2 = _mm_madd_epi16(A_b2, w_8); A_b2 = _mm_madd_epi16(A_b2, w_8);
@ -666,7 +648,7 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b, static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) { const uint16_t* const w) {
const int diff_sum = TTransformSSE2(a, b, w); const int diff_sum = TTransformSSE2(a, b, w);
return (abs(diff_sum) + 8) >> 4; return abs(diff_sum) >> 5;
} }
static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b, static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,