From e7ff3f9af6bff4b8b1885aa75204b67dbbfe496f Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Thu, 21 Apr 2011 13:32:45 -0700 Subject: [PATCH] merge two ITransforms together when applicable and change the TTransform to return the sum directly. output is bitwise the same, speed up 1-2%. This is preparatory to a more efficient SSE2 implementation. Change-Id: I0bcdf05808c93420fbe9dcb75e5e7e55a4ae5b89 --- src/enc/dsp.c | 40 ++++++++++++++++++++++++---------------- src/enc/quant.c | 14 +++++++------- src/enc/vp8enci.h | 5 ++++- 3 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/enc/dsp.c b/src/enc/dsp.c index 45f977c9..aca199e0 100644 --- a/src/enc/dsp.c +++ b/src/enc/dsp.c @@ -49,7 +49,8 @@ static const int kC1 = 20091 + (1 << 16); static const int kC2 = 35468; #define MUL(a, b) (((a) * (b)) >> 16) -static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst) { +static inline void ITransformOne(const uint8_t* ref, const int16_t* in, + uint8_t* dst) { int C[4 * 4], *tmp; int i; tmp = C; @@ -80,6 +81,13 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst) { tmp++; } } +static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { + ITransformOne(ref, in, dst); + if (do_two) { + ITransformOne(ref + 4, in + 16, dst + 4); + } +} static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { int i; @@ -526,9 +534,12 @@ VP8Metric VP8SSE4x4 = SSE4x4; // reconstructed samples. // Hadamard transform -static void TTransform(const uint8_t* in, int16_t* out) { +// Returns the weighted sum of the absolute value of transformed coefficients. 
+static int TTransform(const uint8_t* in, const uint16_t* w) { + int sum = 0; int tmp[16]; int i; + // horizontal pass for (i = 0; i < 4; ++i, in += BPS) { const int a0 = (in[0] + in[2]) << 2; const int a1 = (in[1] + in[3]) << 2; @@ -539,7 +550,8 @@ static void TTransform(const uint8_t* in, int16_t* out) { tmp[2 + i * 4] = a3 - a2; tmp[3 + i * 4] = a0 - a1; } - for (i = 0; i < 4; ++i) { + // vertical pass + for (i = 0; i < 4; ++i, ++w) { const int a0 = (tmp[0 + i] + tmp[8 + i]); const int a1 = (tmp[4 + i] + tmp[12+ i]); const int a2 = (tmp[4 + i] - tmp[12+ i]); @@ -548,24 +560,20 @@ static void TTransform(const uint8_t* in, int16_t* out) { const int b1 = a3 + a2; const int b2 = a3 - a2; const int b3 = a0 - a1; - out[ 0 + i] = (b0 + (b0 < 0) + 3) >> 3; - out[ 4 + i] = (b1 + (b1 < 0) + 3) >> 3; - out[ 8 + i] = (b2 + (b2 < 0) + 3) >> 3; - out[12 + i] = (b3 + (b3 < 0) + 3) >> 3; + // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 + sum += w[ 0] * ((abs(b0) + 3) >> 3); + sum += w[ 4] * ((abs(b1) + 3) >> 3); + sum += w[ 8] * ((abs(b2) + 3) >> 3); + sum += w[12] * ((abs(b3) + 3) >> 3); } + return sum; } static int Disto4x4(const uint8_t* const a, const uint8_t* const b, const uint16_t* const w) { - int16_t tmp1[16], tmp2[16]; - int k; - int D; - TTransform(a, tmp1); - TTransform(b, tmp2); - D = 0; - for (k = 0; k < 16; ++k) - D += w[k] * (abs(tmp2[k]) - abs(tmp1[k])); - return (abs(D) + 8) >> 4; + const int sum1 = TTransform(a, w); + const int sum2 = TTransform(b, w); + return (abs(sum2 - sum1) + 8) >> 4; } static int Disto16x16(const uint8_t* const a, const uint8_t* const b, diff --git a/src/enc/quant.c b/src/enc/quant.c index e4399194..31ec8144 100644 --- a/src/enc/quant.c +++ b/src/enc/quant.c @@ -615,8 +615,8 @@ static int ReconstructIntra16(VP8EncIterator* const it, // Transform back VP8ITransformWHT(dc_tmp, tmp[0]); - for (n = 0; n < 16; ++n) { - VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n]); + for (n = 0; n < 16; n += 2) { + VP8ITransform(ref + 
VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1); } return nz; @@ -642,7 +642,7 @@ static int ReconstructIntra4(VP8EncIterator* const it, } else { nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_); } - VP8ITransform(ref, tmp, yuv_out); + VP8ITransform(ref, tmp, yuv_out, 0); return nz; } @@ -666,8 +666,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, for (x = 0; x < 2; ++x, ++n) { const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y]; const int non_zero = - TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2, &dqm->uv_, - dqm->lambda_trellis_uv_); + TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2, + &dqm->uv_, dqm->lambda_trellis_uv_); it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero; nz |= non_zero << n; } @@ -679,8 +679,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, } } - for (n = 0; n < 8; ++n) { - VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n]); + for (n = 0; n < 8; n += 2) { + VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1); } return (nz << 16); } diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h index b19450d3..9ee6fc4e 100644 --- a/src/enc/vp8enci.h +++ b/src/enc/vp8enci.h @@ -416,7 +416,10 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt); // in dsp.c // Transforms -typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst); +// VP8Idct: Does one or two inverse transforms. If do_two is set, the transforms +// will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4). +typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two); typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out); typedef void (*VP8WHT)(const int16_t* in, int16_t* out); extern VP8Idct VP8ITransform;