From ba843a92e7eb5603b0e9bb633f137a23d04b25cd Mon Sep 17 00:00:00 2001
From: Pascal Massimino
Date: Tue, 4 Oct 2016 07:10:27 +0200
Subject: [PATCH] fix some SSIM calculations

* prevent 64-bit overflow by controlling the 32b -> 64b conversions
  and preventively descaling by 8 bits before the final multiply
* adjust the threshold constants C1 and C2 to de-emphasize the dark areas
* use a hat-like filter instead of box-filtering to avoid blockiness
  during averaging

The SSIM distortion calculation is actually *faster* now in SSE2, thanks to
the unrolling introduced during the function rewrite. The C version is
somewhat slower because it is still unoptimized.

Change-Id: I96e2715827f79d26faae354cc28c7406c6800c90
---
 extras/get_disto.c | 54 +++++++++++++++++----------------
 src/dsp/enc.c      | 75 +++++++++++++++++++++++++---------------------
 src/dsp/enc_sse2.c | 51 ++++++++++++++++++++-----------
 3 files changed, 103 insertions(+), 77 deletions(-)

diff --git a/extras/get_disto.c b/extras/get_disto.c
index 75ecd62c..172bb43f 100644
--- a/extras/get_disto.c
+++ b/extras/get_disto.c
@@ -94,26 +94,28 @@ typedef struct {
   uint32_t xxm, xym, yym;   // sum(w_i * x_i * x_i), etc.
 } DistoStats;
 
-static const double kMinValue = 1.e-10;  // minimal threshold
+// hat-shaped filter. Sum of coefficients is equal to 16.
+static const uint32_t kWeight[2 * SSIM_KERNEL + 1] = { 1, 2, 3, 4, 3, 2, 1 };
 
 static WEBP_INLINE double SSIMCalculation(const DistoStats* const stats) {
   const uint32_t N = stats->w;
-  const double w2 = N * N;
-  const double C1 = 6.5025 * w2;
-  const double C2 = 58.5225 * w2;
-  const double xmxm = stats->xm * stats->xm;
-  const double ymym = stats->ym * stats->ym;
-  const double xmym = stats->xm * stats->ym;
-  const double sxy = stats->xym * N - xmym;
-  const double fnum = (2 * xmym + C1) * (2 * sxy + C2);
-  double fden;
-  double sxx = stats->xxm * N - xmxm;
-  double syy = stats->yym * N - ymym;
-  // small errors are possible, due to rounding. Clamp to zero.
-  if (sxx < 0.) sxx = 0.;
-  if (syy < 0.) syy = 0.;
-  fden = (xmxm + ymym + C1) * (sxx + syy + C2);
-  return (fden != 0.) ? fnum / fden : kMinValue;
+  const uint32_t w2 = N * N;
+  const uint32_t C1 = 20 * w2;
+  const uint32_t C2 = 60 * w2;
+  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+  const int64_t xmym = (int64_t)stats->xm * stats->ym;
+  const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
+  const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+  const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+  // we descale by 8 to prevent overflow during the fnum/fden multiply.
+  const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+  const uint64_t den_S = (sxx + syy + C2) >> 8;
+  const uint64_t fnum = (2 * xmym + C1) * num_S;
+  const uint64_t fden = (xmxm + ymym + C1) * den_S;
+  const double r = (double)fnum / fden;
+  assert(r >= 0. && r <= 1.0);
+  return r;
 }
 
 static double SSIMGetClipped(const uint8_t* src1, int stride1,
@@ -129,16 +131,18 @@ static double SSIMGetClipped(const uint8_t* src1, int stride1,
   src2 += ymin * stride2;
   for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
     for (x = xmin; x <= xmax; ++x) {
-      const int s1 = src1[x];
-      const int s2 = src2[x];
-      stats.xm += s1;
-      stats.ym += s2;
-      stats.xxm += s1 * s1;
-      stats.xym += s1 * s2;
-      stats.yym += s2 * s2;
+      const uint32_t w = kWeight[SSIM_KERNEL + x - xo]
+                       * kWeight[SSIM_KERNEL + y - yo];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.w += w;
+      stats.xm += w * s1;
+      stats.ym += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
     }
   }
-  stats.w = (ymax - ymin + 1) * (xmax - xmin + 1);
   return SSIMCalculation(&stats);
 }
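Why descaling by 8 bits is enough: with 8-bit samples and a 7x7 hat window whose
weights sum to 16 * 16 = 256, xm and ym stay below 2^16, so xmxm + ymym + C1 and
sxx + syy + C2 can each approach roughly 2^33. Their direct product could exceed
2^64, but shifting one factor right by 8 bits keeps the final multiply safely
inside 64 bits. Below is a minimal standalone sketch of the new integer
arithmetic (not part of the patch; the Stats type, SSIM() and the sample values
are illustrative only):

```c
#include <stdint.h>
#include <stdio.h>

typedef struct {
  uint32_t w;              /* sum(w_i) */
  uint32_t xm, ym;         /* sum(w_i * x_i), sum(w_i * y_i) */
  uint32_t xxm, xym, yym;  /* sum(w_i * x_i * x_i), etc. */
} Stats;

/* Illustrative copy of the patched integer SSIM core. */
static double SSIM(const Stats* const s) {
  const uint32_t N = s->w;
  const uint32_t w2 = N * N;
  const uint32_t C1 = 20 * w2;
  const uint32_t C2 = 60 * w2;
  const uint64_t xmxm = (uint64_t)s->xm * s->xm;
  const uint64_t ymym = (uint64_t)s->ym * s->ym;
  const int64_t xmym = (int64_t)s->xm * s->ym;
  const int64_t sxy = (int64_t)s->xym * N - xmym;   /* can be negative */
  const uint64_t sxx = (uint64_t)s->xxm * N - xmxm;
  const uint64_t syy = (uint64_t)s->yym * N - ymym;
  /* descale by 8 bits so the 64-bit fnum/fden multiplies cannot overflow */
  const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
  const uint64_t den_S = (sxx + syy + C2) >> 8;
  const uint64_t fnum = (2 * xmym + C1) * num_S;
  const uint64_t fden = (xmxm + ymym + C1) * den_S;
  return (double)fnum / fden;   /* fden > 0 since C1, C2 > 0 */
}

int main(void) {
  /* a flat patch (every pixel 128) compared with itself: SSIM must be 1 */
  const Stats s = { 256, 256 * 128, 256 * 128,
                    256 * 128 * 128, 256 * 128 * 128, 256 * 128 * 128 };
  printf("SSIM = %f\n", SSIM(&s));   /* prints 1.000000 */
  return 0;
}
```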
diff --git a/src/dsp/enc.c b/src/dsp/enc.c
index 3013b87c..71cc5348 100644
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -693,31 +693,35 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) {
 //------------------------------------------------------------------------------
 // SSIM / PSNR
 
-static const double kMinValue = 1.e-10;  // minimal threshold
+// hat-shaped filter. Sum of coefficients is equal to 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+  1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16;   // sum{kWeight}^2
 
 static WEBP_INLINE double SSIMCalculation(
     const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
-  const double w2 = N * N;
-  const double C1 = 6.5025 * w2;
-  const double C2 = 58.5225 * w2;
-  const double xmxm = stats->xm * stats->xm;
-  const double ymym = stats->ym * stats->ym;
-  const double xmym = stats->xm * stats->ym;
-  const double sxy = stats->xym * N - xmym;
-  const double fnum = (2 * xmym + C1) * (2 * sxy + C2);
-  double fden;
-  double sxx = stats->xxm * N - xmxm;
-  double syy = stats->yym * N - ymym;
-  // small errors are possible, due to rounding. Clamp to zero.
-  if (sxx < 0.) sxx = 0.;
-  if (syy < 0.) syy = 0.;
-  fden = (xmxm + ymym + C1) * (sxx + syy + C2);
-  return (fden != 0.) ? fnum / fden : kMinValue;
+  const uint32_t w2 = N * N;
+  const uint32_t C1 = 20 * w2;
+  const uint32_t C2 = 60 * w2;
+  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+  const int64_t xmym = (int64_t)stats->xm * stats->ym;
+  const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
+  const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+  const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+  // we descale by 8 to prevent overflow during the fnum/fden multiply.
+  const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+  const uint64_t den_S = (sxx + syy + C2) >> 8;
+  const uint64_t fnum = (2 * xmym + C1) * num_S;
+  const uint64_t fden = (xmxm + ymym + C1) * den_S;
+  const double r = (double)fnum / fden;
+  assert(r >= 0. && r <= 1.0);
+  return r;
 }
 
 double VP8SSIMFromStats(const VP8DistoStats* const stats) {
-  const uint32_t w = (2 * VP8_SSIM_KERNEL + 1) * (2 * VP8_SSIM_KERNEL + 1);
-  return SSIMCalculation(stats, w);
+  return SSIMCalculation(stats, kWeightSum);
 }
 
 double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
@@ -739,16 +743,18 @@ static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
   src2 += ymin * stride2;
   for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
     for (x = xmin; x <= xmax; ++x) {
-      const int s1 = src1[x];
-      const int s2 = src2[x];
-      stats.xm += s1;
-      stats.ym += s2;
-      stats.xxm += s1 * s1;
-      stats.xym += s1 * s2;
-      stats.yym += s2 * s2;
+      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+                       * kWeight[VP8_SSIM_KERNEL + y - yo];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.w += w;
+      stats.xm += w * s1;
+      stats.ym += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
     }
   }
-  stats.w = (ymax - ymin + 1) * (xmax - xmin + 1);
   return VP8SSIMFromStatsClipped(&stats);
 }
 
@@ -758,13 +764,14 @@ static double SSIMGet_C(const uint8_t* src1, int stride1,
   int x, y;
   for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
     for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
-      const int s1 = src1[x];
-      const int s2 = src2[x];
-      stats.xm += s1;
-      stats.ym += s2;
-      stats.xxm += s1 * s1;
-      stats.xym += s1 * s2;
-      stats.yym += s2 * s2;
+      const uint32_t w = kWeight[x] * kWeight[y];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.xm += w * s1;
+      stats.ym += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
     }
   }
   return VP8SSIMFromStats(&stats);
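The 2-D window weight at (x, y) is the outer product kWeight[x] * kWeight[y] of
the 1-D hat kernel, so weights peak at the window center and taper to 1 at the
borders; adjacent windows therefore blend smoothly instead of box-averaging,
which is what removes the blockiness mentioned in the commit message. Since the
1-D kernel sums to 16, the 2-D weights sum to 16 * 16 = 256, the kWeightSum
that VP8SSIMFromStats now passes as the sample count N (with weighting, N must
be the sum of weights, not the pixel count). A small sketch (not part of the
patch) that prints the table and verifies the total:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  static const uint32_t kWeight[7] = { 1, 2, 3, 4, 3, 2, 1 };
  uint32_t sum = 0;
  int x, y;
  for (y = 0; y < 7; ++y) {
    for (x = 0; x < 7; ++x) {
      const uint32_t w = kWeight[x] * kWeight[y];   /* per-pixel weight */
      printf("%3u%c", (unsigned)w, (x == 6) ? '\n' : ' ');
      sum += w;
    }
  }
  printf("total = %u\n", (unsigned)sum);   /* 256 == kWeightSum */
  return 0;
}
```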
diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c
index b5fb83ee..59d83de9 100644
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -1422,30 +1422,45 @@ static uint32_t HorizontalAdd32b(const __m128i* const m) {
   return (uint32_t)_mm_cvtsi128_si32(c);
 }
 
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
+
+#define ACCUMULATE_ROW(WEIGHT) do {                          \
+  /* compute row weight (Wx * Wy) */                         \
+  const __m128i Wy = _mm_set1_epi16((WEIGHT));               \
+  const __m128i W = _mm_mullo_epi16(Wx, Wy);                 \
+  /* process 8 bytes at a time (7 bytes, actually) */        \
+  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1);  \
+  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2);  \
+  /* convert to 16b and multiply by weight */                \
+  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);            \
+  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);            \
+  const __m128i wa1 = _mm_mullo_epi16(a1, W);                \
+  const __m128i wb1 = _mm_mullo_epi16(b1, W);                \
+  /* accumulate */                                           \
+  xm = _mm_add_epi16(xm, wa1);                               \
+  ym = _mm_add_epi16(ym, wb1);                               \
+  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));         \
+  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));         \
+  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));         \
+  src1 += stride1;                                           \
+  src2 += stride2;                                           \
+} while (0)
+
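Before the rewritten SSIMGet_SSE2 below, here is a scalar model (not part of
the patch; the function name and signature are hypothetical) of what a single
ACCUMULATE_ROW(Wy) step accumulates. The SSE2 macro performs the same math
across 8 lanes at once; the eighth kWeight entry is 0, so the extra pixel
loaded by _mm_loadl_epi64 never contributes:

```c
#include <stdint.h>

/* Hypothetical scalar equivalent of one ACCUMULATE_ROW(Wy) step. */
void AccumulateRowScalar(const uint8_t* src1, const uint8_t* src2, uint32_t Wy,
                         uint32_t* xm, uint32_t* ym,
                         uint32_t* xxm, uint32_t* xym, uint32_t* yym) {
  static const uint32_t kWx[8] = { 1, 2, 3, 4, 3, 2, 1, 0 };
  int i;
  for (i = 0; i < 8; ++i) {
    const uint32_t w = kWx[i] * Wy;   /* per-pixel weight, as in W = Wx * Wy */
    const uint32_t s1 = src1[i];
    const uint32_t s2 = src2[i];
    *xm += w * s1;                    /* lanes of xm += wa1 */
    *ym += w * s2;                    /* lanes of ym += wb1 */
    *xxm += w * s1 * s1;              /* pairwise sums of madd(a1, wa1) */
    *xym += w * s1 * s2;              /* pairwise sums of madd(a1, wb1) */
    *yym += w * s2 * s2;              /* pairwise sums of madd(b1, wb1) */
  }
}
```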
 static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
                            const uint8_t* src2, int stride2) {
   VP8DistoStats stats;
-  int y;
   const __m128i zero = _mm_setzero_si128();
-  __m128i xm = zero, ym = zero;                // 16b accums
+  __m128i xm = zero, ym = zero;                 // 16b accums
   __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
+  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
   assert(2 * VP8_SSIM_KERNEL + 1 == 7);
-  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
-    // process 8 bytes at a time (7 bytes, actually)
-    const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1);
-    const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2);
-    // convert 8b -> 16b
-    const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
-    const __m128i b1 = _mm_unpacklo_epi8(b0, zero);
-    // zeroes the rightmost pixel we are not interested in:
-    const __m128i s1 = _mm_slli_si128(a1, 2);
-    const __m128i s2 = _mm_slli_si128(b1, 2);
-    xm = _mm_add_epi16(xm, s1);
-    ym = _mm_add_epi16(ym, s2);
-    xxm = _mm_add_epi32(xxm, _mm_madd_epi16(s1, s1));
-    xym = _mm_add_epi32(xym, _mm_madd_epi16(s1, s2));
-    yym = _mm_add_epi32(yym, _mm_madd_epi16(s2, s2));
-  }
+  ACCUMULATE_ROW(1);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(4);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(1);
   stats.xm = HorizontalAdd16b(&xm);
   stats.ym = HorizontalAdd16b(&ym);
   stats.xxm = HorizontalAdd32b(&xxm);