From bfff0bf3292f2643df65e5c99e8cec29746c9125 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Wed, 14 Sep 2016 16:15:43 +0200 Subject: [PATCH] speed-up SSIM calculation SSIM results are incompatible with previous version! We're now averaging the SSIM value for each pixels instead of printing a frame-level global SSIM value. * Got rid of some old code * switched to uint32_t for accumulation * refactoring SSIM calculation is ~4x faster now. Change-Id: I48d838e66aef5199b9b5cd5cddef6a98411f5673 --- src/dsp/dsp.h | 29 +++++----- src/dsp/enc.c | 82 ++++++++++++++++++++-------- src/enc/filter.c | 105 ++++-------------------------------- src/enc/picture_psnr.c | 120 ++++++++++++++++++++--------------------- src/enc/vp8enci.h | 8 --- 5 files changed, 143 insertions(+), 201 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index f57569d2..e8c8d383 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -254,26 +254,29 @@ void VP8EncDspCostInit(void); // struct for accumulating statistical moments typedef struct { - double w; // sum(w_i) : sum of weights - double xm, ym; // sum(w_i * x_i), sum(w_i * y_i) - double xxm, xym, yym; // sum(w_i * x_i * x_i), etc. + uint32_t w; // sum(w_i) : sum of weights + uint32_t xm, ym; // sum(w_i * x_i), sum(w_i * y_i) + uint32_t xxm, xym, yym; // sum(w_i * x_i * x_i), etc. } VP8DistoStats; +// Compute the final SSIM value +// The non-clipped version assumes stats->w = (2 * VP8_SSIM_KERNEL + 1)^2. +double VP8SSIMFromStats(const VP8DistoStats* const stats); +double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats); + #define VP8_SSIM_KERNEL 3 // total size of the kernel: 2 * VP8_SSIM_KERNEL + 1 -typedef void (*VP8SSIMAccumulateClippedFunc)(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2, - int xo, int yo, // center position - int W, int H, // plane dimension - VP8DistoStats* const stats); +typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int xo, int yo, // center position + int W, int H); // plane dimension // This version is called with the guarantee that you can load 8 bytes and // 8 rows at offset src1 and src2 -typedef void (*VP8SSIMAccumulateFunc)(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2, - VP8DistoStats* const stats); +typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2); -extern VP8SSIMAccumulateFunc VP8SSIMAccumulate; // unclipped / unchecked -extern VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; // with clipping +extern VP8SSIMGetFunc VP8SSIMGet; // unclipped / unchecked +extern VP8SSIMGetClippedFunc VP8SSIMGetClipped; // with clipping typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1, const uint8_t* src2, int len); diff --git a/src/dsp/enc.c b/src/dsp/enc.c index d94e617d..269f0602 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -693,10 +693,41 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) { //------------------------------------------------------------------------------ // SSIM / PSNR -static void SSIMAccumulateClipped(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2, - int xo, int yo, int W, int H, - VP8DistoStats* const stats) { +static const double kMinValue = 1.e-10; // minimal threshold + +static WEBP_INLINE double SSIMCalculation( + const VP8DistoStats* const stats, uint32_t N /*num samples*/) { + const double w2 = N * N; + const double C1 = 6.5025 * w2; + const double C2 = 58.5225 * w2; + const double xmxm = stats->xm * stats->xm; + const double ymym = stats->ym * stats->ym; + const double xmym = stats->xm * stats->ym; + const double sxy = stats->xym * N - xmym; + const double fnum = (2 * xmym + C1) * (2 * sxy + C2); + double fden; + double sxx = stats->xxm * N - xmxm; + double syy = stats->yym * N - ymym; + // small errors are possible, due to rounding. Clamp to zero. + if (sxx < 0.) sxx = 0.; + if (syy < 0.) syy = 0.; + fden = (xmxm + ymym + C1) * (sxx + syy + C2); + return (fden != 0.) ? fnum / fden : kMinValue; +} + +double VP8SSIMFromStats(const VP8DistoStats* const stats) { + const uint32_t w = (2 * VP8_SSIM_KERNEL + 1) * (2 * VP8_SSIM_KERNEL + 1); + return SSIMCalculation(stats, w); +} + +double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) { + return SSIMCalculation(stats, stats->w); +} + +static double SSIMGetClipped_C(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int xo, int yo, int W, int H) { + VP8DistoStats stats = { 0., 0., 0., 0., 0., 0. }; const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL; const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1 : yo + VP8_SSIM_KERNEL; @@ -710,34 +741,37 @@ static void SSIMAccumulateClipped(const uint8_t* src1, int stride1, for (x = xmin; x <= xmax; ++x) { const int s1 = src1[x]; const int s2 = src2[x]; - stats->w += 1; - stats->xm += s1; - stats->ym += s2; - stats->xxm += s1 * s1; - stats->xym += s1 * s2; - stats->yym += s2 * s2; + stats.xm += s1; + stats.ym += s2; + stats.xxm += s1 * s1; + stats.xym += s1 * s2; + stats.yym += s2 * s2; } } + stats.w = (ymax - ymin + 1) * (xmax - xmin + 1); + return VP8SSIMFromStatsClipped(&stats); } -static void SSIMAccumulate(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2, - VP8DistoStats* const stats) { +static double SSIMGet_C(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2) { + VP8DistoStats stats = { 0., 0., 0., 0., 0., 0. }; int x, y; for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) { for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) { const int s1 = src1[x]; const int s2 = src2[x]; - stats->w += 1; - stats->xm += s1; - stats->ym += s2; - stats->xxm += s1 * s1; - stats->xym += s1 * s2; - stats->yym += s2 * s2; + stats.xm += s1; + stats.ym += s2; + stats.xxm += s1 * s1; + stats.xym += s1 * s2; + stats.yym += s2 * s2; } } + return VP8SSIMFromStats(&stats); } +//------------------------------------------------------------------------------ + static uint32_t AccumulateSSE(const uint8_t* src1, const uint8_t* src2, int len) { int i; @@ -750,8 +784,10 @@ static uint32_t AccumulateSSE(const uint8_t* src1, return sse2; } -VP8SSIMAccumulateFunc VP8SSIMAccumulate; -VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; +//------------------------------------------------------------------------------ + +VP8SSIMGetFunc VP8SSIMGet; +VP8SSIMGetClippedFunc VP8SSIMGetClipped; VP8AccumulateSSEFunc VP8AccumulateSSE; extern void VP8SSIMDspInitSSE2(void); @@ -762,8 +798,8 @@ static volatile VP8CPUInfo ssim_last_cpuinfo_used = WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) { if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return; - VP8SSIMAccumulate = SSIMAccumulate; - VP8SSIMAccumulateClipped = SSIMAccumulateClipped; + VP8SSIMGetClipped = SSIMGetClipped_C; + VP8SSIMGet = SSIMGet_C; VP8AccumulateSSE = AccumulateSSE; if (VP8GetCPUInfo != NULL) { diff --git a/src/enc/filter.c b/src/enc/filter.c index e8ea8b4f..bf3e374f 100644 --- a/src/enc/filter.c +++ b/src/enc/filter.c @@ -105,115 +105,28 @@ static void DoFilter(const VP8EncIterator* const it, int level) { } //------------------------------------------------------------------------------ -// SSIM metric - -static const double kMinValue = 1.e-10; // minimal threshold - -void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst) { - dst->w += src->w; - dst->xm += src->xm; - dst->ym += src->ym; - dst->xxm += src->xxm; - dst->xym += src->xym; - dst->yym += src->yym; -} - -double VP8SSIMGet(const VP8DistoStats* const stats) { - const double xmxm = stats->xm * stats->xm; - const double ymym = stats->ym * stats->ym; - const double xmym = stats->xm * stats->ym; - const double w2 = stats->w * stats->w; - double sxx = stats->xxm * stats->w - xmxm; - double syy = stats->yym * stats->w - ymym; - double sxy = stats->xym * stats->w - xmym; - double C1, C2; - double fnum; - double fden; - // small errors are possible, due to rounding. Clamp to zero. - if (sxx < 0.) sxx = 0.; - if (syy < 0.) syy = 0.; - C1 = 6.5025 * w2; - C2 = 58.5225 * w2; - fnum = (2 * xmym + C1) * (2 * sxy + C2); - fden = (xmxm + ymym + C1) * (sxx + syy + C2); - return (fden != 0.) ? fnum / fden : kMinValue; -} - -double VP8SSIMGetSquaredError(const VP8DistoStats* const s) { - if (s->w > 0.) { - const double iw2 = 1. / (s->w * s->w); - const double sxx = s->xxm * s->w - s->xm * s->xm; - const double syy = s->yym * s->w - s->ym * s->ym; - const double sxy = s->xym * s->w - s->xm * s->ym; - const double SSE = iw2 * (sxx + syy - 2. * sxy); - if (SSE > kMinValue) return SSE; - } - return kMinValue; -} - -#define LIMIT(A, M) ((A) > (M) ? (M) : (A)) -static void VP8SSIMAccumulateRow(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2, - int y, int W, int H, - VP8DistoStats* const stats) { - int x = 0; - const int w0 = LIMIT(VP8_SSIM_KERNEL, W); - for (x = 0; x < w0; ++x) { - VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats); - } - for (; x <= W - 8 + VP8_SSIM_KERNEL; ++x) { - VP8SSIMAccumulate( - src1 + (y - VP8_SSIM_KERNEL) * stride1 + (x - VP8_SSIM_KERNEL), stride1, - src2 + (y - VP8_SSIM_KERNEL) * stride2 + (x - VP8_SSIM_KERNEL), stride2, - stats); - } - for (; x < W; ++x) { - VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats); - } -} - -void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2, - int W, int H, VP8DistoStats* const stats) { - int x, y; - const int h0 = LIMIT(VP8_SSIM_KERNEL, H); - const int h1 = LIMIT(VP8_SSIM_KERNEL, H - VP8_SSIM_KERNEL); - for (y = 0; y < h0; ++y) { - for (x = 0; x < W; ++x) { - VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats); - } - } - for (; y < h1; ++y) { - VP8SSIMAccumulateRow(src1, stride1, src2, stride2, y, W, H, stats); - } - for (; y < H; ++y) { - for (x = 0; x < W; ++x) { - VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats); - } - } -} -#undef LIMIT +// SSIM metric for one macroblock static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) { int x, y; - VP8DistoStats s = { .0, .0, .0, .0, .0, .0 }; + double sum = 0.; // compute SSIM in a 10 x 10 window for (y = VP8_SSIM_KERNEL; y < 16 - VP8_SSIM_KERNEL; y++) { for (x = VP8_SSIM_KERNEL; x < 16 - VP8_SSIM_KERNEL; x++) { - VP8SSIMAccumulateClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS, - x, y, 16, 16, &s); + sum += VP8SSIMGetClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS, + x, y, 16, 16); } } for (x = 1; x < 7; x++) { for (y = 1; y < 7; y++) { - VP8SSIMAccumulateClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS, - x, y, 8, 8, &s); - VP8SSIMAccumulateClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS, - x, y, 8, 8, &s); + sum += VP8SSIMGetClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS, + x, y, 8, 8); + sum += VP8SSIMGetClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS, + x, y, 8, 8); } } - return VP8SSIMGet(&s); + return sum; } //------------------------------------------------------------------------------ diff --git a/src/enc/picture_psnr.c b/src/enc/picture_psnr.c index 6c0d94f3..e3f1e4cb 100644 --- a/src/enc/picture_psnr.c +++ b/src/enc/picture_psnr.c @@ -17,6 +17,10 @@ #include "./vp8enci.h" #include "../utils/utils.h" +typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride, + const uint8_t* ref, int ref_stride, + int w, int h); + //------------------------------------------------------------------------------ // local-min distortion // @@ -67,6 +71,43 @@ static double AccumulateSSE(const uint8_t* src, int src_stride, return total_sse; } +//------------------------------------------------------------------------------ + +static double AccumulateSSIM(const uint8_t* src, int src_stride, + const uint8_t* ref, int ref_stride, + int w, int h) { + const int w0 = (w < VP8_SSIM_KERNEL) ? w : VP8_SSIM_KERNEL; + const int w1 = w - VP8_SSIM_KERNEL - 1; + const int h0 = (h < VP8_SSIM_KERNEL) ? h : VP8_SSIM_KERNEL; + const int h1 = h - VP8_SSIM_KERNEL - 1; + int x, y; + double sum = 0.; + for (y = 0; y < h0; ++y) { + for (x = 0; x < w; ++x) { + sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h); + } + } + for (; y < h1; ++y) { + for (x = 0; x < w0; ++x) { + sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h); + } + for (; x < w1; ++x) { + const int off1 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * src_stride; + const int off2 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * ref_stride; + sum += VP8SSIMGet(src + off1, src_stride, ref + off2, ref_stride); + } + for (; x < w; ++x) { + sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h); + } + } + for (; y < h; ++y) { + for (x = 0; x < w; ++x) { + sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h); + } + } + return sum; +} + //------------------------------------------------------------------------------ // Distortion @@ -77,6 +118,7 @@ static double GetPSNR(double v, double size) { return (v > 0. && size > 0.) ? -4.3429448 * log(v / (size * 255 * 255.)) : kMinDistortion_dB; } + static double GetLogSSIM(double v, double size) { v = (size > 0.) ? v / size : 1.; return (v < 1.) ? -10.0 * log10(1. - v) : kMinDistortion_dB; @@ -88,10 +130,10 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, double disto[4] = { 0. }; double sizes[4] = { 0. }; double total_size = 0., total_disto = 0.; - VP8DistoStats stats[5]; - + const AccumulateFunc metric = (type == 0) ? AccumulateSSE : + (type == 1) ? AccumulateSSIM : + AccumulateLSIM; VP8SSIMDspInit(); - memset(stats, 0, sizeof(stats)); if (src == NULL || ref == NULL || src->width != ref->width || src->height != ref->height || @@ -120,13 +162,7 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, } } sizes[c] = w * h; - if (type >= 2) { - disto[c] = AccumulateLSIM(tmp1, w, tmp2, w, w, h); - } else if (type == 0) { - disto[c] = AccumulateSSE(tmp1, w, tmp2, w, w, h); - } else { - VP8SSIMAccumulatePlane(tmp1, w, tmp2, w, w, h, &stats[c]); - } + disto[c] = metric(tmp1, w, tmp2, w, w, h); } WebPSafeFree(tmp_plane); } @@ -149,62 +185,24 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, sizes[1] = sizes[2] = uv_w * uv_h; sizes[3] = has_alpha ? w * h : 0.; - if (type >= 2) { - disto[0] = AccumulateLSIM(src->y, src->y_stride, ref->y, ref->y_stride, - w, h); - disto[1] = AccumulateLSIM(src->u, src->uv_stride, ref->u, ref->uv_stride, - uv_w, uv_h); - disto[2] = AccumulateLSIM(src->v, src->uv_stride, ref->v, ref->uv_stride, - uv_w, uv_h); - if (has_alpha) { - disto[3] = AccumulateLSIM(src->a, src->a_stride, ref->a, ref->a_stride, - w, h); - } - } else if (type == 0) { - disto[0] = AccumulateSSE(src->y, src->y_stride, ref->y, ref->y_stride, - w, h); - disto[1] = AccumulateSSE(src->u, src->uv_stride, ref->u, ref->uv_stride, - uv_w, uv_h); - disto[2] = AccumulateSSE(src->v, src->uv_stride, ref->v, ref->uv_stride, - uv_w, uv_h); - if (has_alpha) { - disto[3] = AccumulateSSE(src->a, src->a_stride, ref->a, ref->a_stride, - w, h); - } - } else { - VP8SSIMAccumulatePlane(src->y, src->y_stride, - ref->y, ref->y_stride, - w, h, &stats[0]); - VP8SSIMAccumulatePlane(src->u, src->uv_stride, - ref->u, ref->uv_stride, - uv_w, uv_h, &stats[1]); - VP8SSIMAccumulatePlane(src->v, src->uv_stride, - ref->v, ref->uv_stride, - uv_w, uv_h, &stats[2]); - if (has_alpha) { - VP8SSIMAccumulatePlane(src->a, src->a_stride, - ref->a, ref->a_stride, - w, h, &stats[3]); - } + disto[0] = metric(src->y, src->y_stride, ref->y, ref->y_stride, w, h); + disto[1] = metric(src->u, src->uv_stride, ref->u, ref->uv_stride, + uv_w, uv_h); + disto[2] = metric(src->v, src->uv_stride, ref->v, ref->uv_stride, + uv_w, uv_h); + if (has_alpha) { + disto[3] = metric(src->a, src->a_stride, ref->a, ref->a_stride, w, h); } } for (c = 0; c < 4; ++c) { - if (type == 1) { - results[c] = (float)GetLogSSIM(VP8SSIMGet(&stats[c]), 1.); - VP8SSIMAddStats(&stats[c], &stats[4]); - } else { - total_disto += disto[c]; - total_size += sizes[c]; - results[c] = (float)GetPSNR(disto[c], sizes[c]); - } + total_disto += disto[c]; + total_size += sizes[c]; + results[c] = (type == 1) ? (float)GetLogSSIM(disto[c], sizes[c]) + : (float)GetPSNR(disto[c], sizes[c]); } - if (type == 1) { - results[4] = (float)GetLogSSIM(VP8SSIMGet(&stats[4]), 1.); - } else { - results[4] = (float)GetPSNR(total_disto, total_size); - } - + results[4] = (type == 1) ? (float)GetLogSSIM(total_disto, total_size) + : (float)GetPSNR(total_disto, total_size); return 1; } diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h index d60a1358..ed83bd6d 100644 --- a/src/enc/vp8enci.h +++ b/src/enc/vp8enci.h @@ -474,14 +474,6 @@ int VP8EncStartAlpha(VP8Encoder* const enc); // start alpha coding process int VP8EncFinishAlpha(VP8Encoder* const enc); // finalize compressed data int VP8EncDeleteAlpha(VP8Encoder* const enc); // delete compressed data - // in filter.c -void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst); -void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2, - int W, int H, VP8DistoStats* const stats); -double VP8SSIMGet(const VP8DistoStats* const stats); -double VP8SSIMGetSquaredError(const VP8DistoStats* const stats); - // autofilter void VP8InitFilter(VP8EncIterator* const it); void VP8StoreFilterStats(VP8EncIterator* const it);