speed-up SSIM calculation

SSIM results are incompatible with previous version! We're now averaging the SSIM value for each pixels instead of printing a frame-level global SSIM value. * Got rid of some old code * switched to uint32_t for accumulation * refactoring SSIM calculation is ~4x faster now. Change-Id: I48d838e66aef5199b9b5cd5cddef6a98411f5673
2025-08-29 15:22:12 +02:00 · 2016-09-14 16:15:43 +02:00
parent 64577de8ae
commit bfff0bf329
5 changed files with 143 additions and 201 deletions
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -254,26 +254,29 @@ void VP8EncDspCostInit(void);

 // struct for accumulating statistical moments
 typedef struct {
-  double w;              // sum(w_i) : sum of weights
-  double xm, ym;         // sum(w_i * x_i), sum(w_i * y_i)
-  double xxm, xym, yym;  // sum(w_i * x_i * x_i), etc.
+  uint32_t w;              // sum(w_i) : sum of weights
+  uint32_t xm, ym;         // sum(w_i * x_i), sum(w_i * y_i)
+  uint32_t xxm, xym, yym;  // sum(w_i * x_i * x_i), etc.
 } VP8DistoStats;

+// Compute the final SSIM value
+// The non-clipped version assumes stats->w = (2 * VP8_SSIM_KERNEL + 1)^2.
+double VP8SSIMFromStats(const VP8DistoStats* const stats);
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats);
+
 #define VP8_SSIM_KERNEL 3   // total size of the kernel: 2 * VP8_SSIM_KERNEL + 1
-typedef void (*VP8SSIMAccumulateClippedFunc)(const uint8_t* src1, int stride1,
-                                             const uint8_t* src2, int stride2,
-                                             int xo, int yo,  // center position
-                                             int W, int H,    // plane dimension
-                                             VP8DistoStats* const stats);
+typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
+                                        const uint8_t* src2, int stride2,
+                                        int xo, int yo,  // center position
+                                        int W, int H);   // plane dimension

 // This version is called with the guarantee that you can load 8 bytes and
 // 8 rows at offset src1 and src2
-typedef void (*VP8SSIMAccumulateFunc)(const uint8_t* src1, int stride1,
-                                      const uint8_t* src2, int stride2,
-                                      VP8DistoStats* const stats);
+typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
+                                 const uint8_t* src2, int stride2);

-extern VP8SSIMAccumulateFunc VP8SSIMAccumulate;         // unclipped / unchecked
-extern VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped;   // with clipping
+extern VP8SSIMGetFunc VP8SSIMGet;         // unclipped / unchecked
+extern VP8SSIMGetClippedFunc VP8SSIMGetClipped;   // with clipping

 typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
                                         const uint8_t* src2, int len);
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -693,10 +693,41 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) {
 //------------------------------------------------------------------------------
 // SSIM / PSNR

-static void SSIMAccumulateClipped(const uint8_t* src1, int stride1,
-                                  const uint8_t* src2, int stride2,
-                                  int xo, int yo, int W, int H,
-                                  VP8DistoStats* const stats) {
+static const double kMinValue = 1.e-10;  // minimal threshold
+
+static WEBP_INLINE double SSIMCalculation(
+    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
+  const double w2 = N * N;
+  const double C1 = 6.5025 * w2;
+  const double C2 = 58.5225 * w2;
+  const double xmxm = stats->xm * stats->xm;
+  const double ymym = stats->ym * stats->ym;
+  const double xmym = stats->xm * stats->ym;
+  const double sxy = stats->xym * N - xmym;
+  const double fnum = (2 * xmym + C1) * (2 * sxy + C2);
+  double fden;
+  double sxx = stats->xxm * N - xmxm;
+  double syy = stats->yym * N - ymym;
+  // small errors are possible, due to rounding. Clamp to zero.
+  if (sxx < 0.) sxx = 0.;
+  if (syy < 0.) syy = 0.;
+  fden = (xmxm + ymym + C1) * (sxx + syy + C2);
+  return (fden != 0.) ? fnum / fden : kMinValue;
+}
+
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+  const uint32_t w = (2 * VP8_SSIM_KERNEL + 1) * (2 * VP8_SSIM_KERNEL + 1);
+  return SSIMCalculation(stats, w);
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, stats->w);
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+                               const uint8_t* src2, int stride2,
+                               int xo, int yo, int W, int H) {
+  VP8DistoStats stats = { 0., 0., 0., 0., 0., 0. };
  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
                                                  : yo + VP8_SSIM_KERNEL;
@@ -710,34 +741,37 @@ static void SSIMAccumulateClipped(const uint8_t* src1, int stride1,
    for (x = xmin; x <= xmax; ++x) {
      const int s1 = src1[x];
      const int s2 = src2[x];
-      stats->w   += 1;
-      stats->xm  += s1;
-      stats->ym  += s2;
-      stats->xxm += s1 * s1;
-      stats->xym += s1 * s2;
-      stats->yym += s2 * s2;
+      stats.xm  += s1;
+      stats.ym  += s2;
+      stats.xxm += s1 * s1;
+      stats.xym += s1 * s2;
+      stats.yym += s2 * s2;
    }
  }
+  stats.w = (ymax - ymin + 1) * (xmax - xmin + 1);
+  return VP8SSIMFromStatsClipped(&stats);
 }

-static void SSIMAccumulate(const uint8_t* src1, int stride1,
-                           const uint8_t* src2, int stride2,
-                           VP8DistoStats* const stats) {
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+                        const uint8_t* src2, int stride2) {
+  VP8DistoStats stats = { 0., 0., 0., 0., 0., 0. };
  int x, y;
  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
      const int s1 = src1[x];
      const int s2 = src2[x];
-      stats->w   += 1;
-      stats->xm  += s1;
-      stats->ym  += s2;
-      stats->xxm += s1 * s1;
-      stats->xym += s1 * s2;
-      stats->yym += s2 * s2;
+      stats.xm  += s1;
+      stats.ym  += s2;
+      stats.xxm += s1 * s1;
+      stats.xym += s1 * s2;
+      stats.yym += s2 * s2;
    }
  }
+  return VP8SSIMFromStats(&stats);
 }

+//------------------------------------------------------------------------------
+
 static uint32_t AccumulateSSE(const uint8_t* src1,
                              const uint8_t* src2, int len) {
  int i;
@@ -750,8 +784,10 @@ static uint32_t AccumulateSSE(const uint8_t* src1,
  return sse2;
 }

-VP8SSIMAccumulateFunc VP8SSIMAccumulate;
-VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped;
+//------------------------------------------------------------------------------
+
+VP8SSIMGetFunc VP8SSIMGet;
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;
 VP8AccumulateSSEFunc VP8AccumulateSSE;

 extern void VP8SSIMDspInitSSE2(void);
@@ -762,8 +798,8 @@ static volatile VP8CPUInfo ssim_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;

-  VP8SSIMAccumulate = SSIMAccumulate;
-  VP8SSIMAccumulateClipped = SSIMAccumulateClipped;
+  VP8SSIMGetClipped = SSIMGetClipped_C;
+  VP8SSIMGet = SSIMGet_C;

  VP8AccumulateSSE = AccumulateSSE;
  if (VP8GetCPUInfo != NULL) {
--- a/src/enc/filter.c
+++ b/src/enc/filter.c
@@ -105,115 +105,28 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
 }

 //------------------------------------------------------------------------------
-// SSIM metric
-
-static const double kMinValue = 1.e-10;  // minimal threshold
-
-void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst) {
-  dst->w   += src->w;
-  dst->xm  += src->xm;
-  dst->ym  += src->ym;
-  dst->xxm += src->xxm;
-  dst->xym += src->xym;
-  dst->yym += src->yym;
-}
-
-double VP8SSIMGet(const VP8DistoStats* const stats) {
-  const double xmxm = stats->xm * stats->xm;
-  const double ymym = stats->ym * stats->ym;
-  const double xmym = stats->xm * stats->ym;
-  const double w2 = stats->w * stats->w;
-  double sxx = stats->xxm * stats->w - xmxm;
-  double syy = stats->yym * stats->w - ymym;
-  double sxy = stats->xym * stats->w - xmym;
-  double C1, C2;
-  double fnum;
-  double fden;
-  // small errors are possible, due to rounding. Clamp to zero.
-  if (sxx < 0.) sxx = 0.;
-  if (syy < 0.) syy = 0.;
-  C1 = 6.5025 * w2;
-  C2 = 58.5225 * w2;
-  fnum = (2 * xmym + C1) * (2 * sxy + C2);
-  fden = (xmxm + ymym + C1) * (sxx + syy + C2);
-  return (fden != 0.) ? fnum / fden : kMinValue;
-}
-
-double VP8SSIMGetSquaredError(const VP8DistoStats* const s) {
-  if (s->w > 0.) {
-    const double iw2 = 1. / (s->w * s->w);
-    const double sxx = s->xxm * s->w - s->xm * s->xm;
-    const double syy = s->yym * s->w - s->ym * s->ym;
-    const double sxy = s->xym * s->w - s->xm * s->ym;
-    const double SSE = iw2 * (sxx + syy - 2. * sxy);
-    if (SSE > kMinValue) return SSE;
-  }
-  return kMinValue;
-}
-
-#define LIMIT(A, M)  ((A) > (M) ? (M) : (A))
-static void VP8SSIMAccumulateRow(const uint8_t* src1, int stride1,
-                                 const uint8_t* src2, int stride2,
-                                 int y, int W, int H,
-                                 VP8DistoStats* const stats) {
-  int x = 0;
-  const int w0 = LIMIT(VP8_SSIM_KERNEL, W);
-  for (x = 0; x < w0; ++x) {
-    VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-  }
-  for (; x <= W - 8 + VP8_SSIM_KERNEL; ++x) {
-    VP8SSIMAccumulate(
-        src1 + (y - VP8_SSIM_KERNEL) * stride1 + (x - VP8_SSIM_KERNEL), stride1,
-        src2 + (y - VP8_SSIM_KERNEL) * stride2 + (x - VP8_SSIM_KERNEL), stride2,
-        stats);
-  }
-  for (; x < W; ++x) {
-    VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-  }
-}
-
-void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
-                            const uint8_t* src2, int stride2,
-                            int W, int H, VP8DistoStats* const stats) {
-  int x, y;
-  const int h0 = LIMIT(VP8_SSIM_KERNEL, H);
-  const int h1 = LIMIT(VP8_SSIM_KERNEL, H - VP8_SSIM_KERNEL);
-  for (y = 0; y < h0; ++y) {
-    for (x = 0; x < W; ++x) {
-      VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-    }
-  }
-  for (; y < h1; ++y) {
-    VP8SSIMAccumulateRow(src1, stride1, src2, stride2, y, W, H, stats);
-  }
-  for (; y < H; ++y) {
-    for (x = 0; x < W; ++x) {
-      VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-    }
-  }
-}
-#undef LIMIT
+// SSIM metric for one macroblock

 static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
  int x, y;
-  VP8DistoStats s = { .0, .0, .0, .0, .0, .0 };
+  double sum = 0.;

  // compute SSIM in a 10 x 10 window
  for (y = VP8_SSIM_KERNEL; y < 16 - VP8_SSIM_KERNEL; y++) {
    for (x = VP8_SSIM_KERNEL; x < 16 - VP8_SSIM_KERNEL; x++) {
-      VP8SSIMAccumulateClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
-                               x, y, 16, 16, &s);
+      sum += VP8SSIMGetClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
+                               x, y, 16, 16);
    }
  }
  for (x = 1; x < 7; x++) {
    for (y = 1; y < 7; y++) {
-      VP8SSIMAccumulateClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
-                               x, y, 8, 8, &s);
-      VP8SSIMAccumulateClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
-                               x, y, 8, 8, &s);
+      sum += VP8SSIMGetClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
+                               x, y, 8, 8);
+      sum += VP8SSIMGetClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
+                               x, y, 8, 8);
    }
  }
-  return VP8SSIMGet(&s);
+  return sum;
 }

 //------------------------------------------------------------------------------
--- a/src/enc/picture_psnr.c
+++ b/src/enc/picture_psnr.c
@@ -17,6 +17,10 @@
 #include "./vp8enci.h"
 #include "../utils/utils.h"

+typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride,
+                                 const uint8_t* ref, int ref_stride,
+                                 int w, int h);
+
 //------------------------------------------------------------------------------
 // local-min distortion
 //
@@ -67,6 +71,43 @@ static double AccumulateSSE(const uint8_t* src, int src_stride,
  return total_sse;
 }

+//------------------------------------------------------------------------------
+
+static double AccumulateSSIM(const uint8_t* src, int src_stride,
+                             const uint8_t* ref, int ref_stride,
+                             int w, int h) {
+  const int w0 = (w < VP8_SSIM_KERNEL) ? w : VP8_SSIM_KERNEL;
+  const int w1 = w - VP8_SSIM_KERNEL - 1;
+  const int h0 = (h < VP8_SSIM_KERNEL) ? h : VP8_SSIM_KERNEL;
+  const int h1 = h - VP8_SSIM_KERNEL - 1;
+  int x, y;
+  double sum = 0.;
+  for (y = 0; y < h0; ++y) {
+    for (x = 0; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  for (; y < h1; ++y) {
+    for (x = 0; x < w0; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+    for (; x < w1; ++x) {
+      const int off1 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * src_stride;
+      const int off2 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * ref_stride;
+      sum += VP8SSIMGet(src + off1, src_stride, ref + off2, ref_stride);
+    }
+    for (; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  for (; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  return sum;
+}
+
 //------------------------------------------------------------------------------
 // Distortion

@@ -77,6 +118,7 @@ static double GetPSNR(double v, double size) {
  return (v > 0. && size > 0.) ? -4.3429448 * log(v / (size * 255 * 255.))
                               : kMinDistortion_dB;
 }
+
 static double GetLogSSIM(double v, double size) {
  v = (size > 0.) ? v / size : 1.;
  return (v < 1.) ? -10.0 * log10(1. - v) : kMinDistortion_dB;
@@ -88,10 +130,10 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
  double disto[4] = { 0. };
  double sizes[4] = { 0. };
  double total_size = 0., total_disto = 0.;
-  VP8DistoStats stats[5];
-
+  const AccumulateFunc metric = (type == 0) ? AccumulateSSE :
+                                (type == 1) ? AccumulateSSIM :
+                                              AccumulateLSIM;
  VP8SSIMDspInit();
-  memset(stats, 0, sizeof(stats));

  if (src == NULL || ref == NULL ||
      src->width != ref->width || src->height != ref->height ||
@@ -120,13 +162,7 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
          }
        }
        sizes[c] = w * h;
-        if (type >= 2) {
-          disto[c] = AccumulateLSIM(tmp1, w, tmp2, w, w, h);
-        } else if (type == 0) {
-          disto[c] = AccumulateSSE(tmp1, w, tmp2, w, w, h);
-        } else {
-          VP8SSIMAccumulatePlane(tmp1, w, tmp2, w, w, h, &stats[c]);
-        }
+        disto[c] = metric(tmp1, w, tmp2, w, w, h);
      }
      WebPSafeFree(tmp_plane);
    }
@@ -149,62 +185,24 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
    sizes[1] = sizes[2] = uv_w * uv_h;
    sizes[3] = has_alpha ? w * h : 0.;

-    if (type >= 2) {
-      disto[0] = AccumulateLSIM(src->y, src->y_stride, ref->y, ref->y_stride,
-                                w, h);
-      disto[1] = AccumulateLSIM(src->u, src->uv_stride, ref->u, ref->uv_stride,
-                                uv_w, uv_h);
-      disto[2] = AccumulateLSIM(src->v, src->uv_stride, ref->v, ref->uv_stride,
-                                uv_w, uv_h);
-      if (has_alpha) {
-        disto[3] = AccumulateLSIM(src->a, src->a_stride, ref->a, ref->a_stride,
-                                  w, h);
-      }
-    } else if (type == 0) {
-      disto[0] = AccumulateSSE(src->y, src->y_stride, ref->y, ref->y_stride,
-                               w, h);
-      disto[1] = AccumulateSSE(src->u, src->uv_stride, ref->u, ref->uv_stride,
-                               uv_w, uv_h);
-      disto[2] = AccumulateSSE(src->v, src->uv_stride, ref->v, ref->uv_stride,
-                               uv_w, uv_h);
-      if (has_alpha) {
-        disto[3] = AccumulateSSE(src->a, src->a_stride, ref->a, ref->a_stride,
-                                 w, h);
-      }
-    } else {
-      VP8SSIMAccumulatePlane(src->y, src->y_stride,
-                             ref->y, ref->y_stride,
-                             w, h, &stats[0]);
-      VP8SSIMAccumulatePlane(src->u, src->uv_stride,
-                             ref->u, ref->uv_stride,
-                             uv_w, uv_h, &stats[1]);
-      VP8SSIMAccumulatePlane(src->v, src->uv_stride,
-                             ref->v, ref->uv_stride,
-                             uv_w, uv_h, &stats[2]);
-      if (has_alpha) {
-        VP8SSIMAccumulatePlane(src->a, src->a_stride,
-                               ref->a, ref->a_stride,
-                               w, h, &stats[3]);
-      }
+    disto[0] = metric(src->y, src->y_stride, ref->y, ref->y_stride, w, h);
+    disto[1] = metric(src->u, src->uv_stride, ref->u, ref->uv_stride,
+                      uv_w, uv_h);
+    disto[2] = metric(src->v, src->uv_stride, ref->v, ref->uv_stride,
+                      uv_w, uv_h);
+    if (has_alpha) {
+      disto[3] = metric(src->a, src->a_stride, ref->a, ref->a_stride, w, h);
    }
  }

  for (c = 0; c < 4; ++c) {
-    if (type == 1) {
-      results[c] = (float)GetLogSSIM(VP8SSIMGet(&stats[c]), 1.);
-      VP8SSIMAddStats(&stats[c], &stats[4]);
-    } else {
-      total_disto += disto[c];
-      total_size += sizes[c];
-      results[c] = (float)GetPSNR(disto[c], sizes[c]);
-    }
+    total_disto += disto[c];
+    total_size += sizes[c];
+    results[c] = (type == 1) ? (float)GetLogSSIM(disto[c], sizes[c])
+                             : (float)GetPSNR(disto[c], sizes[c]);
  }
-  if (type == 1) {
-    results[4] = (float)GetLogSSIM(VP8SSIMGet(&stats[4]), 1.);
-  } else {
-    results[4] = (float)GetPSNR(total_disto, total_size);
-  }
-
+  results[4] = (type == 1) ? (float)GetLogSSIM(total_disto, total_size)
+                           : (float)GetPSNR(total_disto, total_size);
  return 1;
 }

--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -474,14 +474,6 @@ int VP8EncStartAlpha(VP8Encoder* const enc);    // start alpha coding process
 int VP8EncFinishAlpha(VP8Encoder* const enc);   // finalize compressed data
 int VP8EncDeleteAlpha(VP8Encoder* const enc);   // delete compressed data

-  // in filter.c
-void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst);
-void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
-                            const uint8_t* src2, int stride2,
-                            int W, int H, VP8DistoStats* const stats);
-double VP8SSIMGet(const VP8DistoStats* const stats);
-double VP8SSIMGetSquaredError(const VP8DistoStats* const stats);
-
 // autofilter
 void VP8InitFilter(VP8EncIterator* const it);
 void VP8StoreFilterStats(VP8EncIterator* const it);