From 50c3d7da9afd1c419f23f6b5a1dc0237b110c041 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Tue, 13 Sep 2016 17:02:25 +0200 Subject: [PATCH] refactor the PSNR / SSIM calculation code -print_psnr is now much faster because it doesn't use the SSIM code. The SSIM speed-up and re-write will come later. Change-Id: Iabf565e0a8b41651d8164df1266cfeded4ab4823 --- src/dsp/dsp.h | 6 ++- src/dsp/enc.c | 25 +++++++++ src/dsp/enc_sse2.c | 48 +++++++++++++++++ src/enc/picture_psnr.c | 116 ++++++++++++++++++++++++++--------------- 4 files changed, 153 insertions(+), 42 deletions(-) diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index ac0ccd78..f57569d2 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -250,7 +250,7 @@ extern VP8GetResidualCostFunc VP8GetResidualCost; void VP8EncDspCostInit(void); //------------------------------------------------------------------------------ -// SSIM utils +// SSIM / PSNR utils // struct for accumulating statistical moments typedef struct { @@ -275,6 +275,10 @@ typedef void (*VP8SSIMAccumulateFunc)(const uint8_t* src1, int stride1, extern VP8SSIMAccumulateFunc VP8SSIMAccumulate; // unclipped / unchecked extern VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; // with clipping +typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1, + const uint8_t* src2, int len); +extern VP8AccumulateSSEFunc VP8AccumulateSSE; + // must be called before using any of the above directly void VP8SSIMDspInit(void); diff --git a/src/dsp/enc.c b/src/dsp/enc.c index 110ef189..d94e617d 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -691,6 +691,7 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) { } //------------------------------------------------------------------------------ +// SSIM / PSNR static void SSIMAccumulateClipped(const uint8_t* src1, int stride1, const uint8_t* src2, int stride2, @@ -737,8 +738,23 @@ static void SSIMAccumulate(const uint8_t* src1, int stride1, } } +static uint32_t AccumulateSSE(const uint8_t* src1, + const uint8_t* src2, int len) { + int i; + uint32_t sse2 = 0; + assert(len <= 65535); // to ensure that accumulation fits within uint32_t + for (i = 0; i < len; ++i) { + const int32_t diff = src1[i] - src2[i]; + sse2 += diff * diff; + } + return sse2; +} + VP8SSIMAccumulateFunc VP8SSIMAccumulate; VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; +VP8AccumulateSSEFunc VP8AccumulateSSE; + +extern void VP8SSIMDspInitSSE2(void); static volatile VP8CPUInfo ssim_last_cpuinfo_used = (VP8CPUInfo)&ssim_last_cpuinfo_used; @@ -749,6 +765,15 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) { VP8SSIMAccumulate = SSIMAccumulate; VP8SSIMAccumulateClipped = SSIMAccumulateClipped; + VP8AccumulateSSE = AccumulateSSE; + if (VP8GetCPUInfo != NULL) { +#if defined(WEBP_USE_SSE2) + if (VP8GetCPUInfo(kSSE2)) { + VP8SSIMDspInitSSE2(); + } +#endif + } + ssim_last_cpuinfo_used = VP8GetCPUInfo; } diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index 0d3cb2de..35ffe88a 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -1365,8 +1365,56 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) { VP8Mean16x4 = Mean16x4; } +//------------------------------------------------------------------------------ +// SSIM / PSNR entry point (TODO(skal): move to its own file later) + +static uint32_t AccumulateSSE_SSE2(const uint8_t* src1, + const uint8_t* src2, int len) { + int i = 0; + uint32_t sse2 = 0; + if (len >= 16) { + const int limit = len - 32; + int32_t tmp[4]; + __m128i sum1; + __m128i sum = _mm_setzero_si128(); + __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]); + __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]); + i += 16; + while (i <= limit) { + const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]); + const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]); + __m128i sum2; + i += 16; + SubtractAndAccumulate(a0, b0, &sum1); + sum = _mm_add_epi32(sum, sum1); + a0 = _mm_loadu_si128((const __m128i*)&src1[i]); + b0 = _mm_loadu_si128((const __m128i*)&src2[i]); + i += 16; + SubtractAndAccumulate(a1, b1, &sum2); + sum = _mm_add_epi32(sum, sum2); + } + SubtractAndAccumulate(a0, b0, &sum1); + sum = _mm_add_epi32(sum, sum1); + _mm_storeu_si128((__m128i*)tmp, sum); + sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]); + } + + for (; i < len; ++i) { + const int32_t diff = src1[i] - src2[i]; + sse2 += diff * diff; + } + return sse2; +} + +extern void VP8SSIMDspInitSSE2(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) { + VP8AccumulateSSE = AccumulateSSE_SSE2; +} + #else // !WEBP_USE_SSE2 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2) +WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2) #endif // WEBP_USE_SSE2 diff --git a/src/enc/picture_psnr.c b/src/enc/picture_psnr.c index 329757de..6c0d94f3 100644 --- a/src/enc/picture_psnr.c +++ b/src/enc/picture_psnr.c @@ -25,9 +25,9 @@ #define RADIUS 2 // search radius. Shouldn't be too large. -static void AccumulateLSIM(const uint8_t* src, int src_stride, - const uint8_t* ref, int ref_stride, - int w, int h, VP8DistoStats* stats) { +static double AccumulateLSIM(const uint8_t* src, int src_stride, + const uint8_t* ref, int ref_stride, + int w, int h) { int x, y; double total_sse = 0.; for (y = 0; y < h; ++y) { @@ -50,37 +50,52 @@ static void AccumulateLSIM(const uint8_t* src, int src_stride, total_sse += best_sse; } } - stats->w = w * h; - stats->xm = 0; - stats->ym = 0; - stats->xxm = total_sse; - stats->yym = 0; - stats->xxm = 0; + return total_sse; } #undef RADIUS +static double AccumulateSSE(const uint8_t* src, int src_stride, + const uint8_t* ref, int ref_stride, + int w, int h) { + int y; + double total_sse = 0.; + for (y = 0; y < h; ++y) { + total_sse += VP8AccumulateSSE(src, ref, w); + src += src_stride; + ref += ref_stride; + } + return total_sse; +} + //------------------------------------------------------------------------------ // Distortion // Max value returned in case of exact similarity. static const double kMinDistortion_dB = 99.; -static float GetPSNR(const double v) { - return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.)) - : kMinDistortion_dB); + +static double GetPSNR(double v, double size) { + return (v > 0. && size > 0.) ? -4.3429448 * log(v / (size * 255 * 255.)) + : kMinDistortion_dB; +} +static double GetLogSSIM(double v, double size) { + v = (size > 0.) ? v / size : 1.; + return (v < 1.) ? -10.0 * log10(1. - v) : kMinDistortion_dB; } int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, - int type, float result[5]) { + int type, float results[5]) { + int w, h, c; + double disto[4] = { 0. }; + double sizes[4] = { 0. }; + double total_size = 0., total_disto = 0.; VP8DistoStats stats[5]; - int w, h; - - memset(stats, 0, sizeof(stats)); VP8SSIMDspInit(); + memset(stats, 0, sizeof(stats)); if (src == NULL || ref == NULL || src->width != ref->width || src->height != ref->height || - src->use_argb != ref->use_argb || result == NULL) { + src->use_argb != ref->use_argb || results == NULL) { return 0; } w = src->width; @@ -90,7 +105,7 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, if (src->argb == NULL || ref->argb == NULL) { return 0; } else { - int i, j, c; + int i, j; uint8_t* tmp1, *tmp2; uint8_t* const tmp_plane = (uint8_t*)WebPSafeMalloc(2ULL * w * h, sizeof(*tmp_plane)); @@ -104,8 +119,11 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, tmp2[j * w + i] = ref->argb[i + j * ref->argb_stride] >> (c * 8); } } + sizes[c] = w * h; if (type >= 2) { - AccumulateLSIM(tmp1, w, tmp2, w, w, h, &stats[c]); + disto[c] = AccumulateLSIM(tmp1, w, tmp2, w, w, h); + } else if (type == 0) { + disto[c] = AccumulateSSE(tmp1, w, tmp2, w, w, h); } else { VP8SSIMAccumulatePlane(tmp1, w, tmp2, w, w, h, &stats[c]); } @@ -127,16 +145,31 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, uv_w = (src->width + 1) >> 1; uv_h = (src->height + 1) >> 1; + sizes[0] = w * h; + sizes[1] = sizes[2] = uv_w * uv_h; + sizes[3] = has_alpha ? w * h : 0.; + if (type >= 2) { - AccumulateLSIM(src->y, src->y_stride, ref->y, ref->y_stride, - w, h, &stats[0]); - AccumulateLSIM(src->u, src->uv_stride, ref->u, ref->uv_stride, - uv_w, uv_h, &stats[1]); - AccumulateLSIM(src->v, src->uv_stride, ref->v, ref->uv_stride, - uv_w, uv_h, &stats[2]); + disto[0] = AccumulateLSIM(src->y, src->y_stride, ref->y, ref->y_stride, + w, h); + disto[1] = AccumulateLSIM(src->u, src->uv_stride, ref->u, ref->uv_stride, + uv_w, uv_h); + disto[2] = AccumulateLSIM(src->v, src->uv_stride, ref->v, ref->uv_stride, + uv_w, uv_h); if (has_alpha) { - AccumulateLSIM(src->a, src->a_stride, ref->a, ref->a_stride, - w, h, &stats[3]); + disto[3] = AccumulateLSIM(src->a, src->a_stride, ref->a, ref->a_stride, + w, h); + } + } else if (type == 0) { + disto[0] = AccumulateSSE(src->y, src->y_stride, ref->y, ref->y_stride, + w, h); + disto[1] = AccumulateSSE(src->u, src->uv_stride, ref->u, ref->uv_stride, + uv_w, uv_h); + disto[2] = AccumulateSSE(src->v, src->uv_stride, ref->v, ref->uv_stride, + uv_w, uv_h); + if (has_alpha) { + disto[3] = AccumulateSSE(src->a, src->a_stride, ref->a, ref->a_stride, + w, h); } } else { VP8SSIMAccumulatePlane(src->y, src->y_stride, @@ -155,22 +188,23 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, } } } - // Final stat calculations. - { - int c; - for (c = 0; c <= 4; ++c) { - if (type == 1) { - const double v = VP8SSIMGet(&stats[c]); - result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v) - : kMinDistortion_dB); - } else { - const double v = VP8SSIMGetSquaredError(&stats[c]); - result[c] = GetPSNR(v); - } - // Accumulate forward - if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]); + + for (c = 0; c < 4; ++c) { + if (type == 1) { + results[c] = (float)GetLogSSIM(VP8SSIMGet(&stats[c]), 1.); + VP8SSIMAddStats(&stats[c], &stats[4]); + } else { + total_disto += disto[c]; + total_size += sizes[c]; + results[c] = (float)GetPSNR(disto[c], sizes[c]); } } + if (type == 1) { + results[4] = (float)GetLogSSIM(VP8SSIMGet(&stats[4]), 1.); + } else { + results[4] = (float)GetPSNR(total_disto, total_size); + } + return 1; }