From 693bf74ec07568e055a649eae0cefa954358baac Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Mon, 20 Feb 2017 17:27:15 +0100 Subject: [PATCH] move the SSIM calculation code in ssim.c / ssim_sse2.c Change-Id: I63a63fa7f44f257f2e17e45358b206c23069c448 --- Android.mk | 2 + Makefile.vc | 2 + build.gradle | 2 + makefile.unix | 2 + src/dsp/Makefile.am | 2 + src/dsp/enc.c | 134 -------------------------------------- src/dsp/enc_sse2.c | 111 ------------------------------- src/dsp/ssim.c | 152 +++++++++++++++++++++++++++++++++++++++++++ src/dsp/ssim_sse2.c | 154 ++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 316 insertions(+), 245 deletions(-) create mode 100644 src/dsp/ssim.c create mode 100644 src/dsp/ssim_sse2.c diff --git a/Android.mk b/Android.mk index a5c61817..6abf5f6d 100644 --- a/Android.mk +++ b/Android.mk @@ -101,6 +101,8 @@ dsp_enc_srcs := \ src/dsp/lossless_enc_neon.$(NEON) \ src/dsp/lossless_enc_sse2.c \ src/dsp/lossless_enc_sse41.c \ + src/dsp/ssim.c \ + src/dsp/ssim_sse2.c \ enc_srcs := \ src/enc/alpha_enc.c \ diff --git a/Makefile.vc b/Makefile.vc index f8f79e80..c04a43c0 100644 --- a/Makefile.vc +++ b/Makefile.vc @@ -254,6 +254,8 @@ DSP_ENC_OBJS = \ $(DIROBJ)\dsp\lossless_enc_neon.obj \ $(DIROBJ)\dsp\lossless_enc_sse2.obj \ $(DIROBJ)\dsp\lossless_enc_sse41.obj \ + $(DIROBJ)\dsp\ssim.obj \ + $(DIROBJ)\dsp\ssim_sse2.obj \ EX_ANIM_UTIL_OBJS = \ $(DIROBJ)\examples\anim_util.obj \ diff --git a/build.gradle b/build.gradle index 1d661b50..1af8bf37 100644 --- a/build.gradle +++ b/build.gradle @@ -179,6 +179,8 @@ model { include "lossless_enc_neon.$NEON" include "lossless_enc_sse2.c" include "lossless_enc_sse41.c" + include "ssim.c" + include "ssim_sse2.c" srcDir "src/enc" include "alpha_enc.c" include "analysis_enc.c" diff --git a/makefile.unix b/makefile.unix index 717e7b49..b001cfe3 100644 --- a/makefile.unix +++ b/makefile.unix @@ -192,6 +192,8 @@ DSP_ENC_OBJS = \ src/dsp/lossless_enc_neon.o \ src/dsp/lossless_enc_sse2.o \ src/dsp/lossless_enc_sse41.o \ + src/dsp/ssim.o \ + src/dsp/ssim_sse2.o \ ENC_OBJS = \ src/enc/alpha_enc.o \ diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am index 33fc1e42..14687798 100644 --- a/src/dsp/Makefile.am +++ b/src/dsp/Makefile.am @@ -50,6 +50,7 @@ ENC_SOURCES += enc_mips_dsp_r2.c ENC_SOURCES += lossless_enc.c ENC_SOURCES += lossless_enc_mips32.c ENC_SOURCES += lossless_enc_mips_dsp_r2.c +ENC_SOURCES += ssim.c libwebpdsp_avx2_la_SOURCES = libwebpdsp_avx2_la_SOURCES += enc_avx2.c @@ -99,6 +100,7 @@ libwebpdsp_sse2_la_SOURCES += argb_sse2.c libwebpdsp_sse2_la_SOURCES += cost_sse2.c libwebpdsp_sse2_la_SOURCES += enc_sse2.c libwebpdsp_sse2_la_SOURCES += lossless_enc_sse2.c +libwebpdsp_sse2_la_SOURCES += ssim_sse2.c libwebpdsp_sse2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS) libwebpdsp_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS) libwebpdsp_sse2_la_LIBADD = libwebpdspdecode_sse2.la diff --git a/src/dsp/enc.c b/src/dsp/enc.c index f31bc6de..d830888f 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -690,140 +690,6 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16, 8); } -//------------------------------------------------------------------------------ -// SSIM / PSNR - -// hat-shaped filter. Sum of coefficients is equal to 16. -static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = { - 1, 2, 3, 4, 3, 2, 1 -}; -static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2 - -static WEBP_INLINE double SSIMCalculation( - const VP8DistoStats* const stats, uint32_t N /*num samples*/) { - const uint32_t w2 = N * N; - const uint32_t C1 = 20 * w2; - const uint32_t C2 = 60 * w2; - const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6 - const uint64_t xmxm = (uint64_t)stats->xm * stats->xm; - const uint64_t ymym = (uint64_t)stats->ym * stats->ym; - if (xmxm + ymym >= C3) { - const int64_t xmym = (int64_t)stats->xm * stats->ym; - const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative - const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm; - const uint64_t syy = (uint64_t)stats->yym * N - ymym; - // we descale by 8 to prevent overflow during the fnum/fden multiply. - const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8; - const uint64_t den_S = (sxx + syy + C2) >> 8; - const uint64_t fnum = (2 * xmym + C1) * num_S; - const uint64_t fden = (xmxm + ymym + C1) * den_S; - const double r = (double)fnum / fden; - assert(r >= 0. && r <= 1.0); - return r; - } - return 1.; // area is too dark to contribute meaningfully -} - -double VP8SSIMFromStats(const VP8DistoStats* const stats) { - return SSIMCalculation(stats, kWeightSum); -} - -double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) { - return SSIMCalculation(stats, stats->w); -} - -static double SSIMGetClipped_C(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2, - int xo, int yo, int W, int H) { - VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 }; - const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL; - const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1 - : yo + VP8_SSIM_KERNEL; - const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL; - const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1 - : xo + VP8_SSIM_KERNEL; - int x, y; - src1 += ymin * stride1; - src2 += ymin * stride2; - for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) { - for (x = xmin; x <= xmax; ++x) { - const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo] - * kWeight[VP8_SSIM_KERNEL + y - yo]; - const uint32_t s1 = src1[x]; - const uint32_t s2 = src2[x]; - stats.w += w; - stats.xm += w * s1; - stats.ym += w * s2; - stats.xxm += w * s1 * s1; - stats.xym += w * s1 * s2; - stats.yym += w * s2 * s2; - } - } - return VP8SSIMFromStatsClipped(&stats); -} - -static double SSIMGet_C(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2) { - VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 }; - int x, y; - for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) { - for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) { - const uint32_t w = kWeight[x] * kWeight[y]; - const uint32_t s1 = src1[x]; - const uint32_t s2 = src2[x]; - stats.xm += w * s1; - stats.ym += w * s2; - stats.xxm += w * s1 * s1; - stats.xym += w * s1 * s2; - stats.yym += w * s2 * s2; - } - } - return VP8SSIMFromStats(&stats); -} - -//------------------------------------------------------------------------------ - -static uint32_t AccumulateSSE(const uint8_t* src1, - const uint8_t* src2, int len) { - int i; - uint32_t sse2 = 0; - assert(len <= 65535); // to ensure that accumulation fits within uint32_t - for (i = 0; i < len; ++i) { - const int32_t diff = src1[i] - src2[i]; - sse2 += diff * diff; - } - return sse2; -} - -//------------------------------------------------------------------------------ - -VP8SSIMGetFunc VP8SSIMGet; -VP8SSIMGetClippedFunc VP8SSIMGetClipped; -VP8AccumulateSSEFunc VP8AccumulateSSE; - -extern void VP8SSIMDspInitSSE2(void); - -static volatile VP8CPUInfo ssim_last_cpuinfo_used = - (VP8CPUInfo)&ssim_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) { - if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return; - - VP8SSIMGetClipped = SSIMGetClipped_C; - VP8SSIMGet = SSIMGet_C; - - VP8AccumulateSSE = AccumulateSSE; - if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) - if (VP8GetCPUInfo(kSSE2)) { - VP8SSIMDspInitSSE2(); - } -#endif - } - - ssim_last_cpuinfo_used = VP8GetCPUInfo; -} - //------------------------------------------------------------------------------ // Initialization diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index 2026a74c..e8cd366c 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -1366,119 +1366,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) { VP8Mean16x4 = Mean16x4; } -//------------------------------------------------------------------------------ -// SSIM / PSNR entry point (TODO(skal): move to its own file later) - -static uint32_t AccumulateSSE_SSE2(const uint8_t* src1, - const uint8_t* src2, int len) { - int i = 0; - uint32_t sse2 = 0; - if (len >= 16) { - const int limit = len - 32; - int32_t tmp[4]; - __m128i sum1; - __m128i sum = _mm_setzero_si128(); - __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]); - __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]); - i += 16; - while (i <= limit) { - const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]); - const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]); - __m128i sum2; - i += 16; - SubtractAndAccumulate(a0, b0, &sum1); - sum = _mm_add_epi32(sum, sum1); - a0 = _mm_loadu_si128((const __m128i*)&src1[i]); - b0 = _mm_loadu_si128((const __m128i*)&src2[i]); - i += 16; - SubtractAndAccumulate(a1, b1, &sum2); - sum = _mm_add_epi32(sum, sum2); - } - SubtractAndAccumulate(a0, b0, &sum1); - sum = _mm_add_epi32(sum, sum1); - _mm_storeu_si128((__m128i*)tmp, sum); - sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]); - } - - for (; i < len; ++i) { - const int32_t diff = src1[i] - src2[i]; - sse2 += diff * diff; - } - return sse2; -} - -static uint32_t HorizontalAdd16b(const __m128i* const m) { - uint16_t tmp[8]; - const __m128i a = _mm_srli_si128(*m, 8); - const __m128i b = _mm_add_epi16(*m, a); - _mm_storeu_si128((__m128i*)tmp, b); - return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0]; -} - -static uint32_t HorizontalAdd32b(const __m128i* const m) { - const __m128i a = _mm_srli_si128(*m, 8); - const __m128i b = _mm_add_epi32(*m, a); - const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4)); - return (uint32_t)_mm_cvtsi128_si32(c); -} - -static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 }; - -#define ACCUMULATE_ROW(WEIGHT) do { \ - /* compute row weight (Wx * Wy) */ \ - const __m128i Wy = _mm_set1_epi16((WEIGHT)); \ - const __m128i W = _mm_mullo_epi16(Wx, Wy); \ - /* process 8 bytes at a time (7 bytes, actually) */ \ - const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \ - const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \ - /* convert to 16b and multiply by weight */ \ - const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \ - const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \ - const __m128i wa1 = _mm_mullo_epi16(a1, W); \ - const __m128i wb1 = _mm_mullo_epi16(b1, W); \ - /* accumulate */ \ - xm = _mm_add_epi16(xm, wa1); \ - ym = _mm_add_epi16(ym, wb1); \ - xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \ - xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \ - yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \ - src1 += stride1; \ - src2 += stride2; \ -} while (0) - -static double SSIMGet_SSE2(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2) { - VP8DistoStats stats; - const __m128i zero = _mm_setzero_si128(); - __m128i xm = zero, ym = zero; // 16b accums - __m128i xxm = zero, yym = zero, xym = zero; // 32b accum - const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight); - assert(2 * VP8_SSIM_KERNEL + 1 == 7); - ACCUMULATE_ROW(1); - ACCUMULATE_ROW(2); - ACCUMULATE_ROW(3); - ACCUMULATE_ROW(4); - ACCUMULATE_ROW(3); - ACCUMULATE_ROW(2); - ACCUMULATE_ROW(1); - stats.xm = HorizontalAdd16b(&xm); - stats.ym = HorizontalAdd16b(&ym); - stats.xxm = HorizontalAdd32b(&xxm); - stats.xym = HorizontalAdd32b(&xym); - stats.yym = HorizontalAdd32b(&yym); - return VP8SSIMFromStats(&stats); -} - -extern void VP8SSIMDspInitSSE2(void); - -WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) { - VP8AccumulateSSE = AccumulateSSE_SSE2; - VP8SSIMGet = SSIMGet_SSE2; -} - #else // !WEBP_USE_SSE2 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2) -WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2) #endif // WEBP_USE_SSE2 diff --git a/src/dsp/ssim.c b/src/dsp/ssim.c new file mode 100644 index 00000000..9e67852e --- /dev/null +++ b/src/dsp/ssim.c @@ -0,0 +1,152 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// distortion calculation +// +// Author: Skal (pascal.massimino@gmail.com) + +#include +#include // for abs() + +#include "./dsp.h" +//#include "../enc/vp8i_enc.h" + +//------------------------------------------------------------------------------ +// SSIM / PSNR + +// hat-shaped filter. Sum of coefficients is equal to 16. +static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = { + 1, 2, 3, 4, 3, 2, 1 +}; +static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2 + +static WEBP_INLINE double SSIMCalculation( + const VP8DistoStats* const stats, uint32_t N /*num samples*/) { + const uint32_t w2 = N * N; + const uint32_t C1 = 20 * w2; + const uint32_t C2 = 60 * w2; + const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6 + const uint64_t xmxm = (uint64_t)stats->xm * stats->xm; + const uint64_t ymym = (uint64_t)stats->ym * stats->ym; + if (xmxm + ymym >= C3) { + const int64_t xmym = (int64_t)stats->xm * stats->ym; + const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative + const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm; + const uint64_t syy = (uint64_t)stats->yym * N - ymym; + // we descale by 8 to prevent overflow during the fnum/fden multiply. + const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8; + const uint64_t den_S = (sxx + syy + C2) >> 8; + const uint64_t fnum = (2 * xmym + C1) * num_S; + const uint64_t fden = (xmxm + ymym + C1) * den_S; + const double r = (double)fnum / fden; + assert(r >= 0. && r <= 1.0); + return r; + } + return 1.; // area is too dark to contribute meaningfully +} + +double VP8SSIMFromStats(const VP8DistoStats* const stats) { + return SSIMCalculation(stats, kWeightSum); +} + +double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) { + return SSIMCalculation(stats, stats->w); +} + +static double SSIMGetClipped_C(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int xo, int yo, int W, int H) { + VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 }; + const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL; + const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1 + : yo + VP8_SSIM_KERNEL; + const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL; + const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1 + : xo + VP8_SSIM_KERNEL; + int x, y; + src1 += ymin * stride1; + src2 += ymin * stride2; + for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) { + for (x = xmin; x <= xmax; ++x) { + const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo] + * kWeight[VP8_SSIM_KERNEL + y - yo]; + const uint32_t s1 = src1[x]; + const uint32_t s2 = src2[x]; + stats.w += w; + stats.xm += w * s1; + stats.ym += w * s2; + stats.xxm += w * s1 * s1; + stats.xym += w * s1 * s2; + stats.yym += w * s2 * s2; + } + } + return VP8SSIMFromStatsClipped(&stats); +} + +static double SSIMGet_C(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2) { + VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 }; + int x, y; + for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) { + for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) { + const uint32_t w = kWeight[x] * kWeight[y]; + const uint32_t s1 = src1[x]; + const uint32_t s2 = src2[x]; + stats.xm += w * s1; + stats.ym += w * s2; + stats.xxm += w * s1 * s1; + stats.xym += w * s1 * s2; + stats.yym += w * s2 * s2; + } + } + return VP8SSIMFromStats(&stats); +} + +//------------------------------------------------------------------------------ + +static uint32_t AccumulateSSE(const uint8_t* src1, + const uint8_t* src2, int len) { + int i; + uint32_t sse2 = 0; + assert(len <= 65535); // to ensure that accumulation fits within uint32_t + for (i = 0; i < len; ++i) { + const int32_t diff = src1[i] - src2[i]; + sse2 += diff * diff; + } + return sse2; +} + +//------------------------------------------------------------------------------ + +VP8SSIMGetFunc VP8SSIMGet; +VP8SSIMGetClippedFunc VP8SSIMGetClipped; +VP8AccumulateSSEFunc VP8AccumulateSSE; + +extern void VP8SSIMDspInitSSE2(void); + +static volatile VP8CPUInfo ssim_last_cpuinfo_used = + (VP8CPUInfo)&ssim_last_cpuinfo_used; + +WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) { + if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return; + + VP8SSIMGetClipped = SSIMGetClipped_C; + VP8SSIMGet = SSIMGet_C; + + VP8AccumulateSSE = AccumulateSSE; + if (VP8GetCPUInfo != NULL) { +#if defined(WEBP_USE_SSE2) + if (VP8GetCPUInfo(kSSE2)) { + VP8SSIMDspInitSSE2(); + } +#endif + } + + ssim_last_cpuinfo_used = VP8GetCPUInfo; +} diff --git a/src/dsp/ssim_sse2.c b/src/dsp/ssim_sse2.c new file mode 100644 index 00000000..effd8628 --- /dev/null +++ b/src/dsp/ssim_sse2.c @@ -0,0 +1,154 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// SSE2 version of distortion calculation +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "./dsp.h" + +#if defined(WEBP_USE_SSE2) + +#include +#include + +#include "./common_sse2.h" + +// Helper function +static WEBP_INLINE void SubtractAndSquare(const __m128i a, const __m128i b, + __m128i* const sum) { + // take abs(a-b) in 8b + const __m128i a_b = _mm_subs_epu8(a, b); + const __m128i b_a = _mm_subs_epu8(b, a); + const __m128i abs_a_b = _mm_or_si128(a_b, b_a); + // zero-extend to 16b + const __m128i zero = _mm_setzero_si128(); + const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero); + const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero); + // multiply with self + const __m128i sum1 = _mm_madd_epi16(C0, C0); + const __m128i sum2 = _mm_madd_epi16(C1, C1); + *sum = _mm_add_epi32(sum1, sum2); +} + +//------------------------------------------------------------------------------ +// SSIM / PSNR entry point + +static uint32_t AccumulateSSE_SSE2(const uint8_t* src1, + const uint8_t* src2, int len) { + int i = 0; + uint32_t sse2 = 0; + if (len >= 16) { + const int limit = len - 32; + int32_t tmp[4]; + __m128i sum1; + __m128i sum = _mm_setzero_si128(); + __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]); + __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]); + i += 16; + while (i <= limit) { + const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]); + const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]); + __m128i sum2; + i += 16; + SubtractAndSquare(a0, b0, &sum1); + sum = _mm_add_epi32(sum, sum1); + a0 = _mm_loadu_si128((const __m128i*)&src1[i]); + b0 = _mm_loadu_si128((const __m128i*)&src2[i]); + i += 16; + SubtractAndSquare(a1, b1, &sum2); + sum = _mm_add_epi32(sum, sum2); + } + SubtractAndSquare(a0, b0, &sum1); + sum = _mm_add_epi32(sum, sum1); + _mm_storeu_si128((__m128i*)tmp, sum); + sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]); + } + + for (; i < len; ++i) { + const int32_t diff = src1[i] - src2[i]; + sse2 += diff * diff; + } + return sse2; +} + +static uint32_t HorizontalAdd16b(const __m128i* const m) { + uint16_t tmp[8]; + const __m128i a = _mm_srli_si128(*m, 8); + const __m128i b = _mm_add_epi16(*m, a); + _mm_storeu_si128((__m128i*)tmp, b); + return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0]; +} + +static uint32_t HorizontalAdd32b(const __m128i* const m) { + const __m128i a = _mm_srli_si128(*m, 8); + const __m128i b = _mm_add_epi32(*m, a); + const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4)); + return (uint32_t)_mm_cvtsi128_si32(c); +} + +static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 }; + +#define ACCUMULATE_ROW(WEIGHT) do { \ + /* compute row weight (Wx * Wy) */ \ + const __m128i Wy = _mm_set1_epi16((WEIGHT)); \ + const __m128i W = _mm_mullo_epi16(Wx, Wy); \ + /* process 8 bytes at a time (7 bytes, actually) */ \ + const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \ + const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \ + /* convert to 16b and multiply by weight */ \ + const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \ + const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \ + const __m128i wa1 = _mm_mullo_epi16(a1, W); \ + const __m128i wb1 = _mm_mullo_epi16(b1, W); \ + /* accumulate */ \ + xm = _mm_add_epi16(xm, wa1); \ + ym = _mm_add_epi16(ym, wb1); \ + xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \ + xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \ + yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \ + src1 += stride1; \ + src2 += stride2; \ +} while (0) + +static double SSIMGet_SSE2(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2) { + VP8DistoStats stats; + const __m128i zero = _mm_setzero_si128(); + __m128i xm = zero, ym = zero; // 16b accums + __m128i xxm = zero, yym = zero, xym = zero; // 32b accum + const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight); + assert(2 * VP8_SSIM_KERNEL + 1 == 7); + ACCUMULATE_ROW(1); + ACCUMULATE_ROW(2); + ACCUMULATE_ROW(3); + ACCUMULATE_ROW(4); + ACCUMULATE_ROW(3); + ACCUMULATE_ROW(2); + ACCUMULATE_ROW(1); + stats.xm = HorizontalAdd16b(&xm); + stats.ym = HorizontalAdd16b(&ym); + stats.xxm = HorizontalAdd32b(&xxm); + stats.xym = HorizontalAdd32b(&xym); + stats.yym = HorizontalAdd32b(&yym); + return VP8SSIMFromStats(&stats); +} + +extern void VP8SSIMDspInitSSE2(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) { + VP8AccumulateSSE = AccumulateSSE_SSE2; + VP8SSIMGet = SSIMGet_SSE2; +} + +#else // !WEBP_USE_SSE2 + +WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2) + +#endif // WEBP_USE_SSE2