diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index e8c8d383..e0fb4e5d 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -427,6 +427,15 @@ extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v, extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, uint8_t* u, uint8_t* v, int width); +// utilities for accurate RGB->YUV conversion +extern uint64_t (*WebPSmartYUVUpdateY)(const uint16_t* src, const uint16_t* ref, + uint16_t* dst, int len); +extern void (*WebPSmartYUVUpdateRGB)(const int16_t* src, const int16_t* ref, + int16_t* dst, int len); +extern void (*WebPSmartYUVFilterRow)(const int16_t* A, const int16_t* B, + int len, + const uint16_t* best_y, uint16_t* out); + // Must be called before using the above. void WebPInitConvertARGBToYUV(void); diff --git a/src/dsp/yuv.c b/src/dsp/yuv.c index f50a2531..d7fd4305 100644 --- a/src/dsp/yuv.c +++ b/src/dsp/yuv.c @@ -13,6 +13,8 @@ #include "./yuv.h" +#include + #if defined(WEBP_YUV_USE_TABLE) static int done = 0; @@ -244,6 +246,48 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, //----------------------------------------------------------------------------- +#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic +static uint16_t clip_y(int v) { + return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +} + +static uint64_t SmartYUVUpdateY_C(const uint16_t* ref, const uint16_t* src, + uint16_t* dst, int len) { + uint64_t diff = 0; + int i; + for (i = 0; i < len; ++i) { + const int diff_y = ref[i] - src[i]; + const int new_y = (int)dst[i] + diff_y; + dst[i] = clip_y(new_y); + diff += (uint64_t)abs(diff_y); + } + return diff; +} + +static void SmartYUVUpdateRGB_C(const int16_t* ref, const int16_t* src, + int16_t* dst, int len) { + int i; + for (i = 0; i < len; ++i) { + const int diff_uv = (int)ref[i] - src[i]; + dst[i] += diff_uv; + } +} + +static void SmartYUVFilterRow_C(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out) { + int i; + for (i = 0; i < len; ++i, ++A, ++B) { + const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4; + const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4; + out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); + out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); + } +} + +#undef MAX_Y + +//----------------------------------------------------------------------------- + void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width); void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width); void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb, @@ -253,10 +297,18 @@ void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width); void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v, int src_width, int do_store); +uint64_t (*WebPSmartYUVUpdateY)(const uint16_t* ref, const uint16_t* src, + uint16_t* dst, int len); +void (*WebPSmartYUVUpdateRGB)(const int16_t* ref, const int16_t* src, + int16_t* dst, int len); +void (*WebPSmartYUVFilterRow)(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out); + static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used = (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used; extern void WebPInitConvertARGBToYUVSSE2(void); +extern void WebPInitSmartYUVSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) { if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return; @@ -269,10 +321,15 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) { WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C; + WebPSmartYUVUpdateY = SmartYUVUpdateY_C; + WebPSmartYUVUpdateRGB = SmartYUVUpdateRGB_C; + WebPSmartYUVFilterRow = SmartYUVFilterRow_C; + if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) if (VP8GetCPUInfo(kSSE2)) { WebPInitConvertARGBToYUVSSE2(); + WebPInitSmartYUVSSE2(); } #endif // WEBP_USE_SSE2 } diff --git a/src/dsp/yuv_sse2.c b/src/dsp/yuv_sse2.c index e19bddff..9feaea13 100644 --- a/src/dsp/yuv_sse2.c +++ b/src/dsp/yuv_sse2.c @@ -15,6 +15,7 @@ #if defined(WEBP_USE_SSE2) +#include #include //----------------------------------------------------------------------------- @@ -767,9 +768,128 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) { WebPConvertRGBA32ToUV = ConvertRGBA32ToUV; } +//------------------------------------------------------------------------------ + +#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic +static uint16_t clip_y(int v) { + return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +} + +static uint64_t SmartYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src, + uint16_t* dst, int len) { + uint64_t diff = 0; + uint32_t tmp[4]; + int i; + const __m128i zero = _mm_setzero_si128(); + const __m128i max = _mm_set1_epi16(MAX_Y); + const __m128i one = _mm_set1_epi16(1); + __m128i sum = zero; + + for (i = 0; i + 8 <= len; i += 8) { + const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); + const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); + const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); + const __m128i D = _mm_sub_epi16(A, B); // diff_y + const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0) + const __m128i F = _mm_add_epi16(C, D); // new_y + const __m128i G = _mm_or_si128(E, one); // -1 or 1 + const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero); + const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...)) + _mm_storeu_si128((__m128i*)(dst + i), H); + sum = _mm_add_epi32(sum, I); + } + _mm_storeu_si128((__m128i*)tmp, sum); + diff = tmp[3] + tmp[2] + tmp[1] + tmp[0]; + for (; i < len; ++i) { + const int diff_y = ref[i] - src[i]; + const int new_y = (int)dst[i] + diff_y; + dst[i] = clip_y(new_y); + diff += (uint64_t)abs(diff_y); + } + return diff; +} + +static void SmartYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src, + int16_t* dst, int len) { + int i = 0; + for (i = 0; i + 8 <= len; i += 8) { + const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); + const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); + const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); + const __m128i D = _mm_sub_epi16(A, B); // diff_uv + const __m128i E = _mm_add_epi16(C, D); // new_uv + _mm_storeu_si128((__m128i*)(dst + i), E); + } + for (; i < len; ++i) { + const int diff_uv = (int)ref[i] - src[i]; + dst[i] += diff_uv; + } +} + +static void SmartYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out) { + int i; + const __m128i kCst8 = _mm_set1_epi16(8); + const __m128i max = _mm_set1_epi16(MAX_Y); + const __m128i zero = _mm_setzero_si128(); + for (i = 0; i + 8 <= len; i += 8) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0)); + const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1)); + const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0)); + const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1)); + const __m128i a0b1 = _mm_add_epi16(a0, b1); + const __m128i a1b0 = _mm_add_epi16(a1, b0); + const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1 + const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8); + const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1) + const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0) + const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3); + const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3); + const __m128i d0 = _mm_add_epi16(c1, a0); + const __m128i d1 = _mm_add_epi16(c0, a1); + const __m128i e0 = _mm_srai_epi16(d0, 1); + const __m128i e1 = _mm_srai_epi16(d1, 1); + const __m128i f0 = _mm_unpacklo_epi16(e0, e1); + const __m128i f1 = _mm_unpackhi_epi16(e0, e1); + const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0)); + const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8)); + const __m128i h0 = _mm_add_epi16(g0, f0); + const __m128i h1 = _mm_add_epi16(g1, f1); + const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero); + const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero); + _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0); + _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1); + } + for (; i < len; ++i) { + // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 = + // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4 + // We reuse the common sub-expressions. + const int a0b1 = A[i + 0] + B[i + 1]; + const int a1b0 = A[i + 1] + B[i + 0]; + const int a0a1b0b1 = a0b1 + a1b0 + 8; + const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; + const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; + out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); + out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); + } +} + +#undef MAX_Y + +//------------------------------------------------------------------------------ + +extern void WebPInitSmartYUVSSE2(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitSmartYUVSSE2(void) { + WebPSmartYUVUpdateY = SmartYUVUpdateY_SSE2; + WebPSmartYUVUpdateRGB = SmartYUVUpdateRGB_SSE2; + WebPSmartYUVFilterRow = SmartYUVFilterRow_SSE2; +} + #else // !WEBP_USE_SSE2 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2) WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2) +WEBP_DSP_INIT_STUB(WebPInitSmartYUVSSE2) #endif // WEBP_USE_SSE2 diff --git a/src/enc/picture_csp.c b/src/enc/picture_csp.c index ed619fe3..b5c96e1b 100644 --- a/src/enc/picture_csp.c +++ b/src/enc/picture_csp.c @@ -306,17 +306,6 @@ static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) { //------------------------------------------------------------------------------ -static void FilterRow(const fixed_t* A, const fixed_t* B, int len, - const fixed_y_t* best_y, fixed_y_t* out) { - int i; - for (i = 0; i < len; ++i, ++A, ++B) { - const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4; - const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4; - out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); - out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); - } -} - static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) { const int v0 = (A * 3 + B + 2) >> 2; return clip_y(v0 + W0); @@ -357,14 +346,15 @@ static void InterpolateTwoRows(const fixed_y_t* const best_y, fixed_y_t* out1, fixed_y_t* out2) { const int uv_w = w >> 1; + const int len = (w - 1) >> 1; // length to filter int k = 3; while (k-- > 0) { // process each R/G/B segments in turn // special boundary case for i==0 out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]); out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]); - FilterRow(cur_uv, prev_uv, (w - 1) >> 1, best_y + 0 + 1, out1 + 1); - FilterRow(cur_uv, next_uv, (w - 1) >> 1, best_y + w + 1, out2 + 1); + WebPSmartYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1); + WebPSmartYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1); // special boundary case for i == w - 1 when w is even if (!(w & 1)) { @@ -452,7 +442,7 @@ static int PreprocessARGB(const uint8_t* r_ptr, const int uv_w = w >> 1; const int uv_h = h >> 1; uint64_t prev_diff_y_sum = ~0; - int i, j, iter; + int j, iter; // TODO(skal): allocate one big memory chunk. But for now, it's easier // for valgrind debugging to have several chunks. @@ -480,6 +470,8 @@ static int PreprocessARGB(const uint8_t* r_ptr, assert(picture->width >= kMinDimensionIterativeConversion); assert(picture->height >= kMinDimensionIterativeConversion); + WebPInitConvertARGBToYUV(); + // Import RGB samples to W/RGB representation. for (j = 0; j < picture->height; j += 2) { const int is_last_row = (j == picture->height - 1); @@ -535,16 +527,9 @@ static int PreprocessARGB(const uint8_t* r_ptr, UpdateChroma(src1, src2, best_rgb_uv, uv_w); // update two rows of Y and one row of RGB - for (i = 0; i < 2 * w; ++i) { - const int diff_y = target_y[i] - best_rgb_y[i]; - const int new_y = (int)best_y[i] + diff_y; - best_y[i] = clip_y(new_y); - diff_y_sum += (uint64_t)abs(diff_y); - } - for (i = 0; i < 3 * uv_w; ++i) { - const int diff_uv = (int)target_uv[i] - best_rgb_uv[i]; - best_uv[i] += diff_uv; - } + diff_y_sum += WebPSmartYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w); + WebPSmartYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w); + best_y += 2 * w; best_uv += 3 * uv_w; target_y += 2 * w;