Merge "Make libwebp depend on libsharpyuv." into main

This commit is contained in:
Maryla Ustarroz-Calonge
2022-03-07 09:34:37 +00:00
committed by Gerrit Code Review
13 changed files with 51 additions and 717 deletions

View File

@ -332,15 +332,6 @@ extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
// utilities for accurate RGB->YUV conversion
extern uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
uint16_t* dst, int len);
extern void (*WebPSharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref,
int16_t* dst, int len);
extern void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B,
int len,
const uint16_t* best_y, uint16_t* out);
// Must be called before using the above.
void WebPInitConvertARGBToYUV(void);

View File

@ -194,50 +194,6 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
//-----------------------------------------------------------------------------
#if !WEBP_NEON_OMIT_C_CODE
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
uint64_t diff = 0;
int i;
for (i = 0; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)dst[i] + diff_y;
dst[i] = clip_y(new_y);
diff += (uint64_t)abs(diff_y);
}
return diff;
}
static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
for (i = 0; i < len; ++i, ++A, ++B) {
const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MAX_Y
//-----------------------------------------------------------------------------
void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
@ -247,18 +203,9 @@ void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store);
uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len);
void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
int16_t* dst, int len);
void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out);
extern void WebPInitConvertARGBToYUVSSE2(void);
extern void WebPInitConvertARGBToYUVSSE41(void);
extern void WebPInitConvertARGBToYUVNEON(void);
extern void WebPInitSharpYUVSSE2(void);
extern void WebPInitSharpYUVNEON(void);
WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
WebPConvertARGBToY = ConvertARGBToY_C;
@ -269,17 +216,10 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
#if !WEBP_NEON_OMIT_C_CODE
WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitConvertARGBToYUVSSE2();
WebPInitSharpYUVSSE2();
}
#endif // WEBP_HAVE_SSE2
#if defined(WEBP_HAVE_SSE41)
@ -293,7 +233,6 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitConvertARGBToYUVNEON();
WebPInitSharpYUVNEON();
}
#endif // WEBP_HAVE_NEON
@ -302,7 +241,4 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
assert(WebPConvertRGB24ToY != NULL);
assert(WebPConvertBGR24ToY != NULL);
assert(WebPConvertRGBA32ToUV != NULL);
assert(WebPSharpYUVUpdateY != NULL);
assert(WebPSharpYUVUpdateRGB != NULL);
assert(WebPSharpYUVFilterRow != NULL);
}

View File

@ -173,116 +173,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
}
//------------------------------------------------------------------------------
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y_NEON(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
int i;
const int16x8_t zero = vdupq_n_s16(0);
const int16x8_t max = vdupq_n_s16(MAX_Y);
uint64x2_t sum = vdupq_n_u64(0);
uint64_t diff;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
const int16x8_t D = vsubq_s16(A, B); // diff_y
const int16x8_t F = vaddq_s16(C, D); // new_y
const uint16x8_t H =
vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
const int16x8_t I = vabsq_s16(D); // abs(diff_y)
vst1q_u16(dst + i, H);
sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
}
diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)(dst[i]) + diff_y;
dst[i] = clip_y_NEON(new_y);
diff += (uint64_t)(abs(diff_y));
}
return diff;
}
static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vld1q_s16(ref + i);
const int16x8_t B = vld1q_s16(src + i);
const int16x8_t C = vld1q_s16(dst + i);
const int16x8_t D = vsubq_s16(A, B); // diff_uv
const int16x8_t E = vaddq_s16(C, D); // new_uv
vst1q_s16(dst + i, E);
}
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
const int16x8_t max = vdupq_n_s16(MAX_Y);
const int16x8_t zero = vdupq_n_s16(0);
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t a0 = vld1q_s16(A + i + 0);
const int16x8_t a1 = vld1q_s16(A + i + 1);
const int16x8_t b0 = vld1q_s16(B + i + 0);
const int16x8_t b1 = vld1q_s16(B + i + 1);
const int16x8_t a0b1 = vaddq_s16(a0, b1);
const int16x8_t a1b0 = vaddq_s16(a1, b0);
const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
const int16x8_t d0 = vaddq_s16(c1, a0);
const int16x8_t d1 = vaddq_s16(c0, a1);
const int16x8_t e0 = vrshrq_n_s16(d0, 1);
const int16x8_t e1 = vrshrq_n_s16(d1, 1);
const int16x8x2_t f = vzipq_s16(e0, e1);
const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
}
for (; i < len; ++i) {
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
}
}
#undef MAX_Y
//------------------------------------------------------------------------------
extern void WebPInitSharpYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
#endif // WEBP_USE_NEON

View File

@ -747,128 +747,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
}
//------------------------------------------------------------------------------
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
uint64_t diff = 0;
uint32_t tmp[4];
int i;
const __m128i zero = _mm_setzero_si128();
const __m128i max = _mm_set1_epi16(MAX_Y);
const __m128i one = _mm_set1_epi16(1);
__m128i sum = zero;
for (i = 0; i + 8 <= len; i += 8) {
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
const __m128i D = _mm_sub_epi16(A, B); // diff_y
const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
const __m128i F = _mm_add_epi16(C, D); // new_y
const __m128i G = _mm_or_si128(E, one); // -1 or 1
const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
_mm_storeu_si128((__m128i*)(dst + i), H);
sum = _mm_add_epi32(sum, I);
}
_mm_storeu_si128((__m128i*)tmp, sum);
diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)dst[i] + diff_y;
dst[i] = clip_y(new_y);
diff += (uint64_t)abs(diff_y);
}
return diff;
}
static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i = 0;
for (i = 0; i + 8 <= len; i += 8) {
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
const __m128i D = _mm_sub_epi16(A, B); // diff_uv
const __m128i E = _mm_add_epi16(C, D); // new_uv
_mm_storeu_si128((__m128i*)(dst + i), E);
}
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
const __m128i kCst8 = _mm_set1_epi16(8);
const __m128i max = _mm_set1_epi16(MAX_Y);
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 8 <= len; i += 8) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
const __m128i a0b1 = _mm_add_epi16(a0, b1);
const __m128i a1b0 = _mm_add_epi16(a1, b0);
const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
const __m128i d0 = _mm_add_epi16(c1, a0);
const __m128i d1 = _mm_add_epi16(c0, a1);
const __m128i e0 = _mm_srai_epi16(d0, 1);
const __m128i e1 = _mm_srai_epi16(d1, 1);
const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
const __m128i h0 = _mm_add_epi16(g0, f0);
const __m128i h1 = _mm_add_epi16(g1, f1);
const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
_mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
_mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
}
for (; i < len; ++i) {
// (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
// = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
// We reuse the common sub-expressions.
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
}
}
#undef MAX_Y
//------------------------------------------------------------------------------
extern void WebPInitSharpYUVSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
#endif // WEBP_USE_SSE2