mirror of
https://github.com/webmproject/libwebp.git
synced 2025-07-13 06:24:27 +02:00
Merge "Make libwebp depend on libsharpyuv." into main
This commit is contained in:
committed by
Gerrit Code Review
commit
d0d2292e1a
@ -332,15 +332,6 @@ extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
||||
extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width);
|
||||
|
||||
// utilities for accurate RGB->YUV conversion
|
||||
extern uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
|
||||
uint16_t* dst, int len);
|
||||
extern void (*WebPSharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref,
|
||||
int16_t* dst, int len);
|
||||
extern void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B,
|
||||
int len,
|
||||
const uint16_t* best_y, uint16_t* out);
|
||||
|
||||
// Must be called before using the above.
|
||||
void WebPInitConvertARGBToYUV(void);
|
||||
|
||||
|
@ -194,50 +194,6 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
|
||||
static uint16_t clip_y(int v) {
|
||||
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
|
||||
}
|
||||
|
||||
static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src,
|
||||
uint16_t* dst, int len) {
|
||||
uint64_t diff = 0;
|
||||
int i;
|
||||
for (i = 0; i < len; ++i) {
|
||||
const int diff_y = ref[i] - src[i];
|
||||
const int new_y = (int)dst[i] + diff_y;
|
||||
dst[i] = clip_y(new_y);
|
||||
diff += (uint64_t)abs(diff_y);
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src,
|
||||
int16_t* dst, int len) {
|
||||
int i;
|
||||
for (i = 0; i < len; ++i) {
|
||||
const int diff_uv = ref[i] - src[i];
|
||||
dst[i] += diff_uv;
|
||||
}
|
||||
}
|
||||
|
||||
static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
|
||||
const uint16_t* best_y, uint16_t* out) {
|
||||
int i;
|
||||
for (i = 0; i < len; ++i, ++A, ++B) {
|
||||
const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
|
||||
const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
|
||||
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
|
||||
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
#undef MAX_Y
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
|
||||
void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
|
||||
void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
|
||||
@ -247,18 +203,9 @@ void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
|
||||
void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
||||
int src_width, int do_store);
|
||||
|
||||
uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* ref, const uint16_t* src,
|
||||
uint16_t* dst, int len);
|
||||
void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
|
||||
int16_t* dst, int len);
|
||||
void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
|
||||
const uint16_t* best_y, uint16_t* out);
|
||||
|
||||
extern void WebPInitConvertARGBToYUVSSE2(void);
|
||||
extern void WebPInitConvertARGBToYUVSSE41(void);
|
||||
extern void WebPInitConvertARGBToYUVNEON(void);
|
||||
extern void WebPInitSharpYUVSSE2(void);
|
||||
extern void WebPInitSharpYUVNEON(void);
|
||||
|
||||
WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
|
||||
WebPConvertARGBToY = ConvertARGBToY_C;
|
||||
@ -269,17 +216,10 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
|
||||
|
||||
WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
|
||||
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
|
||||
WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
|
||||
#endif
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
WebPInitConvertARGBToYUVSSE2();
|
||||
WebPInitSharpYUVSSE2();
|
||||
}
|
||||
#endif // WEBP_HAVE_SSE2
|
||||
#if defined(WEBP_HAVE_SSE41)
|
||||
@ -293,7 +233,6 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
WebPInitConvertARGBToYUVNEON();
|
||||
WebPInitSharpYUVNEON();
|
||||
}
|
||||
#endif // WEBP_HAVE_NEON
|
||||
|
||||
@ -302,7 +241,4 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
|
||||
assert(WebPConvertRGB24ToY != NULL);
|
||||
assert(WebPConvertBGR24ToY != NULL);
|
||||
assert(WebPConvertRGBA32ToUV != NULL);
|
||||
assert(WebPSharpYUVUpdateY != NULL);
|
||||
assert(WebPSharpYUVUpdateRGB != NULL);
|
||||
assert(WebPSharpYUVFilterRow != NULL);
|
||||
}
|
||||
|
@ -173,116 +173,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
|
||||
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
|
||||
static uint16_t clip_y_NEON(int v) {
|
||||
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
|
||||
}
|
||||
|
||||
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
|
||||
uint16_t* dst, int len) {
|
||||
int i;
|
||||
const int16x8_t zero = vdupq_n_s16(0);
|
||||
const int16x8_t max = vdupq_n_s16(MAX_Y);
|
||||
uint64x2_t sum = vdupq_n_u64(0);
|
||||
uint64_t diff;
|
||||
|
||||
for (i = 0; i + 8 <= len; i += 8) {
|
||||
const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
|
||||
const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
|
||||
const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
|
||||
const int16x8_t D = vsubq_s16(A, B); // diff_y
|
||||
const int16x8_t F = vaddq_s16(C, D); // new_y
|
||||
const uint16x8_t H =
|
||||
vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
|
||||
const int16x8_t I = vabsq_s16(D); // abs(diff_y)
|
||||
vst1q_u16(dst + i, H);
|
||||
sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
|
||||
}
|
||||
diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
|
||||
for (; i < len; ++i) {
|
||||
const int diff_y = ref[i] - src[i];
|
||||
const int new_y = (int)(dst[i]) + diff_y;
|
||||
dst[i] = clip_y_NEON(new_y);
|
||||
diff += (uint64_t)(abs(diff_y));
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
|
||||
int16_t* dst, int len) {
|
||||
int i;
|
||||
for (i = 0; i + 8 <= len; i += 8) {
|
||||
const int16x8_t A = vld1q_s16(ref + i);
|
||||
const int16x8_t B = vld1q_s16(src + i);
|
||||
const int16x8_t C = vld1q_s16(dst + i);
|
||||
const int16x8_t D = vsubq_s16(A, B); // diff_uv
|
||||
const int16x8_t E = vaddq_s16(C, D); // new_uv
|
||||
vst1q_s16(dst + i, E);
|
||||
}
|
||||
for (; i < len; ++i) {
|
||||
const int diff_uv = ref[i] - src[i];
|
||||
dst[i] += diff_uv;
|
||||
}
|
||||
}
|
||||
|
||||
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
|
||||
const uint16_t* best_y, uint16_t* out) {
|
||||
int i;
|
||||
const int16x8_t max = vdupq_n_s16(MAX_Y);
|
||||
const int16x8_t zero = vdupq_n_s16(0);
|
||||
for (i = 0; i + 8 <= len; i += 8) {
|
||||
const int16x8_t a0 = vld1q_s16(A + i + 0);
|
||||
const int16x8_t a1 = vld1q_s16(A + i + 1);
|
||||
const int16x8_t b0 = vld1q_s16(B + i + 0);
|
||||
const int16x8_t b1 = vld1q_s16(B + i + 1);
|
||||
const int16x8_t a0b1 = vaddq_s16(a0, b1);
|
||||
const int16x8_t a1b0 = vaddq_s16(a1, b0);
|
||||
const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
|
||||
const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
|
||||
const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
|
||||
const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
|
||||
const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
|
||||
const int16x8_t d0 = vaddq_s16(c1, a0);
|
||||
const int16x8_t d1 = vaddq_s16(c0, a1);
|
||||
const int16x8_t e0 = vrshrq_n_s16(d0, 1);
|
||||
const int16x8_t e1 = vrshrq_n_s16(d1, 1);
|
||||
const int16x8x2_t f = vzipq_s16(e0, e1);
|
||||
const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
|
||||
const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
|
||||
const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
|
||||
const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
|
||||
const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
|
||||
const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
|
||||
vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
|
||||
vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
|
||||
}
|
||||
for (; i < len; ++i) {
|
||||
const int a0b1 = A[i + 0] + B[i + 1];
|
||||
const int a1b0 = A[i + 1] + B[i + 0];
|
||||
const int a0a1b0b1 = a0b1 + a1b0 + 8;
|
||||
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
|
||||
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
|
||||
out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
|
||||
out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
|
||||
}
|
||||
}
|
||||
#undef MAX_Y
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
extern void WebPInitSharpYUVNEON(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
|
||||
WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
|
||||
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
|
||||
WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_NEON
|
||||
|
||||
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
|
||||
WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
|
||||
|
||||
#endif // WEBP_USE_NEON
|
||||
|
@ -747,128 +747,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
|
||||
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
|
||||
static uint16_t clip_y(int v) {
|
||||
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
|
||||
}
|
||||
|
||||
static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
|
||||
uint16_t* dst, int len) {
|
||||
uint64_t diff = 0;
|
||||
uint32_t tmp[4];
|
||||
int i;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i max = _mm_set1_epi16(MAX_Y);
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
__m128i sum = zero;
|
||||
|
||||
for (i = 0; i + 8 <= len; i += 8) {
|
||||
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
|
||||
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
|
||||
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
|
||||
const __m128i D = _mm_sub_epi16(A, B); // diff_y
|
||||
const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
|
||||
const __m128i F = _mm_add_epi16(C, D); // new_y
|
||||
const __m128i G = _mm_or_si128(E, one); // -1 or 1
|
||||
const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
|
||||
const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
|
||||
_mm_storeu_si128((__m128i*)(dst + i), H);
|
||||
sum = _mm_add_epi32(sum, I);
|
||||
}
|
||||
_mm_storeu_si128((__m128i*)tmp, sum);
|
||||
diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
|
||||
for (; i < len; ++i) {
|
||||
const int diff_y = ref[i] - src[i];
|
||||
const int new_y = (int)dst[i] + diff_y;
|
||||
dst[i] = clip_y(new_y);
|
||||
diff += (uint64_t)abs(diff_y);
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
|
||||
int16_t* dst, int len) {
|
||||
int i = 0;
|
||||
for (i = 0; i + 8 <= len; i += 8) {
|
||||
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
|
||||
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
|
||||
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
|
||||
const __m128i D = _mm_sub_epi16(A, B); // diff_uv
|
||||
const __m128i E = _mm_add_epi16(C, D); // new_uv
|
||||
_mm_storeu_si128((__m128i*)(dst + i), E);
|
||||
}
|
||||
for (; i < len; ++i) {
|
||||
const int diff_uv = ref[i] - src[i];
|
||||
dst[i] += diff_uv;
|
||||
}
|
||||
}
|
||||
|
||||
static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
|
||||
const uint16_t* best_y, uint16_t* out) {
|
||||
int i;
|
||||
const __m128i kCst8 = _mm_set1_epi16(8);
|
||||
const __m128i max = _mm_set1_epi16(MAX_Y);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
for (i = 0; i + 8 <= len; i += 8) {
|
||||
const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
|
||||
const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
|
||||
const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
|
||||
const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
|
||||
const __m128i a0b1 = _mm_add_epi16(a0, b1);
|
||||
const __m128i a1b0 = _mm_add_epi16(a1, b0);
|
||||
const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
|
||||
const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
|
||||
const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
|
||||
const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
|
||||
const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
|
||||
const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
|
||||
const __m128i d0 = _mm_add_epi16(c1, a0);
|
||||
const __m128i d1 = _mm_add_epi16(c0, a1);
|
||||
const __m128i e0 = _mm_srai_epi16(d0, 1);
|
||||
const __m128i e1 = _mm_srai_epi16(d1, 1);
|
||||
const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
|
||||
const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
|
||||
const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
|
||||
const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
|
||||
const __m128i h0 = _mm_add_epi16(g0, f0);
|
||||
const __m128i h1 = _mm_add_epi16(g1, f1);
|
||||
const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
|
||||
const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
|
||||
_mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
|
||||
_mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
|
||||
}
|
||||
for (; i < len; ++i) {
|
||||
// (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
|
||||
// = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
|
||||
// We reuse the common sub-expressions.
|
||||
const int a0b1 = A[i + 0] + B[i + 1];
|
||||
const int a1b0 = A[i + 1] + B[i + 0];
|
||||
const int a0a1b0b1 = a0b1 + a1b0 + 8;
|
||||
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
|
||||
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
|
||||
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
|
||||
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
|
||||
}
|
||||
}
|
||||
|
||||
#undef MAX_Y
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
extern void WebPInitSharpYUVSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
|
||||
WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
|
||||
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
|
||||
WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
||||
|
||||
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
|
||||
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
|
||||
WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
|
||||
|
||||
#endif // WEBP_USE_SSE2
|
||||
|
Reference in New Issue
Block a user