mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 18:35:41 +01:00 
			
		
		
		
	sharpyuv: add 32bit version of SharpYuvFilterRow.
This allows increasing intermediate value precision from 10 bits to 14 bits. Change-Id: I0fc33400d200a849bcc2c677ab8346215a9dbc3b
This commit is contained in:
		| @@ -32,9 +32,7 @@ static const int kMinDimensionIterativeConversion = 4; | |||||||
| static const int kYuvHalf = 1 << (YUV_FIX - 1); | static const int kYuvHalf = 1 << (YUV_FIX - 1); | ||||||
|  |  | ||||||
| // Max bit depth so that intermediate calculations fit in 16 bits. | // Max bit depth so that intermediate calculations fit in 16 bits. | ||||||
| // TODO(b/194336375): the C code can handle up to 14 bits, but the SIMD code | static const int kMaxBitDepth = 14; | ||||||
| // currently needs more room. |  | ||||||
| static const int kMaxBitDepth = 10; |  | ||||||
|  |  | ||||||
| // Returns the precision shift to use based on the input rgb_bit_depth. | // Returns the precision shift to use based on the input rgb_bit_depth. | ||||||
| static int GetPrecisionShift(int rgb_bit_depth) { | static int GetPrecisionShift(int rgb_bit_depth) { | ||||||
|   | |||||||
| @@ -44,8 +44,7 @@ typedef struct { | |||||||
| //     r, g, and b channels. If rgb_bit_depth is > 8, it should be a | //     r, g, and b channels. If rgb_bit_depth is > 8, it should be a | ||||||
| //     multiple of 2. | //     multiple of 2. | ||||||
| // rgb_bit_depth: number of bits for each r/g/b value. One of: 8, 10, 12, 16. | // rgb_bit_depth: number of bits for each r/g/b value. One of: 8, 10, 12, 16. | ||||||
| //     Note: for 10+ bit, input is truncated to 10 bits. | //     Note: 16 bit input is truncated to 14 bits before conversion to yuv. | ||||||
| //     TODO(b/194336375): increase precision. |  | ||||||
| // yuv_bit_depth: number of bits for each y/u/v value. One of: 8, 10, 12. | // yuv_bit_depth: number of bits for each y/u/v value. One of: 8, 10, 12. | ||||||
| // y_ptr, u_ptr, v_ptr: pointers to the destination y, u and v channels.  Should | // y_ptr, u_ptr, v_ptr: pointers to the destination y, u and v channels.  Should | ||||||
| //     point to uint8_t buffers if yuv_bit_depth is 8, or uint16_t buffers | //     point to uint8_t buffers if yuv_bit_depth is 8, or uint16_t buffers | ||||||
|   | |||||||
| @@ -75,9 +75,9 @@ static void SharpYuvUpdateRGB_NEON(const int16_t* ref, const int16_t* src, | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len, | static void SharpYuvFilterRow16_NEON(const int16_t* A, const int16_t* B, | ||||||
|                                    const uint16_t* best_y, uint16_t* out, |                                      int len, const uint16_t* best_y, | ||||||
|                                    int bit_depth) { |                                      uint16_t* out, int bit_depth) { | ||||||
|   const int max_y = (1 << bit_depth) - 1; |   const int max_y = (1 << bit_depth) - 1; | ||||||
|   int i; |   int i; | ||||||
|   const int16x8_t max = vdupq_n_s16(max_y); |   const int16x8_t max = vdupq_n_s16(max_y); | ||||||
| @@ -94,10 +94,8 @@ static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len, | |||||||
|     const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0) |     const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0) | ||||||
|     const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); |     const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); | ||||||
|     const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); |     const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); | ||||||
|     const int16x8_t d0 = vaddq_s16(c1, a0); |     const int16x8_t e0 = vrhaddq_s16(c1, a0); | ||||||
|     const int16x8_t d1 = vaddq_s16(c0, a1); |     const int16x8_t e1 = vrhaddq_s16(c0, a1); | ||||||
|     const int16x8_t e0 = vrshrq_n_s16(d0, 1); |  | ||||||
|     const int16x8_t e1 = vrshrq_n_s16(d1, 1); |  | ||||||
|     const int16x8x2_t f = vzipq_s16(e0, e1); |     const int16x8x2_t f = vzipq_s16(e0, e1); | ||||||
|     const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); |     const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); | ||||||
|     const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); |     const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); | ||||||
| @@ -119,6 +117,56 @@ static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len, | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static void SharpYuvFilterRow32_NEON(const int16_t* A, const int16_t* B, | ||||||
|  |                                      int len, const uint16_t* best_y, | ||||||
|  |                                      uint16_t* out, int bit_depth) { | ||||||
|  |   const int max_y = (1 << bit_depth) - 1; | ||||||
|  |   int i; | ||||||
|  |   const uint16x8_t max = vdupq_n_u16(max_y); | ||||||
|  |   for (i = 0; i + 4 <= len; i += 4) { | ||||||
|  |     const int16x4_t a0 = vld1_s16(A + i + 0); | ||||||
|  |     const int16x4_t a1 = vld1_s16(A + i + 1); | ||||||
|  |     const int16x4_t b0 = vld1_s16(B + i + 0); | ||||||
|  |     const int16x4_t b1 = vld1_s16(B + i + 1); | ||||||
|  |     const int32x4_t a0b1 = vaddl_s16(a0, b1); | ||||||
|  |     const int32x4_t a1b0 = vaddl_s16(a1, b0); | ||||||
|  |     const int32x4_t a0a1b0b1 = vaddq_s32(a0b1, a1b0);  // A0+A1+B0+B1 | ||||||
|  |     const int32x4_t a0b1_2 = vaddq_s32(a0b1, a0b1);    // 2*(A0+B1) | ||||||
|  |     const int32x4_t a1b0_2 = vaddq_s32(a1b0, a1b0);    // 2*(A1+B0) | ||||||
|  |     const int32x4_t c0 = vshrq_n_s32(vaddq_s32(a0b1_2, a0a1b0b1), 3); | ||||||
|  |     const int32x4_t c1 = vshrq_n_s32(vaddq_s32(a1b0_2, a0a1b0b1), 3); | ||||||
|  |     const int32x4_t e0 = vrhaddq_s32(c1, vmovl_s16(a0)); | ||||||
|  |     const int32x4_t e1 = vrhaddq_s32(c0, vmovl_s16(a1)); | ||||||
|  |     const int32x4x2_t f = vzipq_s32(e0, e1); | ||||||
|  |  | ||||||
|  |     const int16x8_t g = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i)); | ||||||
|  |     const int32x4_t h0 = vaddw_s16(f.val[0], vget_low_s16(g)); | ||||||
|  |     const int32x4_t h1 = vaddw_s16(f.val[1], vget_high_s16(g)); | ||||||
|  |     const uint16x8_t i_16 = vcombine_u16(vqmovun_s32(h0), vqmovun_s32(h1)); | ||||||
|  |     const uint16x8_t i_clamped = vminq_u16(i_16, max); | ||||||
|  |     vst1q_u16(out + 2 * i + 0, i_clamped); | ||||||
|  |   } | ||||||
|  |   for (; i < len; ++i) { | ||||||
|  |     const int a0b1 = A[i + 0] + B[i + 1]; | ||||||
|  |     const int a1b0 = A[i + 1] + B[i + 0]; | ||||||
|  |     const int a0a1b0b1 = a0b1 + a1b0 + 8; | ||||||
|  |     const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; | ||||||
|  |     const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; | ||||||
|  |     out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y); | ||||||
|  |     out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len, | ||||||
|  |                                    const uint16_t* best_y, uint16_t* out, | ||||||
|  |                                    int bit_depth) { | ||||||
|  |   if (bit_depth <= 10) { | ||||||
|  |     SharpYuvFilterRow16_NEON(A, B, len, best_y, out, bit_depth); | ||||||
|  |   } else { | ||||||
|  |     SharpYuvFilterRow32_NEON(A, B, len, best_y, out, bit_depth); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| //------------------------------------------------------------------------------ | //------------------------------------------------------------------------------ | ||||||
|  |  | ||||||
| WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) { | WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) { | ||||||
|   | |||||||
| @@ -78,9 +78,9 @@ static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src, | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, | static void SharpYuvFilterRow16_SSE2(const int16_t* A, const int16_t* B, | ||||||
|                                    const uint16_t* best_y, uint16_t* out, |                                      int len, const uint16_t* best_y, | ||||||
|                                    int bit_depth) { |                                      uint16_t* out, int bit_depth) { | ||||||
|   const int max_y = (1 << bit_depth) - 1; |   const int max_y = (1 << bit_depth) - 1; | ||||||
|   int i; |   int i; | ||||||
|   const __m128i kCst8 = _mm_set1_epi16(8); |   const __m128i kCst8 = _mm_set1_epi16(8); | ||||||
| @@ -128,6 +128,66 @@ static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static WEBP_INLINE __m128i s16_to_s32(__m128i in) { | ||||||
|  |   return _mm_srai_epi32(_mm_unpacklo_epi16(in, in), 16); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void SharpYuvFilterRow32_SSE2(const int16_t* A, const int16_t* B, | ||||||
|  |                                      int len, const uint16_t* best_y, | ||||||
|  |                                      uint16_t* out, int bit_depth) { | ||||||
|  |   const int max_y = (1 << bit_depth) - 1; | ||||||
|  |   int i; | ||||||
|  |   const __m128i kCst8 = _mm_set1_epi32(8); | ||||||
|  |   const __m128i max = _mm_set1_epi16(max_y); | ||||||
|  |   const __m128i zero = _mm_setzero_si128(); | ||||||
|  |   for (i = 0; i + 4 <= len; i += 4) { | ||||||
|  |     const __m128i a0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 0))); | ||||||
|  |     const __m128i a1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 1))); | ||||||
|  |     const __m128i b0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 0))); | ||||||
|  |     const __m128i b1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 1))); | ||||||
|  |     const __m128i a0b1 = _mm_add_epi32(a0, b1); | ||||||
|  |     const __m128i a1b0 = _mm_add_epi32(a1, b0); | ||||||
|  |     const __m128i a0a1b0b1 = _mm_add_epi32(a0b1, a1b0);  // A0+A1+B0+B1 | ||||||
|  |     const __m128i a0a1b0b1_8 = _mm_add_epi32(a0a1b0b1, kCst8); | ||||||
|  |     const __m128i a0b1_2 = _mm_add_epi32(a0b1, a0b1);  // 2*(A0+B1) | ||||||
|  |     const __m128i a1b0_2 = _mm_add_epi32(a1b0, a1b0);  // 2*(A1+B0) | ||||||
|  |     const __m128i c0 = _mm_srai_epi32(_mm_add_epi32(a0b1_2, a0a1b0b1_8), 3); | ||||||
|  |     const __m128i c1 = _mm_srai_epi32(_mm_add_epi32(a1b0_2, a0a1b0b1_8), 3); | ||||||
|  |     const __m128i d0 = _mm_add_epi32(c1, a0); | ||||||
|  |     const __m128i d1 = _mm_add_epi32(c0, a1); | ||||||
|  |     const __m128i e0 = _mm_srai_epi32(d0, 1); | ||||||
|  |     const __m128i e1 = _mm_srai_epi32(d1, 1); | ||||||
|  |     const __m128i f0 = _mm_unpacklo_epi32(e0, e1); | ||||||
|  |     const __m128i f1 = _mm_unpackhi_epi32(e0, e1); | ||||||
|  |     const __m128i g = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0)); | ||||||
|  |     const __m128i h_16 = _mm_add_epi16(g, _mm_packs_epi32(f0, f1)); | ||||||
|  |     const __m128i final = _mm_max_epi16(_mm_min_epi16(h_16, max), zero); | ||||||
|  |     _mm_storeu_si128((__m128i*)(out + 2 * i + 0), final); | ||||||
|  |   } | ||||||
|  |   for (; i < len; ++i) { | ||||||
|  |     //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 = | ||||||
|  |     // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4 | ||||||
|  |     // We reuse the common sub-expressions. | ||||||
|  |     const int a0b1 = A[i + 0] + B[i + 1]; | ||||||
|  |     const int a1b0 = A[i + 1] + B[i + 0]; | ||||||
|  |     const int a0a1b0b1 = a0b1 + a1b0 + 8; | ||||||
|  |     const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; | ||||||
|  |     const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; | ||||||
|  |     out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y); | ||||||
|  |     out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, | ||||||
|  |                                    const uint16_t* best_y, uint16_t* out, | ||||||
|  |                                    int bit_depth) { | ||||||
|  |   if (bit_depth <= 10) { | ||||||
|  |     SharpYuvFilterRow16_SSE2(A, B, len, best_y, out, bit_depth); | ||||||
|  |   } else { | ||||||
|  |     SharpYuvFilterRow32_SSE2(A, B, len, best_y, out, bit_depth); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| //------------------------------------------------------------------------------ | //------------------------------------------------------------------------------ | ||||||
|  |  | ||||||
| extern void InitSharpYuvSSE2(void); | extern void InitSharpYuvSSE2(void); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user