mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 18:35:41 +01:00 
			
		
		
		
	Speed DispatchAlpha_SSE2 up
On some dataset, this was taking 2.5%; 2% when switching to _mm_maskmoveu_si128, and 1.7% when using _mm_loadu_si128. Confirmed by IACA: going from a throughput of 4.26 to 3.5, and then to 6.26 for twice the input. Change-Id: I409f901aaad9d39bf55a1aac28cc25f126876b01
This commit is contained in:
		| @@ -16,6 +16,8 @@ | ||||
| #if defined(WEBP_USE_SSE2) | ||||
| #include <emmintrin.h> | ||||
|  | ||||
| #include "src/dsp/cpu.h" | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
|  | ||||
| static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, | ||||
| @@ -26,38 +28,44 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, | ||||
|   uint32_t alpha_and = 0xff; | ||||
|   int i, j; | ||||
|   const __m128i zero = _mm_setzero_si128(); | ||||
|   const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00);  // to preserve RGB | ||||
|   const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0); | ||||
|   __m128i all_alphas = all_0xff; | ||||
|   const __m128i alpha_mask = _mm_set1_epi32((int)0xff);  // to preserve A | ||||
|   const __m128i all_0xff = _mm_set1_epi8(0xff); | ||||
|   __m128i all_alphas16 = all_0xff; | ||||
|   __m128i all_alphas8 = all_0xff; | ||||
|  | ||||
|   // We must be able to access 3 extra bytes after the last written byte | ||||
|   // 'dst[4 * width - 4]', because we don't know if alpha is the first or the | ||||
|   // last byte of the quadruplet. | ||||
|   const int limit = (width - 1) & ~7; | ||||
|  | ||||
|   for (j = 0; j < height; ++j) { | ||||
|     __m128i* out = (__m128i*)dst; | ||||
|     for (i = 0; i < limit; i += 8) { | ||||
|     char* ptr = (char*)dst; | ||||
|     for (i = 0; i + 16 <= width - 1; i += 16) { | ||||
|       // load 16 alpha bytes | ||||
|       const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]); | ||||
|       const __m128i a1_lo = _mm_unpacklo_epi8(a0, zero); | ||||
|       const __m128i a1_hi = _mm_unpackhi_epi8(a0, zero); | ||||
|       const __m128i a2_lo_lo = _mm_unpacklo_epi16(a1_lo, zero); | ||||
|       const __m128i a2_lo_hi = _mm_unpackhi_epi16(a1_lo, zero); | ||||
|       const __m128i a2_hi_lo = _mm_unpacklo_epi16(a1_hi, zero); | ||||
|       const __m128i a2_hi_hi = _mm_unpackhi_epi16(a1_hi, zero); | ||||
|       _mm_maskmoveu_si128(a2_lo_lo, alpha_mask, ptr + 0); | ||||
|       _mm_maskmoveu_si128(a2_lo_hi, alpha_mask, ptr + 16); | ||||
|       _mm_maskmoveu_si128(a2_hi_lo, alpha_mask, ptr + 32); | ||||
|       _mm_maskmoveu_si128(a2_hi_hi, alpha_mask, ptr + 48); | ||||
|       // accumulate 16 alpha 'and' in parallel | ||||
|       all_alphas16 = _mm_and_si128(all_alphas16, a0); | ||||
|       ptr += 64; | ||||
|     } | ||||
|     if (i + 8 <= width - 1) { | ||||
|       // load 8 alpha bytes | ||||
|       const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]); | ||||
|       const __m128i a1 = _mm_unpacklo_epi8(a0, zero); | ||||
|       const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero); | ||||
|       const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero); | ||||
|       // load 8 dst pixels (32 bytes) | ||||
|       const __m128i b0_lo = _mm_loadu_si128(out + 0); | ||||
|       const __m128i b0_hi = _mm_loadu_si128(out + 1); | ||||
|       // mask dst alpha values | ||||
|       const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask); | ||||
|       const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask); | ||||
|       // combine | ||||
|       const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo); | ||||
|       const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi); | ||||
|       // store | ||||
|       _mm_storeu_si128(out + 0, b2_lo); | ||||
|       _mm_storeu_si128(out + 1, b2_hi); | ||||
|       // accumulate eight alpha 'and' in parallel | ||||
|       all_alphas = _mm_and_si128(all_alphas, a0); | ||||
|       out += 2; | ||||
|       _mm_maskmoveu_si128(a2_lo, alpha_mask, ptr); | ||||
|       _mm_maskmoveu_si128(a2_hi, alpha_mask, ptr + 16); | ||||
|       // accumulate 8 alpha 'and' in parallel | ||||
|       all_alphas8 = _mm_and_si128(all_alphas8, a0); | ||||
|       i += 8; | ||||
|     } | ||||
|     for (; i < width; ++i) { | ||||
|       const uint32_t alpha_value = alpha[i]; | ||||
| @@ -68,8 +76,9 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, | ||||
|     dst += dst_stride; | ||||
|   } | ||||
|   // Combine the eight alpha 'and' into a 8-bit mask. | ||||
|   alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff)); | ||||
|   return (alpha_and != 0xff); | ||||
|   alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas8, all_0xff)) & 0xff; | ||||
|   return (alpha_and != 0xff || | ||||
|           _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas16, all_0xff)) != 0xffff); | ||||
| } | ||||
|  | ||||
| static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user