mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 02:15:42 +01:00 
			
		
		
		
	speed-up SetResidualSSE2
(was unnecessarily complicated)

Before: VP8SetResidualCoeffs: checksum = 1127918 elapsed = 475 ms.
Change-Id: Ia54bef86c45f9f474622ff16e594bf1da4f67ebd
After: VP8SetResidualCoeffs: checksum = 1127918 elapsed = 404 ms.
This commit is contained in:
		
				
					committed by
					
						 James Zern
						James Zern
					
				
			
			
				
	
			
			
			
						parent
						
							bf46d0acff
						
					
				
				
					commit
					f262d6120e
				
			| @@ -24,24 +24,21 @@ | ||||
|  | ||||
| static void SetResidualCoeffsSSE2(const int16_t* const coeffs, | ||||
|                                   VP8Residual* const res) { | ||||
|   const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs); | ||||
|   const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0)); | ||||
|   const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8)); | ||||
|   // Use SSE to compare 8 values with a single instruction. | ||||
|   // Use SSE2 to compare 16 values with a single instruction. | ||||
|   const __m128i zero = _mm_setzero_si128(); | ||||
|   const __m128i m0 = _mm_cmpeq_epi16(c0, zero); | ||||
|   const __m128i m1 = _mm_cmpeq_epi16(c1, zero); | ||||
|   // Get the comparison results as a bitmask, consisting of two times 16 bits: | ||||
|   // two identical bits for each result. Concatenate both bitmasks to get a | ||||
|   // single 32 bit value. Negate the mask to get the position of entries that | ||||
|   // are not equal to zero. We don't need to mask out least significant bits | ||||
|   // according to res->first, since coeffs[0] is 0 if res->first > 0 | ||||
|   const uint32_t mask = | ||||
|       ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0)); | ||||
|   const __m128i m0 = _mm_packs_epi16(c0, c1); | ||||
|   const __m128i m1 = _mm_cmpeq_epi8(m0, zero); | ||||
|   // Get the comparison results as a bitmask into 16bits. Negate the mask to get | ||||
|   // the position of entries that are not equal to zero. We don't need to mask | ||||
|   // out least significant bits according to res->first, since coeffs[0] is 0 | ||||
|   // if res->first > 0. | ||||
|   const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1); | ||||
|   // The position of the most significant non-zero bit indicates the position of | ||||
|   // the last non-zero value. Divide the result by two because __movemask_epi8 | ||||
|   // operates on 8 bit values instead of 16 bit values. | ||||
|   // the last non-zero value. | ||||
|   assert(res->first == 0 || coeffs[0] == 0); | ||||
|   res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1; | ||||
|   res->last = mask ? BitsLog2Floor(mask) : -1; | ||||
|   res->coeffs = coeffs; | ||||
| } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user