From 27bfeee43aadbbafe5d244a90392ed0ea43ecdee Mon Sep 17 00:00:00 2001 From: levytamar82 Date: Wed, 18 Jun 2014 04:45:51 -0700 Subject: [PATCH] QuantizeBlock SSE2 Optimization: Another store-to-load forwarding block was detected, coming from the function FTransform. FTransform saves its output data as four stores of 8 bytes each. When this data is later loaded by the QuantizeBlock function in one chunk of 16 bytes, it causes a store-to-load forwarding block. The fix was made in the FTransform function, where each pair of consecutive 8-byte values is merged into one 16-byte register and then saved to memory. This fix gives a ~21% function-level gain and a 1.6% user-level gain. Change-Id: Idc27c307d5083f3ebe206d3ca19059e5bd465992 --- src/dsp/enc_sse2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index 53332f7f..f95c1b66 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -445,10 +445,10 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { // -> f1 = f1 + 1 - (a3 == 0) const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); - _mm_storel_epi64((__m128i*)&out[ 0], d0); - _mm_storel_epi64((__m128i*)&out[ 4], g1); - _mm_storel_epi64((__m128i*)&out[ 8], d2); - _mm_storel_epi64((__m128i*)&out[12], f3); + const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); + const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); + _mm_storeu_si128((__m128i*)&out[0], d0_g1); + _mm_storeu_si128((__m128i*)&out[8], d2_f3); } }