From 9e356d6b258a8bc1cc7d9ffd707ed9aadf26d92a Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Wed, 24 Jun 2015 09:36:44 +0200 Subject: [PATCH] SSE2: slightly faster (~5%) AddGreenToBlueAndRed() Change-Id: Ie147010b66544c4e959f26966ad588394302d418 --- src/dsp/lossless_sse2.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/dsp/lossless_sse2.c b/src/dsp/lossless_sse2.c index 2bf09e63..34cefc72 100644 --- a/src/dsp/lossless_sse2.c +++ b/src/dsp/lossless_sse2.c @@ -156,15 +156,13 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { // Subtract-Green Transform static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { - const __m128i mask = _mm_set1_epi32(0x0000ff00); int i; for (i = 0; i + 4 <= num_pixels; i += 4) { - const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); - const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|... - const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|... - const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|... - const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g); - const __m128i out = _mm_add_epi8(in, in_0g0g); + const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb + const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g + const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); + const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g + const __m128i out = _mm_add_epi8(in, C); _mm_storeu_si128((__m128i*)&argb_data[i], out); } // fallthrough and finish off with plain-C