SSE2 variants of Subtract-Green: Rectify loop condition

When exactly 4 pixels are left, they should be processed with SSE2; the old loop condition `i + 4 < num_pixels` stopped one iteration short in that case.

Decoding is marginally faster (~0.4%).
Encoding speed: No observable difference.

Change-Id: I3cf21c07145a560ff795451e65e64faf148d5c3e
This commit is contained in:
Urvang Joshi 2014-03-31 10:51:45 -07:00
parent daccbf400d
commit 4fd7c82e6a

View File

@@ -123,7 +123,7 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
const __m128i mask = _mm_set1_epi32(0x0000ff00); const __m128i mask = _mm_set1_epi32(0x0000ff00);
int i; int i;
for (i = 0; i + 4 < num_pixels; i += 4) { for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|... const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|... const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
@@ -139,7 +139,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
const __m128i mask = _mm_set1_epi32(0x0000ff00); const __m128i mask = _mm_set1_epi32(0x0000ff00);
int i; int i;
for (i = 0; i + 4 < num_pixels; i += 4) { for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|... const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|... const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...