diff --git a/src/dsp/dec_sse2.c b/src/dsp/dec_sse2.c index 873aa59e..0aa4cb16 100644 --- a/src/dsp/dec_sse2.c +++ b/src/dsp/dec_sse2.c @@ -158,10 +158,10 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS)); } else { // Load four bytes/pixels per line. - dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); - dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); - dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); - dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); + dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS)); + dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS)); + dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS)); + dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS)); } // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); @@ -213,10 +213,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { const __m128i m3 = _mm_subs_epi16(B, d4); const __m128i zero = _mm_setzero_si128(); // Load the source pixels. - __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); - __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); - __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); - __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); + __m128i dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS)); + __m128i dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS)); + __m128i dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS)); + __m128i dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS)); // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); dst1 = _mm_unpacklo_epi8(dst1, zero); @@ -477,11 +477,11 @@ static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride, // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 const __m128i A0 = _mm_set_epi32( - WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]), - WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride])); + WebPMemToInt32(&b[6 * stride]), WebPMemToInt32(&b[2 * stride]), + WebPMemToInt32(&b[4 * stride]), WebPMemToInt32(&b[0 * stride])); const __m128i A1 = _mm_set_epi32( - WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]), - WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride])); + WebPMemToInt32(&b[7 * stride]), WebPMemToInt32(&b[3 * stride]), + WebPMemToInt32(&b[5 * stride]), WebPMemToInt32(&b[1 * stride])); // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 @@ -1015,7 +1015,7 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) { const __m128i zero = _mm_setzero_si128(); int y; if (size == 4) { - const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); + const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); for (y = 0; y < 4; ++y, dst += BPS) { const int val = dst[-1] - top[-1]; diff --git a/src/dsp/dec_sse41.c b/src/dsp/dec_sse41.c index 8f18506d..08a36302 100644 --- a/src/dsp/dec_sse41.c +++ b/src/dsp/dec_sse41.c @@ -23,7 +23,7 @@ static void HE16_SSE41(uint8_t* dst) { // horizontal int j; const __m128i kShuffle3 = _mm_set1_epi8(3); for (j = 16; j > 0; --j) { - const __m128i in = _mm_cvtsi32_si128(WebPMemToUint32(dst - 4)); + const __m128i in = _mm_cvtsi32_si128(WebPMemToInt32(dst - 4)); const __m128i values = _mm_shuffle_epi8(in, kShuffle3); _mm_storeu_si128((__m128i*)dst, values); dst += BPS; diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index b2e78ed9..f4babc62 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -156,10 +156,10 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); } else { // Load four bytes/pixels per line. - ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS])); - ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS])); - ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS])); - ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS])); + ref0 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[0 * BPS])); + ref1 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[1 * BPS])); + ref2 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[2 * BPS])); + ref3 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[3 * BPS])); } // Convert to 16b. ref0 = _mm_unpacklo_epi8(ref0, zero); @@ -875,7 +875,7 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) { static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) { const __m128i zero = _mm_setzero_si128(); - const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); + const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); int y; for (y = 0; y < 4; ++y, dst += BPS) { diff --git a/src/dsp/rescaler_sse2.c b/src/dsp/rescaler_sse2.c index d7effea1..8b231d85 100644 --- a/src/dsp/rescaler_sse2.c +++ b/src/dsp/rescaler_sse2.c @@ -132,7 +132,7 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, __m128i base = zero; accum += wrk->x_add; while (accum > 0) { - const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src)); + const __m128i A = _mm_cvtsi32_si128(WebPMemToInt32(src)); src += 4; base = _mm_unpacklo_epi8(A, zero); // To avoid overflow, we need: base * x_add / x_sub < 32768 diff --git a/src/utils/utils.h b/src/utils/utils.h index ef04f108..5211b81b 100644 --- a/src/utils/utils.h +++ b/src/utils/utils.h @@ -73,6 +73,11 @@ static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) { memcpy(&A, ptr, sizeof(A)); return A; } + +static WEBP_INLINE int32_t WebPMemToInt32(const uint8_t* const ptr) { + return (int32_t)WebPMemToUint32(ptr); +} + static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) { memcpy(ptr, &val, sizeof(val)); }