Do not use a register array in SSE.

Change-Id: I79cf95bdac1164fc4de899828e9380c23df8d141
This commit is contained in:
Vincent Rabaud
2016-11-24 13:02:08 +01:00
parent 67748b41db
commit 7474d46e45
3 changed files with 122 additions and 123 deletions

View File

@ -217,39 +217,26 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
__m128i* out = (__m128i*)dst;
while (num_pixels >= 32) {
__m128i rgb_planar[6];
{
const __m128i bgra[4] = { _mm_loadu_si128(in + 0),
_mm_loadu_si128(in + 1),
_mm_loadu_si128(in + 2),
_mm_loadu_si128(in + 3) };
__m128i bgra_planar[4];
VP8L32bToPlanar(bgra, bgra_planar);
rgb_planar[0] = _mm_loadu_si128(bgra_planar + 1);
rgb_planar[2] = _mm_loadu_si128(bgra_planar + 2);
rgb_planar[4] = _mm_loadu_si128(bgra_planar + 3);
}
{
const __m128i bgra[4] = { _mm_loadu_si128(in + 4),
_mm_loadu_si128(in + 5),
_mm_loadu_si128(in + 6),
_mm_loadu_si128(in + 7) };
__m128i bgra_planar[4];
VP8L32bToPlanar(bgra, bgra_planar);
rgb_planar[1] = _mm_loadu_si128(bgra_planar + 1);
rgb_planar[3] = _mm_loadu_si128(bgra_planar + 2);
rgb_planar[5] = _mm_loadu_si128(bgra_planar + 3);
}
{
__m128i bgr[6];
VP8PlanarTo24b(rgb_planar, bgr);
_mm_storeu_si128(out + 0, bgr[0]);
_mm_storeu_si128(out + 1, bgr[1]);
_mm_storeu_si128(out + 2, bgr[2]);
_mm_storeu_si128(out + 3, bgr[3]);
_mm_storeu_si128(out + 4, bgr[4]);
_mm_storeu_si128(out + 5, bgr[5]);
}
// Load the BGRA buffers.
__m128i in0 = _mm_loadu_si128(in + 0);
__m128i in1 = _mm_loadu_si128(in + 1);
__m128i in2 = _mm_loadu_si128(in + 2);
__m128i in3 = _mm_loadu_si128(in + 3);
__m128i in4 = _mm_loadu_si128(in + 4);
__m128i in5 = _mm_loadu_si128(in + 5);
__m128i in6 = _mm_loadu_si128(in + 6);
__m128i in7 = _mm_loadu_si128(in + 7);
VP8L32bToPlanar(&in0, &in1, &in2, &in3);
VP8L32bToPlanar(&in4, &in5, &in6, &in7);
// At this point, in1/in5 contain red only, in2/in6 green only ...
// Pack the colors in 24b RGB.
VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7);
_mm_storeu_si128(out + 0, in1);
_mm_storeu_si128(out + 1, in5);
_mm_storeu_si128(out + 2, in2);
_mm_storeu_si128(out + 3, in6);
_mm_storeu_si128(out + 4, in3);
_mm_storeu_si128(out + 5, in7);
in += 8;
out += 6;
num_pixels -= 32;