apply clang-format

(Debian clang-format version 19.1.7 (3+build4)) with `--style=Google`.

Manual changes:
* clang-format disabled around macros with stringification (mostly assembly)
* some inline assembly strings were adjusted to avoid awkward line breaks
* trailing commas, `//` or suffixes (`ull`) added to help array formatting
* thread_utils.c: parameter comments were changed to the more common /*...=*/ style to improve formatting

The automatically generated code under swig/ was skipped.

Bug: 433996651
Change-Id: Iea3f24160d78d2a2653971cdf13fa932e47ff1b3
2025-08-29 23:32:05 +02:00 · 2025-07-28 18:23:12 -07:00
parent b569988d3f
commit 44257cb826
224 changed files with 16312 additions and 16734 deletions
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@@ -129,8 +129,8 @@ static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
  return (uint32_t)_mm_cvtsi128_si32(A2);
 }

-static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
-                                          uint32_t a2, uint32_t a3) {
+static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1, uint32_t a2,
+                                          uint32_t a3) {
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
@@ -233,21 +233,21 @@ static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,

 // Macro that adds 32-bit integers from IN using mod 256 arithmetic
 // per 8 bit channel.
-#define GENERATE_PREDICTOR_1(X, IN)                                           \
-static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels,                            \
-                                   uint32_t* WEBP_RESTRICT out) {             \
-  int i;                                                                      \
-  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
-    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
-    const __m128i other = _mm_loadu_si128((const __m128i*)&(IN));             \
-    const __m128i res = _mm_add_epi8(src, other);                             \
-    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
-  }                                                                           \
-  if (i != num_pixels) {                                                      \
-    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
-  }                                                                           \
-}
+#define GENERATE_PREDICTOR_1(X, IN)                                         \
+  static void PredictorAdd##X##_SSE2(const uint32_t* in,                    \
+                                     const uint32_t* upper, int num_pixels, \
+                                     uint32_t* WEBP_RESTRICT out) {         \
+    int i;                                                                  \
+    for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
+      const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);          \
+      const __m128i other = _mm_loadu_si128((const __m128i*)&(IN));         \
+      const __m128i res = _mm_add_epi8(src, other);                         \
+      _mm_storeu_si128((__m128i*)&out[i], res);                             \
+    }                                                                       \
+    if (i != num_pixels) {                                                  \
+      VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
+    }                                                                       \
+  }

 // Predictor2: Top.
 GENERATE_PREDICTOR_1(2, upper[i])
@@ -263,24 +263,24 @@ GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
 GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
 GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)

-#define GENERATE_PREDICTOR_2(X, IN)                                           \
-static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels,                            \
-                                   uint32_t* WEBP_RESTRICT out) {             \
-  int i;                                                                      \
-  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
-    const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));            \
-    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);             \
-    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
-    __m128i avg, res;                                                         \
-    Average2_m128i(&T, &Tother, &avg);                                        \
-    res = _mm_add_epi8(avg, src);                                             \
-    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
-  }                                                                           \
-  if (i != num_pixels) {                                                      \
-    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
-  }                                                                           \
-}
+#define GENERATE_PREDICTOR_2(X, IN)                                         \
+  static void PredictorAdd##X##_SSE2(const uint32_t* in,                    \
+                                     const uint32_t* upper, int num_pixels, \
+                                     uint32_t* WEBP_RESTRICT out) {         \
+    int i;                                                                  \
+    for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
+      const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));        \
+      const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);         \
+      const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);          \
+      __m128i avg, res;                                                     \
+      Average2_m128i(&T, &Tother, &avg);                                    \
+      res = _mm_add_epi8(avg, src);                                         \
+      _mm_storeu_si128((__m128i*)&out[i], res);                             \
+    }                                                                       \
+    if (i != num_pixels) {                                                  \
+      VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
+    }                                                                       \
+  }
 // Predictor8: average TL T.
 GENERATE_PREDICTOR_2(8, upper[i - 1])
 // Predictor9: average T TR.
@@ -288,20 +288,22 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 #undef GENERATE_PREDICTOR_2

 // Predictor10: average of (average of (L,TL), average of (T, TR)).
-#define DO_PRED10(OUT) do {                         \
-  __m128i avgLTL, avg;                              \
-  Average2_m128i(&L, &TL, &avgLTL);                 \
-  Average2_m128i(&avgTTR, &avgLTL, &avg);           \
-  L = _mm_add_epi8(avg, src);                       \
-  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);  \
-} while (0)
+#define DO_PRED10(OUT)                               \
+  do {                                               \
+    __m128i avgLTL, avg;                             \
+    Average2_m128i(&L, &TL, &avgLTL);                \
+    Average2_m128i(&avgTTR, &avgLTL, &avg);          \
+    L = _mm_add_epi8(avg, src);                      \
+    out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \
+  } while (0)

-#define DO_PRED10_SHIFT do {                                  \
-  /* Rotate the pre-computed values for the next iteration.*/ \
-  avgTTR = _mm_srli_si128(avgTTR, 4);                         \
-  TL = _mm_srli_si128(TL, 4);                                 \
-  src = _mm_srli_si128(src, 4);                               \
-} while (0)
+#define DO_PRED10_SHIFT                                         \
+  do {                                                          \
+    /* Rotate the pre-computed values for the next iteration.*/ \
+    avgTTR = _mm_srli_si128(avgTTR, 4);                         \
+    TL = _mm_srli_si128(TL, 4);                                 \
+    src = _mm_srli_si128(src, 4);                               \
+  } while (0)

 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
@@ -330,25 +332,27 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
 #undef DO_PRED10_SHIFT

 // Predictor11: select.
-#define DO_PRED11(OUT) do {                                            \
-  const __m128i L_lo = _mm_unpacklo_epi32(L, T);                       \
-  const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                     \
-  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/   \
-  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                        \
-  const __m128i A = _mm_and_si128(mask, L);                            \
-  const __m128i B = _mm_andnot_si128(mask, T);                         \
-  const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
-  L = _mm_add_epi8(src, pred);                                         \
-  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);                     \
-} while (0)
+#define DO_PRED11(OUT)                                                   \
+  do {                                                                   \
+    const __m128i L_lo = _mm_unpacklo_epi32(L, T);                       \
+    const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                     \
+    const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/   \
+    const __m128i mask = _mm_cmpgt_epi32(pb, pa);                        \
+    const __m128i A = _mm_and_si128(mask, L);                            \
+    const __m128i B = _mm_andnot_si128(mask, T);                         \
+    const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
+    L = _mm_add_epi8(src, pred);                                         \
+    out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);                     \
+  } while (0)

-#define DO_PRED11_SHIFT do {                                \
-  /* Shift the pre-computed value for the next iteration.*/ \
-  T = _mm_srli_si128(T, 4);                                 \
-  TL = _mm_srli_si128(TL, 4);                               \
-  src = _mm_srli_si128(src, 4);                             \
-  pa = _mm_srli_si128(pa, 4);                               \
-} while (0)
+#define DO_PRED11_SHIFT                                       \
+  do {                                                        \
+    /* Shift the pre-computed value for the next iteration.*/ \
+    T = _mm_srli_si128(T, 4);                                 \
+    TL = _mm_srli_si128(TL, 4);                               \
+    src = _mm_srli_si128(src, 4);                             \
+    pa = _mm_srli_si128(pa, 4);                               \
+  } while (0)

 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
@@ -387,19 +391,21 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
 #undef DO_PRED11_SHIFT

 // Predictor12: ClampedAddSubtractFull.
-#define DO_PRED12(DIFF, LANE, OUT) do {              \
-  const __m128i all = _mm_add_epi16(L, (DIFF));      \
-  const __m128i alls = _mm_packus_epi16(all, all);   \
-  const __m128i res = _mm_add_epi8(src, alls);       \
-  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res); \
-  L = _mm_unpacklo_epi8(res, zero);                  \
-} while (0)
+#define DO_PRED12(DIFF, LANE, OUT)                     \
+  do {                                                 \
+    const __m128i all = _mm_add_epi16(L, (DIFF));      \
+    const __m128i alls = _mm_packus_epi16(all, all);   \
+    const __m128i res = _mm_add_epi8(src, alls);       \
+    out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res); \
+    L = _mm_unpacklo_epi8(res, zero);                  \
+  } while (0)

-#define DO_PRED12_SHIFT(DIFF, LANE) do {                    \
-  /* Shift the pre-computed value for the next iteration.*/ \
-  if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8);      \
-  src = _mm_srli_si128(src, 4);                             \
-} while (0)
+#define DO_PRED12_SHIFT(DIFF, LANE)                           \
+  do {                                                        \
+    /* Shift the pre-computed value for the next iteration.*/ \
+    if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8);      \
+    src = _mm_srli_si128(src, 4);                             \
+  } while (0)

 static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
@@ -444,8 +450,8 @@ static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
-    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
-    const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
+    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
+    const __m128i A = _mm_srli_epi16(in, 8);                      // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
@@ -464,7 +470,7 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
 // sign-extended multiplying constants, pre-shifted by 5.
-#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
+#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
 #define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red), CST(green_to_blue));
@@ -474,17 +480,17 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
-    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
-    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
+    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
+    const __m128i A = _mm_and_si128(in, mask_ag);  // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
-    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
-    const __m128i E = _mm_add_epi8(in, D);             // x r'  x   b'
-    const __m128i F = _mm_slli_epi16(E, 8);            // r' 0   b' 0
-    const __m128i G = _mm_mulhi_epi16(F, mults_b2);    // x db2  0  0
-    const __m128i H = _mm_srli_epi32(G, 8);            // 0  x db2  0
-    const __m128i I = _mm_add_epi8(H, F);              // r' x  b'' 0
-    const __m128i J = _mm_srli_epi16(I, 8);            // 0  r'  0  b''
+    const __m128i D = _mm_mulhi_epi16(C, mults_rb);  // x dr  x db1
+    const __m128i E = _mm_add_epi8(in, D);           // x r'  x   b'
+    const __m128i F = _mm_slli_epi16(E, 8);          // r' 0   b' 0
+    const __m128i G = _mm_mulhi_epi16(F, mults_b2);  // x db2  0  0
+    const __m128i H = _mm_srli_epi32(G, 8);          // 0  x db2  0
+    const __m128i I = _mm_add_epi8(H, F);            // r' x  b'' 0
+    const __m128i J = _mm_srli_epi16(I, 8);          // 0  r'  0  b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
@@ -569,21 +575,21 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* WEBP_RESTRICT src,
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
-    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
-    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
+    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
+    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
-    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);    // b0b2b4b6g0g2g4g6...
-    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);    // b1b3b5b7g1g3g5g7...
-    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);    // b0...b7 | g0...g7
-    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);    // r0...r7 | a0...a7
-    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);   // g0...g7 | a0...a7
-    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);   // r0...r7 | b0...b7
-    const __m128i ga1 = _mm_srli_epi16(ga0, 4);         // g0-|g1-|...|a6-|a7-
-    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);  // -r0|-r1|...|-b6|-a7
-    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
-    const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
-    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
+    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
+    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
+    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
+    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
+    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
+    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
+    const __m128i ga1 = _mm_srli_epi16(ga0, 4);           // g0-|g1-|...|a6-|a7-
+    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);    // -r0|-r1|...|-b6|-a7
+    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);    // g0-|g1-|...|a6-|a7-
+    const __m128i rgba0 = _mm_or_si128(ga2, rb1);         // rg0..rg7 | ba0..ba7
+    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);       // ba0..ba7 | 0
 #if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
 #else
@@ -607,8 +613,8 @@ static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src,
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
-    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
-    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
+    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
+    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
@@ -625,11 +631,11 @@ static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src,
    const __m128i b0 = _mm_srli_si128(rb1, 8);              // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);           // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
-    const __m128i gb1 = _mm_or_si128(b1, g_hi2);            // bg0...bg7|xx
+    const __m128i gb1 = _mm_or_si128(b1, g_hi2);  // bg0...bg7|xx
 #if (WEBP_SWAP_16BIT_CSP == 1)
-    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);     // rggb0...rggb7
+    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);  // rggb0...rggb7
 #else
-    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);     // bgrb0...bgrb7
+    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);  // bgrb0...bgrb7
 #endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
@@ -648,22 +654,22 @@ static void ConvertBGRAToBGR_SSE2(const uint32_t* WEBP_RESTRICT src,
  const uint8_t* const end = dst + num_pixels * 3;
  // the last storel_epi64 below writes 8 bytes starting at offset 18
  while (dst + 26 <= end) {
-    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
-    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
-    const __m128i a0l = _mm_and_si128(bgra0, mask_l);   // bgr0|0|bgr0|0
-    const __m128i a4l = _mm_and_si128(bgra4, mask_l);   // bgr0|0|bgr0|0
-    const __m128i a0h = _mm_and_si128(bgra0, mask_h);   // 0|bgr0|0|bgr0
-    const __m128i a4h = _mm_and_si128(bgra4, mask_h);   // 0|bgr0|0|bgr0
-    const __m128i b0h = _mm_srli_epi64(a0h, 8);         // 000b|gr00|000b|gr00
-    const __m128i b4h = _mm_srli_epi64(a4h, 8);         // 000b|gr00|000b|gr00
-    const __m128i c0 = _mm_or_si128(a0l, b0h);          // rgbrgb00|rgbrgb00
-    const __m128i c4 = _mm_or_si128(a4l, b4h);          // rgbrgb00|rgbrgb00
+    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
+    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
+    const __m128i a0l = _mm_and_si128(bgra0, mask_l);  // bgr0|0|bgr0|0
+    const __m128i a4l = _mm_and_si128(bgra4, mask_l);  // bgr0|0|bgr0|0
+    const __m128i a0h = _mm_and_si128(bgra0, mask_h);  // 0|bgr0|0|bgr0
+    const __m128i a4h = _mm_and_si128(bgra4, mask_h);  // 0|bgr0|0|bgr0
+    const __m128i b0h = _mm_srli_epi64(a0h, 8);        // 000b|gr00|000b|gr00
+    const __m128i b4h = _mm_srli_epi64(a4h, 8);        // 000b|gr00|000b|gr00
+    const __m128i c0 = _mm_or_si128(a0l, b0h);         // rgbrgb00|rgbrgb00
+    const __m128i c4 = _mm_or_si128(a4l, b4h);         // rgbrgb00|rgbrgb00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
-    _mm_storel_epi64((__m128i*)(dst +   0), c0);
-    _mm_storel_epi64((__m128i*)(dst +   6), c2);
-    _mm_storel_epi64((__m128i*)(dst +  12), c4);
-    _mm_storel_epi64((__m128i*)(dst +  18), c6);
+    _mm_storel_epi64((__m128i*)(dst + 0), c0);
+    _mm_storel_epi64((__m128i*)(dst + 6), c2);
+    _mm_storel_epi64((__m128i*)(dst + 12), c4);
+    _mm_storel_epi64((__m128i*)(dst + 18), c6);
    dst += 24;
    num_pixels -= 8;
  }