mirror of https://github.com/webmproject/libwebp.git
synced 2024-11-20 12:28:26 +01:00

add intrinsics NEON code for chroma strong-filtering

The nice trick is to pack 8 u + 8 v samples into a single uint8x16_t
register, and re-use the previous (luma) functions.

Change-Id: Idf50ed2d6b7137ea080d603062bc9e0c66d79f38

parent 2132992d47
commit e351ec0759
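
The trick in isolation: one chroma row from each plane is loaded as a
uint8x8_t, the two halves are fused into one uint8x16_t, and after filtering
the halves are split back out. A minimal sketch of the idea (the helper names
PackUV and UnpackUV are illustrative, not part of this commit):

    #include <arm_neon.h>

    // Fuse one u row and one v row into a single 16-lane register:
    // lanes 0..7 hold u samples, lanes 8..15 hold v samples.
    static uint8x16_t PackUV(const uint8_t* const u_row,
                             const uint8_t* const v_row) {
      return vcombine_u8(vld1_u8(u_row), vld1_u8(v_row));
    }

    // Split a filtered register back into the two chroma planes.
    static void UnpackUV(const uint8x16_t uv,
                         uint8_t* const u_row, uint8_t* const v_row) {
      vst1_u8(u_row, vget_low_u8(uv));
      vst1_u8(v_row, vget_high_u8(uv));
    }

Because the u and v lanes never interact, any 16-lane luma kernel applied to
the fused register filters both planes at once.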
@@ -235,6 +235,87 @@ static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
   Load16x4(src + 2 * stride, stride, q0, q1, q2, q3);
 }
 
+static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
+                                  const uint8_t* const v,
+                                  int stride,
+                                  uint8x16_t* const p3, uint8x16_t* const p2,
+                                  uint8x16_t* const p1, uint8x16_t* const p0,
+                                  uint8x16_t* const q0, uint8x16_t* const q1,
+                                  uint8x16_t* const q2, uint8x16_t* const q3) {
+  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
+  // and the v-samples on the higher half.
+  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
+  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
+  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
+  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
+  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
+  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
+  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
+  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
+}
+
+#define LOAD_UV_8(ROW) \
+  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
+
+static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
+                                   const uint8_t* const v,
+                                   int stride,
+                                   uint8x16_t* const p3, uint8x16_t* const p2,
+                                   uint8x16_t* const p1, uint8x16_t* const p0,
+                                   uint8x16_t* const q0, uint8x16_t* const q1,
+                                   uint8x16_t* const q2, uint8x16_t* const q3) {
+  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
+  // and the v-samples on the higher half.
+  const uint8x16_t row0 = LOAD_UV_8(0);
+  const uint8x16_t row1 = LOAD_UV_8(1);
+  const uint8x16_t row2 = LOAD_UV_8(2);
+  const uint8x16_t row3 = LOAD_UV_8(3);
+  const uint8x16_t row4 = LOAD_UV_8(4);
+  const uint8x16_t row5 = LOAD_UV_8(5);
+  const uint8x16_t row6 = LOAD_UV_8(6);
+  const uint8x16_t row7 = LOAD_UV_8(7);
+  // Perform two side-by-side 8x8 transposes:
+  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
+  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
+  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
+  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
+  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
+  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
+  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
+  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
+  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
+                                                    // u01 u11 u03 u13 ...
+  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
+                                                    // u21 u31 u23 u33 ...
+  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
+  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
+  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
+                                       vreinterpretq_u16_u8(row23.val[0]));
+  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
+                                       vreinterpretq_u16_u8(row23.val[1]));
+  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
+                                       vreinterpretq_u16_u8(row67.val[0]));
+  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
+                                       vreinterpretq_u16_u8(row67.val[1]));
+  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
+                                       vreinterpretq_u32_u16(row46.val[0]));
+  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
+                                       vreinterpretq_u32_u16(row46.val[1]));
+  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
+                                       vreinterpretq_u32_u16(row57.val[0]));
+  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
+                                       vreinterpretq_u32_u16(row57.val[1]));
+  *p3 = vreinterpretq_u8_u32(row04.val[0]);
+  *p2 = vreinterpretq_u8_u32(row15.val[0]);
+  *p1 = vreinterpretq_u8_u32(row26.val[0]);
+  *p0 = vreinterpretq_u8_u32(row37.val[0]);
+  *q0 = vreinterpretq_u8_u32(row04.val[1]);
+  *q1 = vreinterpretq_u8_u32(row15.val[1]);
+  *q2 = vreinterpretq_u8_u32(row26.val[1]);
+  *q3 = vreinterpretq_u8_u32(row37.val[1]);
+}
+#undef LOAD_UV_8
+
 static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
                                  uint8_t* const dst, int stride) {
   vst2_lane_u8(dst + 0 * stride, v, 0);
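
For reference, here is a scalar model (illustrative only, not part of the
commit) of the layout Load8x8x2T produces: output register k, counting
p3, p2, p1, p0, q0, q1, q2, q3 as k = 0..7, holds column k of the u block in
its low 8 bytes and column k of the v block in its high 8 bytes. The NEON
version reaches this layout with three rounds of vtrnq at 8-, 16- and 32-bit
granularity; since no vtrnq stage moves bytes across the 64-bit halves of a
q-register, the two 8x8 transposes proceed side by side.

    // Scalar model of Load8x8x2T (hypothetical helper, for clarity only).
    // out[k] corresponds to the k-th output register p3..q3.
    static void Load8x8x2T_Scalar(const uint8_t* const u,
                                  const uint8_t* const v,
                                  int stride, uint8_t out[8][16]) {
      for (int col = 0; col < 8; ++col) {
        for (int row = 0; row < 8; ++row) {
          out[col][row + 0] = u[-4 + col + row * stride];  // u col -> low half
          out[col][row + 8] = v[-4 + col + row * stride];  // v col -> high half
        }
      }
    }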
@@ -294,6 +375,87 @@ static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
   Store16x2(q0, q1, dst + stride, stride);
 }
 
+static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
+                                   uint8_t* const u, uint8_t* const v,
+                                   int stride) {
+  // p0 and q0 contain the u+v samples packed in low/high halves.
+  vst1_u8(u - stride, vget_low_u8(p0));
+  vst1_u8(u, vget_low_u8(q0));
+  vst1_u8(v - stride, vget_high_u8(p0));
+  vst1_u8(v, vget_high_u8(q0));
+}
+
+static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
+                                   const uint8x16_t q0, const uint8x16_t q1,
+                                   uint8_t* const u, uint8_t* const v,
+                                   int stride) {
+  // The p1...q1 registers contain the u+v samples packed in low/high halves.
+  Store8x2x2(p1, p0, u - stride, v - stride, stride);
+  Store8x2x2(q0, q1, u + stride, v + stride, stride);
+}
+
+#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {  \
+  vst3_lane_u8((DST) - 3, (VAL0), (LANE));       \
+  vst3_lane_u8((DST) + 0, (VAL1), (LANE));       \
+  (DST) += stride;                               \
+} while (0)
+
+static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
+                                   const uint8x16_t p0, const uint8x16_t q0,
+                                   const uint8x16_t q1, const uint8x16_t q2,
+                                   uint8_t* u, uint8_t* v,
+                                   int stride) {
+  const uint8x8x3_t u0 = {{vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0)}};
+  const uint8x8x3_t u1 = {{vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2)}};
+  const uint8x8x3_t v0 =
+      {{vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0)}};
+  const uint8x8x3_t v1 =
+      {{vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2)}};
+  STORE6_LANE(u, u0, u1, 0);
+  STORE6_LANE(u, u0, u1, 1);
+  STORE6_LANE(u, u0, u1, 2);
+  STORE6_LANE(u, u0, u1, 3);
+  STORE6_LANE(u, u0, u1, 4);
+  STORE6_LANE(u, u0, u1, 5);
+  STORE6_LANE(u, u0, u1, 6);
+  STORE6_LANE(u, u0, u1, 7);
+  STORE6_LANE(v, v0, v1, 0);
+  STORE6_LANE(v, v0, v1, 1);
+  STORE6_LANE(v, v0, v1, 2);
+  STORE6_LANE(v, v0, v1, 3);
+  STORE6_LANE(v, v0, v1, 4);
+  STORE6_LANE(v, v0, v1, 5);
+  STORE6_LANE(v, v0, v1, 6);
+  STORE6_LANE(v, v0, v1, 7);
+}
+#undef STORE6_LANE
+
+static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
+                                   const uint8x16_t q0, const uint8x16_t q1,
+                                   uint8_t* const u, uint8_t* const v,
+                                   int stride) {
+  const uint8x8x4_t u0 =
+      {{vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0), vget_low_u8(q1)}};
+  const uint8x8x4_t v0 =
+      {{vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0), vget_high_u8(q1)}};
+  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
+  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
+  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
+  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
+  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
+  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
+  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
+  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
+  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
+  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
+  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
+  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
+  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
+  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
+  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
+  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
+}
+
 //------------------------------------------------------------------------------
 
 static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
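
The horizontal-edge stores write back whole rows with vst1_u8, but the
vertical-edge paths must scatter a few bytes per row; vst3_lane_u8 and
vst4_lane_u8 each write one interleaved 3- or 4-byte group per call. A scalar
model of Store6x8x2's effect on one plane (hypothetical helper, not part of
the commit):

    // For each of the 8 rows, write the three filtered pixels left of the
    // vertical edge (p2, p1, p0) and the three right of it (q0, q1, q2).
    // Each STORE6_LANE(dst, u0, u1, LANE) pair of vst3_lane_u8 calls above
    // emits exactly one such row.
    static void Store6x8_Scalar(const uint8_t p2[8], const uint8_t p1[8],
                                const uint8_t p0[8], const uint8_t q0[8],
                                const uint8_t q1[8], const uint8_t q2[8],
                                uint8_t* dst, int stride) {
      for (int row = 0; row < 8; ++row) {
        dst[-3] = p2[row]; dst[-2] = p1[row]; dst[-1] = p0[row];
        dst[+0] = q0[row]; dst[+1] = q1[row]; dst[+2] = q2[row];
        dst += stride;
      }
    }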
@@ -749,6 +911,70 @@ static void HFilter16i(uint8_t* p, int stride,
 }
 #endif
 
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  {
+    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+                                         ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
+    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+              &op2, &op1, &op0, &oq0, &oq1, &oq2);
+    Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
+    Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
+    Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
+  }
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  u += 4 * stride;
+  v += 4 * stride;
+  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  {
+    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+                                         ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    uint8x16_t op1, op0, oq0, oq1;
+    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+    Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
+  }
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  {
+    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+                                         ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
+    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+              &op2, &op1, &op0, &oq0, &oq1, &oq2);
+    Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
+  }
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+  u += 4;
+  v += 4;
+  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  {
+    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+                                         ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    uint8x16_t op1, op0, oq0, oq1;
+    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+    Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
+  }
+}
+
 #endif  // USE_INTRINSICS
 
 //-----------------------------------------------------------------------------
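
The i-variants handle the inner edge of the 8x8 chroma block, hence the fixed
offset of 4 rows (VFilter8i) or 4 columns (HFilter8i) applied before loading.
A hypothetical call site, assuming the usual VP8 ordering of macroblock edge
first, then inner edge (not part of this commit):

    // Hypothetical driver, for illustration only: filter all four chroma
    // edges of one 8x8 macroblock with the functions added above.
    static void FilterChromaMB(uint8_t* const u, uint8_t* const v, int stride,
                               int thresh, int ithresh, int hev_thresh) {
      HFilter8(u, v, stride, thresh, ithresh, hev_thresh);   // left MB edge
      HFilter8i(u, v, stride, thresh, ithresh, hev_thresh);  // inner vertical
      VFilter8(u, v, stride, thresh, ithresh, hev_thresh);   // top MB edge
      VFilter8i(u, v, stride, thresh, ithresh, hev_thresh);  // inner horizontal
    }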
@@ -1062,6 +1288,10 @@ void VP8DspInitNEON(void) {
 #if !defined(WORK_AROUND_GCC)
   VP8HFilter16i = HFilter16i;
 #endif
+  VP8VFilter8 = VFilter8;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8 = HFilter8;
+  VP8HFilter8i = HFilter8i;
 #endif
   VP8SimpleVFilter16 = SimpleVFilter16;
   VP8SimpleHFilter16 = SimpleHFilter16;