add intrinsics NEON code for chroma strong-filtering

The nice trick is to pack 8 u + 8 v samples into a single uint8x16_t
register, and re-use the previous (luma) functions.

Change-Id: Idf50ed2d6b7137ea080d603062bc9e0c66d79f38
commit e351ec0759
parent 2132992d47
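
The packing trick described in the commit message can be illustrated with a small stand-alone sketch (not part of the patch; the function name and buffers below are made up for illustration): 8 u-samples are loaded into the low half of a uint8x16_t and 8 v-samples into the high half, a single 16-lane operation then processes both planes at once, and vget_low_u8()/vget_high_u8() split the result back out. The diff below applies this idea to the actual chroma loop-filter routines.

// Illustrative sketch only, not from the patch: one uint8x16_t carries
// 8 u-samples (low half) and 8 v-samples (high half) so a single 16-lane
// operation works on both chroma planes at the same time.
#include <arm_neon.h>

static void chroma_pack_demo(const uint8_t* u_row, const uint8_t* v_row,
                             uint8_t* u_out, uint8_t* v_out) {
  // Pack: u-samples in the low 64 bits, v-samples in the high 64 bits.
  const uint8x16_t uv = vcombine_u8(vld1_u8(u_row), vld1_u8(v_row));
  // Any 16-lane operation now touches u and v together; halving each
  // sample stands in here for a real filtering step.
  const uint8x16_t filtered = vshrq_n_u8(uv, 1);
  // Unpack: low half back to the u plane, high half back to the v plane.
  vst1_u8(u_out, vget_low_u8(filtered));
  vst1_u8(v_out, vget_high_u8(filtered));
}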
@@ -235,6 +235,87 @@ static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
  Load16x4(src + 2 * stride, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
                                  const uint8_t* const v,
                                  int stride,
                                  uint8x16_t* const p3, uint8x16_t* const p2,
                                  uint8x16_t* const p1, uint8x16_t* const p0,
                                  uint8x16_t* const q0, uint8x16_t* const q1,
                                  uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
}

#define LOAD_UV_8(ROW) \
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))

static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
                                   const uint8_t* const v,
                                   int stride,
                                   uint8x16_t* const p3, uint8x16_t* const p2,
                                   uint8x16_t* const p1, uint8x16_t* const p0,
                                   uint8x16_t* const q0, uint8x16_t* const q1,
                                   uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  const uint8x16_t row0 = LOAD_UV_8(0);
  const uint8x16_t row1 = LOAD_UV_8(1);
  const uint8x16_t row2 = LOAD_UV_8(2);
  const uint8x16_t row3 = LOAD_UV_8(3);
  const uint8x16_t row4 = LOAD_UV_8(4);
  const uint8x16_t row5 = LOAD_UV_8(5);
  const uint8x16_t row6 = LOAD_UV_8(6);
  const uint8x16_t row7 = LOAD_UV_8(7);
  // Perform two side-by-side 8x8 transposes.
  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
                                                    // u01 u11 u03 u13 ...
  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
                                                    // u21 u31 u23 u33 ...
  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                       vreinterpretq_u16_u8(row23.val[0]));
  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                       vreinterpretq_u16_u8(row23.val[1]));
  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
                                       vreinterpretq_u16_u8(row67.val[0]));
  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
                                       vreinterpretq_u16_u8(row67.val[1]));
  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
                                       vreinterpretq_u32_u16(row46.val[0]));
  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
                                       vreinterpretq_u32_u16(row46.val[1]));
  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
                                       vreinterpretq_u32_u16(row57.val[0]));
  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
                                       vreinterpretq_u32_u16(row57.val[1]));
  *p3 = vreinterpretq_u8_u32(row04.val[0]);
  *p2 = vreinterpretq_u8_u32(row15.val[0]);
  *p1 = vreinterpretq_u8_u32(row26.val[0]);
  *p0 = vreinterpretq_u8_u32(row37.val[0]);
  *q0 = vreinterpretq_u8_u32(row04.val[1]);
  *q1 = vreinterpretq_u8_u32(row15.val[1]);
  *q2 = vreinterpretq_u8_u32(row26.val[1]);
  *q3 = vreinterpretq_u8_u32(row37.val[1]);
}
#undef LOAD_UV_8
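// Note added for clarity (not part of the original change): the three vtrnq
// stages above work at byte, half-word and word granularity; none of them
// moves data across the 64-bit halves of the 128-bit registers, so the
// u-block (low half) and the v-block (high half) are transposed
// independently, side by side.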

static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
                                 uint8_t* const dst, int stride) {
  vst2_lane_u8(dst + 0 * stride, v, 0);
@@ -294,6 +375,87 @@ static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
  Store16x2(q0, q1, dst + stride, stride);
}

static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  // p0 and q0 contain the u+v samples packed in low/high halves.
  vst1_u8(u - stride, vget_low_u8(p0));
  vst1_u8(u, vget_low_u8(q0));
  vst1_u8(v - stride, vget_high_u8(p0));
  vst1_u8(v, vget_high_u8(q0));
}

static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  // The p1...q1 registers contain the u+v samples packed in low/high halves.
  Store8x2x2(p1, p0, u - stride, v - stride, stride);
  Store8x2x2(q0, q1, u + stride, v + stride, stride);
}

#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
  vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
  vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
  (DST) += stride;                                \
} while (0)

static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
                                   const uint8x16_t p0, const uint8x16_t q0,
                                   const uint8x16_t q1, const uint8x16_t q2,
                                   uint8_t* u, uint8_t* v,
                                   int stride) {
  const uint8x8x3_t u0 = {{vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0)}};
  const uint8x8x3_t u1 = {{vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2)}};
  const uint8x8x3_t v0 =
      {{vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0)}};
  const uint8x8x3_t v1 =
      {{vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2)}};
  STORE6_LANE(u, u0, u1, 0);
  STORE6_LANE(u, u0, u1, 1);
  STORE6_LANE(u, u0, u1, 2);
  STORE6_LANE(u, u0, u1, 3);
  STORE6_LANE(u, u0, u1, 4);
  STORE6_LANE(u, u0, u1, 5);
  STORE6_LANE(u, u0, u1, 6);
  STORE6_LANE(u, u0, u1, 7);
  STORE6_LANE(v, v0, v1, 0);
  STORE6_LANE(v, v0, v1, 1);
  STORE6_LANE(v, v0, v1, 2);
  STORE6_LANE(v, v0, v1, 3);
  STORE6_LANE(v, v0, v1, 4);
  STORE6_LANE(v, v0, v1, 5);
  STORE6_LANE(v, v0, v1, 6);
  STORE6_LANE(v, v0, v1, 7);
}
#undef STORE6_LANE
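// Note added for clarity (not part of the original change): vst3_lane_u8()
// stores lane LANE of each of its three vectors as three consecutive bytes,
// so each STORE6_LANE invocation writes one row of six pixels around the
// vertical edge: p2 p1 p0 to DST-3..DST-1 and q0 q1 q2 to DST..DST+2, then
// advances DST by one row.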

static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  const uint8x8x4_t u0 =
      {{vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0), vget_low_u8(q1)}};
  const uint8x8x4_t v0 =
      {{vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0), vget_high_u8(q1)}};
  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
}

//------------------------------------------------------------------------------

static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
@@ -749,6 +911,70 @@ static void HFilter16i(uint8_t* p, int stride,
}
#endif

// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
  }
}

static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
  }
}

static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
  v += 4;
  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
  }
}

#endif  // USE_INTRINSICS

//-----------------------------------------------------------------------------
@@ -1062,6 +1288,10 @@ void VP8DspInitNEON(void) {
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i;
#endif
  VP8VFilter8 = VFilter8;
  VP8VFilter8i = VFilter8i;
  VP8HFilter8 = HFilter8;
  VP8HFilter8i = HFilter8i;
#endif
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;