mirror of
https://github.com/webmproject/libwebp.git
synced 2025-01-26 22:52:55 +01:00
Add AArch64 Neon implementation of Intra4Preds
Add Neon implementation of Intra4Preds for use on 64-bit Arm platforms. (The same implementation cannot be used for 32-bit Arm platforms as it uses a number of AArch64-only Neon instructions.) Change-Id: Id781e7614f4e8e876dfeecd95cfc85e04611d8c6
This commit is contained in:
parent
ff2b5b15ae
commit
baa93808d9
@ -343,6 +343,8 @@ static void Intra16Preds_C(uint8_t* dst,
|
||||
//------------------------------------------------------------------------------
|
||||
// luma 4x4 prediction
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
|
||||
|
||||
#define DST(x, y) dst[(x) + (y) * BPS]
|
||||
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
|
||||
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
|
||||
@ -529,6 +531,8 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
|
||||
HU4(I4HU4 + dst, top);
|
||||
}
|
||||
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Metric
|
||||
|
||||
@ -762,8 +766,11 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks_C;
|
||||
#endif
|
||||
|
||||
VP8FTransform2 = FTransform2_C;
|
||||
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
|
||||
VP8EncPredLuma4 = Intra4Preds_C;
|
||||
#endif
|
||||
|
||||
VP8FTransform2 = FTransform2_C;
|
||||
VP8EncPredLuma16 = Intra16Preds_C;
|
||||
VP8EncPredChroma8 = IntraChromaPreds_C;
|
||||
VP8Mean16x4 = Mean16x4_C;
|
||||
|
@ -911,6 +911,121 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
|
||||
|
||||
#endif // !WORK_AROUND_GCC
|
||||
|
||||
#if WEBP_AARCH64
|
||||
|
||||
#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane) \
|
||||
do { \
|
||||
uint8x16_t r; \
|
||||
r = vqtbl2q_u8(qcombined, tbl); \
|
||||
r = vreinterpretq_u8_u32( \
|
||||
vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane), \
|
||||
vreinterpretq_u32_u8(r), 1)); \
|
||||
vst1q_u8(dst, r); \
|
||||
} while (0)
|
||||
|
||||
#define RD4_VR4_LD4_VL4_NEON(dst, tbl) \
|
||||
do { \
|
||||
uint8x16_t r; \
|
||||
r = vqtbl2q_u8(qcombined, tbl); \
|
||||
vst1q_u8(dst, r); \
|
||||
} while (0)
|
||||
|
||||
static void Intra4Preds_NEON(uint8_t* dst, const uint8_t* top) {
|
||||
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13
|
||||
// L K J I X A B C D E F G H
|
||||
// -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7
|
||||
static const uint8_t kLookupTbl1[64] = {
|
||||
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12,
|
||||
3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0,
|
||||
4, 20, 21, 22, 3, 18, 2, 17, 3, 19, 4, 20, 2, 17, 1, 16,
|
||||
2, 18, 3, 19, 1, 16, 31, 31, 1, 17, 2, 18, 31, 31, 31, 31
|
||||
};
|
||||
|
||||
static const uint8_t kLookupTbl2[64] = {
|
||||
20, 21, 22, 23, 5, 6, 7, 8, 22, 23, 24, 25, 6, 7, 8, 9,
|
||||
19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
|
||||
18, 19, 20, 21, 19, 5, 6, 7, 24, 25, 26, 27, 7, 8, 9, 26,
|
||||
17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
|
||||
};
|
||||
|
||||
static const uint8_t kLookupTbl3[64] = {
|
||||
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 19, 19, 19, 19,
|
||||
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 18, 18, 18, 18,
|
||||
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 17, 17, 17, 17,
|
||||
30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 16, 16, 16, 16
|
||||
};
|
||||
|
||||
const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1);
|
||||
const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2);
|
||||
const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3);
|
||||
|
||||
const uint8x16_t preload = vld1q_u8(top - 5);
|
||||
uint8x16x2_t qcombined;
|
||||
uint8x16_t result0, result1;
|
||||
|
||||
uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
|
||||
uint8x16_t b = preload;
|
||||
uint8x16_t c = vextq_u8(a, a, 2);
|
||||
|
||||
uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
|
||||
uint8x16_t avg2_all = vrhaddq_u8(a, b);
|
||||
|
||||
uint8x8_t preload_x8, sub_a, sub_c;
|
||||
uint8_t result_u8;
|
||||
uint8x8_t res_lo, res_hi;
|
||||
uint8x16_t full_b;
|
||||
uint16x8_t sub, sum_lo, sum_hi;
|
||||
|
||||
preload_x8 = vget_low_u8(c);
|
||||
preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);
|
||||
|
||||
result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;
|
||||
|
||||
avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
|
||||
avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);
|
||||
|
||||
qcombined.val[0] = avg2_all;
|
||||
qcombined.val[1] = avg3_all;
|
||||
|
||||
sub_a = vdup_laneq_u8(preload, 4);
|
||||
|
||||
// preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
|
||||
full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
|
||||
// preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
|
||||
sub_c = vreinterpret_u8_u32(vdup_n_u32(
|
||||
vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));
|
||||
|
||||
sub = vsubl_u8(sub_c, sub_a);
|
||||
sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
|
||||
res_lo = vqmovn_u16(sum_lo);
|
||||
|
||||
sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
|
||||
res_hi = vqmovn_u16(sum_hi);
|
||||
|
||||
// DC4, VE4, HE4, TM4
|
||||
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
|
||||
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
|
||||
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
|
||||
DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);
|
||||
|
||||
// RD4, VR4, LD4, VL4
|
||||
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
|
||||
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
|
||||
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
|
||||
RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);
|
||||
|
||||
// HD4, HU4
|
||||
result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
|
||||
result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);
|
||||
|
||||
vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
|
||||
vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
|
||||
vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
|
||||
vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
|
||||
}
|
||||
|
||||
#endif // WEBP_AARCH64
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
||||
@ -931,6 +1046,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
|
||||
VP8SSE8x8 = SSE8x8_NEON;
|
||||
VP8SSE4x4 = SSE4x4_NEON;
|
||||
|
||||
#if WEBP_AARCH64 && (BPS == 32)
|
||||
VP8EncPredLuma4 = Intra4Preds_NEON;
|
||||
#endif
|
||||
|
||||
#if !defined(WORK_AROUND_GCC)
|
||||
VP8EncQuantizeBlock = QuantizeBlock_NEON;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
|
||||
|
Loading…
x
Reference in New Issue
Block a user