From baa93808d9033a15179cab1fa3f3afeea8a6b0f9 Mon Sep 17 00:00:00 2001 From: Istvan Stefan Date: Mon, 27 Mar 2023 16:21:40 +0200 Subject: [PATCH] Add AArch64 Neon implementation of Intra4Preds Add Neon implementation of Intra4Preds for use on 64-bit Arm platforms. (The same implementation cannot be used for 32-bit Arm platforms as it uses a number of AArch64-only Neon instructions.) Change-Id: Id781e7614f4e8e876dfeecd95cfc85e04611d8c6 --- src/dsp/enc.c | 9 +++- src/dsp/enc_neon.c | 119 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 1 deletion(-) diff --git a/src/dsp/enc.c b/src/dsp/enc.c index 395ad05b..4ec27ffd 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -343,6 +343,8 @@ static void Intra16Preds_C(uint8_t* dst, //------------------------------------------------------------------------------ // luma 4x4 prediction +#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 + #define DST(x, y) dst[(x) + (y) * BPS] #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2)) #define AVG2(a, b) (((a) + (b) + 1) >> 1) @@ -529,6 +531,8 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) { HU4(I4HU4 + dst, top); } +#endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 + //------------------------------------------------------------------------------ // Metric @@ -762,8 +766,11 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { VP8EncQuantize2Blocks = Quantize2Blocks_C; #endif - VP8FTransform2 = FTransform2_C; +#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 VP8EncPredLuma4 = Intra4Preds_C; +#endif + + VP8FTransform2 = FTransform2_C; VP8EncPredLuma16 = Intra16Preds_C; VP8EncPredChroma8 = IntraChromaPreds_C; VP8Mean16x4 = Mean16x4_C; diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 6f641c9a..d8165767 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -911,6 +911,121 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32], #endif // !WORK_AROUND_GCC +#if WEBP_AARCH64 + +#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane) \ + do { \ + uint8x16_t r; \ + r = vqtbl2q_u8(qcombined, tbl); \ + r = vreinterpretq_u8_u32( \ + vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane), \ + vreinterpretq_u32_u8(r), 1)); \ + vst1q_u8(dst, r); \ + } while (0) + +#define RD4_VR4_LD4_VL4_NEON(dst, tbl) \ + do { \ + uint8x16_t r; \ + r = vqtbl2q_u8(qcombined, tbl); \ + vst1q_u8(dst, r); \ + } while (0) + +static void Intra4Preds_NEON(uint8_t* dst, const uint8_t* top) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + // L K J I X A B C D E F G H + // -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 + static const uint8_t kLookupTbl1[64] = { + 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, + 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, + 4, 20, 21, 22, 3, 18, 2, 17, 3, 19, 4, 20, 2, 17, 1, 16, + 2, 18, 3, 19, 1, 16, 31, 31, 1, 17, 2, 18, 31, 31, 31, 31 + }; + + static const uint8_t kLookupTbl2[64] = { + 20, 21, 22, 23, 5, 6, 7, 8, 22, 23, 24, 25, 6, 7, 8, 9, + 19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25, + 18, 19, 20, 21, 19, 5, 6, 7, 24, 25, 26, 27, 7, 8, 9, 26, + 17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27 + }; + + static const uint8_t kLookupTbl3[64] = { + 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 19, 19, 19, 19, + 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 18, 18, 18, 18, + 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 17, 17, 17, 17, + 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 16, 16, 16, 16 + }; + + const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1); + const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2); + const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3); + + const uint8x16_t preload = vld1q_u8(top - 5); + uint8x16x2_t qcombined; + uint8x16_t result0, result1; + + uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]); + uint8x16_t b = preload; + uint8x16_t c = vextq_u8(a, a, 2); + + uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b); + uint8x16_t avg2_all = vrhaddq_u8(a, b); + + uint8x8_t preload_x8, sub_a, sub_c; + uint8_t result_u8; + uint8x8_t res_lo, res_hi; + uint8x16_t full_b; + uint16x8_t sub, sum_lo, sum_hi; + + preload_x8 = vget_low_u8(c); + preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3); + + result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3; + + avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15); + avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14); + + qcombined.val[0] = avg2_all; + qcombined.val[1] = avg3_all; + + sub_a = vdup_laneq_u8(preload, 4); + + // preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a} + full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]); + // preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d} + sub_c = vreinterpret_u8_u32(vdup_n_u32( + vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0))); + + sub = vsubl_u8(sub_c, sub_a); + sum_lo = vaddw_u8(sub, vget_low_u8(full_b)); + res_lo = vqmovn_u16(sum_lo); + + sum_hi = vaddw_u8(sub, vget_high_u8(full_b)); + res_hi = vqmovn_u16(sum_hi); + + // DC4, VE4, HE4, TM4 + DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0); + DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1); + DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0); + DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1); + + // RD4, VR4, LD4, VL4 + RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]); + RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]); + RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]); + RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]); + + // HD4, HU4 + result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]); + result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]); + + vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0)); + vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0)); + vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1)); + vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1)); +} + +#endif // WEBP_AARCH64 + //------------------------------------------------------------------------------ // Entry point @@ -931,6 +1046,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { VP8SSE8x8 = SSE8x8_NEON; VP8SSE4x4 = SSE4x4_NEON; +#if WEBP_AARCH64 && (BPS == 32) + VP8EncPredLuma4 = Intra4Preds_NEON; +#endif + #if !defined(WORK_AROUND_GCC) VP8EncQuantizeBlock = QuantizeBlock_NEON; VP8EncQuantize2Blocks = Quantize2Blocks_NEON;