From baa93808d9033a15179cab1fa3f3afeea8a6b0f9 Mon Sep 17 00:00:00 2001 From: Istvan Stefan Date: Mon, 27 Mar 2023 16:21:40 +0200 Subject: [PATCH 1/3] Add AArch64 Neon implementation of Intra4Preds Add Neon implementation of Intra4Preds for use on 64-bit Arm platforms. (The same implementation cannot be used for 32-bit Arm platforms as it uses a number of AArch64-only Neon instructions.) Change-Id: Id781e7614f4e8e876dfeecd95cfc85e04611d8c6 --- src/dsp/enc.c | 9 +++- src/dsp/enc_neon.c | 119 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 1 deletion(-) diff --git a/src/dsp/enc.c b/src/dsp/enc.c index 395ad05b..4ec27ffd 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -343,6 +343,8 @@ static void Intra16Preds_C(uint8_t* dst, //------------------------------------------------------------------------------ // luma 4x4 prediction +#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 + #define DST(x, y) dst[(x) + (y) * BPS] #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2)) #define AVG2(a, b) (((a) + (b) + 1) >> 1) @@ -529,6 +531,8 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) { HU4(I4HU4 + dst, top); } +#endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 + //------------------------------------------------------------------------------ // Metric @@ -762,8 +766,11 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { VP8EncQuantize2Blocks = Quantize2Blocks_C; #endif - VP8FTransform2 = FTransform2_C; +#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 VP8EncPredLuma4 = Intra4Preds_C; +#endif + + VP8FTransform2 = FTransform2_C; VP8EncPredLuma16 = Intra16Preds_C; VP8EncPredChroma8 = IntraChromaPreds_C; VP8Mean16x4 = Mean16x4_C; diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 6f641c9a..d8165767 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -911,6 +911,121 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32], #endif // !WORK_AROUND_GCC +#if WEBP_AARCH64 + +#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane) \ + do { \ + uint8x16_t r; \ + r = vqtbl2q_u8(qcombined, tbl); \ + r = vreinterpretq_u8_u32( \ + vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane), \ + vreinterpretq_u32_u8(r), 1)); \ + vst1q_u8(dst, r); \ + } while (0) + +#define RD4_VR4_LD4_VL4_NEON(dst, tbl) \ + do { \ + uint8x16_t r; \ + r = vqtbl2q_u8(qcombined, tbl); \ + vst1q_u8(dst, r); \ + } while (0) + +static void Intra4Preds_NEON(uint8_t* dst, const uint8_t* top) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + // L K J I X A B C D E F G H + // -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 + static const uint8_t kLookupTbl1[64] = { + 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, + 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, + 4, 20, 21, 22, 3, 18, 2, 17, 3, 19, 4, 20, 2, 17, 1, 16, + 2, 18, 3, 19, 1, 16, 31, 31, 1, 17, 2, 18, 31, 31, 31, 31 + }; + + static const uint8_t kLookupTbl2[64] = { + 20, 21, 22, 23, 5, 6, 7, 8, 22, 23, 24, 25, 6, 7, 8, 9, + 19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25, + 18, 19, 20, 21, 19, 5, 6, 7, 24, 25, 26, 27, 7, 8, 9, 26, + 17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27 + }; + + static const uint8_t kLookupTbl3[64] = { + 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 19, 19, 19, 19, + 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 18, 18, 18, 18, + 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 17, 17, 17, 17, + 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 16, 16, 16, 16 + }; + + const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1); + const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2); + const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3); + + const uint8x16_t preload = vld1q_u8(top - 5); + uint8x16x2_t qcombined; + uint8x16_t result0, result1; + + uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]); + uint8x16_t b = preload; + uint8x16_t c = vextq_u8(a, a, 2); + + uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b); + uint8x16_t avg2_all = vrhaddq_u8(a, b); + + uint8x8_t preload_x8, sub_a, sub_c; + uint8_t result_u8; + uint8x8_t res_lo, res_hi; + uint8x16_t full_b; + uint16x8_t sub, sum_lo, sum_hi; + + preload_x8 = vget_low_u8(c); + preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3); + + result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3; + + avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15); + avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14); + + qcombined.val[0] = avg2_all; + qcombined.val[1] = avg3_all; + + sub_a = vdup_laneq_u8(preload, 4); + + // preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a} + full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]); + // preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d} + sub_c = vreinterpret_u8_u32(vdup_n_u32( + vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0))); + + sub = vsubl_u8(sub_c, sub_a); + sum_lo = vaddw_u8(sub, vget_low_u8(full_b)); + res_lo = vqmovn_u16(sum_lo); + + sum_hi = vaddw_u8(sub, vget_high_u8(full_b)); + res_hi = vqmovn_u16(sum_hi); + + // DC4, VE4, HE4, TM4 + DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0); + DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1); + DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0); + DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1); + + // RD4, VR4, LD4, VL4 + RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]); + RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]); + RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]); + RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]); + + // HD4, HU4 + result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]); + result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]); + + vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0)); + vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0)); + vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1)); + vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1)); +} + +#endif // WEBP_AARCH64 + //------------------------------------------------------------------------------ // Entry point @@ -931,6 +1046,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { VP8SSE8x8 = SSE8x8_NEON; VP8SSE4x4 = SSE4x4_NEON; +#if WEBP_AARCH64 && (BPS == 32) + VP8EncPredLuma4 = Intra4Preds_NEON; +#endif + #if !defined(WORK_AROUND_GCC) VP8EncQuantizeBlock = QuantizeBlock_NEON; VP8EncQuantize2Blocks = Quantize2Blocks_NEON; From 3bfb05e38c3762b119473ac8cc76084b2030d09d Mon Sep 17 00:00:00 2001 From: Istvan Stefan Date: Fri, 5 Jul 2024 09:46:30 +0100 Subject: [PATCH 2/3] Add AArch64 Neon implementation of Intra16Preds Add a Neon implementation of Intra16Preds for use on 64-bit Arm platforms. (This implementation cannot be used on 32-bit Arm platforms as it makes use of a number of AArch64-only Neon instructions.) Change-Id: I24c67cd54b66307e3924fd332c2795fd7422f082 --- src/dsp/enc.c | 4 +- src/dsp/enc_neon.c | 151 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 153 insertions(+), 2 deletions(-) diff --git a/src/dsp/enc.c b/src/dsp/enc.c index 4ec27ffd..fba68f14 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -332,6 +332,7 @@ static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) +#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 static void Intra16Preds_C(uint8_t* dst, const uint8_t* left, const uint8_t* top) { DCMode(I16DC16 + dst, left, top, 16, 16, 5); @@ -339,6 +340,7 @@ static void Intra16Preds_C(uint8_t* dst, HorizontalPred(I16HE16 + dst, left, 16); TrueMotion(I16TM16 + dst, left, top, 16); } +#endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 //------------------------------------------------------------------------------ // luma 4x4 prediction @@ -768,10 +770,10 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 VP8EncPredLuma4 = Intra4Preds_C; + VP8EncPredLuma16 = Intra16Preds_C; #endif VP8FTransform2 = FTransform2_C; - VP8EncPredLuma16 = Intra16Preds_C; VP8EncPredChroma8 = IntraChromaPreds_C; VP8Mean16x4 = Mean16x4_C; VP8EncQuantizeBlockWHT = QuantizeBlock_C; diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index d8165767..57b55df2 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -1024,6 +1024,152 @@ static void Intra4Preds_NEON(uint8_t* dst, const uint8_t* top) { vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1)); } +static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) { + uint8x16_t a = vdupq_n_u8(value); + int i; + for (i = 0; i < 16; i++) { + vst1q_u8(dst + BPS * i, a); + } +} + +static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) { + uint8x16_t a = vld1q_u8(src); + int i; + for (i = 0; i < 16; i++) { + vst1q_u8(dst + BPS * i, a); + } +} + +static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst, + const uint8_t* left) { + uint8x16_t a; + + if (left == NULL) { + Fill_NEON(dst, 129); + return; + } + + a = vld1q_u8(left + 0); + vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0)); + vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1)); + vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2)); + vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3)); + vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4)); + vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5)); + vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6)); + vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7)); + vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8)); + vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9)); + vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10)); + vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11)); + vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12)); + vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13)); + vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14)); + vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15)); +} + +static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) { + if (top != NULL) { + Fill16_NEON(dst, top); + } else { + Fill_NEON(dst, 127); + } +} + +static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { + uint8_t s; + + if (top != NULL) { + uint16_t dc; + dc = vaddlvq_u8(vld1q_u8(top)); + if (left != NULL) { + // top and left present. + dc += vaddlvq_u8(vld1q_u8(left)); + s = vqrshrnh_n_u16(dc, 5); + } else { + // top but no left. + s = vqrshrnh_n_u16(dc, 4); + } + } else { + if (left != NULL) { + uint16_t dc; + // left but no top. + dc = vaddlvq_u8(vld1q_u8(left)); + s = vqrshrnh_n_u16(dc, 4); + } else { + // No top, no left, nothing. + s = 0x80; + } + } + Fill_NEON(dst, s); +} + +static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst, + const uint8x8_t outer, + const uint8x8x2_t inner, + const uint16x8_t a, int i, + const int n) { + uint8x8_t d1, d2; + uint16x8_t r1, r2; + + r1 = vaddl_u8(outer, inner.val[0]); + r1 = vqsubq_u16(r1, a); + d1 = vqmovn_u16(r1); + r2 = vaddl_u8(outer, inner.val[1]); + r2 = vqsubq_u16(r2, a); + d2 = vqmovn_u16(r2); + vst1_u8(dst + BPS * (i * 4 + n), d1); + vst1_u8(dst + BPS * (i * 4 + n) + 8, d2); +} + +static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { + int i; + uint16x8_t a; + uint8x8x2_t inner; + + if (left == NULL) { + // True motion without left samples (hence: with default 129 value) is + // equivalent to VE prediction where you just copy the top samples. + // Note that if top samples are not available, the default value is then + // 129, and not 127 as in the VerticalPred case. + if (top != NULL) { + VerticalPred16_NEON(dst, top); + } else { + Fill_NEON(dst, 129); + } + return; + } + + // left is not NULL. + if (top == NULL) { + HorizontalPred16_NEON(dst, left); + return; + } + + // Neither left nor top are NULL. + a = vdupq_n_u16(left[-1]); + inner = vld1_u8_x2(top); + + for (i = 0; i < 4; i++) { + const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]); + + TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0); + TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1); + TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2); + TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3); + } +} + +static void Intra16Preds_NEON(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { + DCMode_NEON(I16DC16 + dst, left, top); + VerticalPred16_NEON(I16VE16 + dst, top); + HorizontalPred16_NEON(I16HE16 + dst, left); + TrueMotion_NEON(I16TM16 + dst, left, top); +} + #endif // WEBP_AARCH64 //------------------------------------------------------------------------------ @@ -1046,9 +1192,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { VP8SSE8x8 = SSE8x8_NEON; VP8SSE4x4 = SSE4x4_NEON; -#if WEBP_AARCH64 && (BPS == 32) +#if WEBP_AARCH64 +#if BPS == 32 VP8EncPredLuma4 = Intra4Preds_NEON; #endif + VP8EncPredLuma16 = Intra16Preds_NEON; +#endif #if !defined(WORK_AROUND_GCC) VP8EncQuantizeBlock = QuantizeBlock_NEON; From 314a142a34fae6ef866f2be34d440642cb44e471 Mon Sep 17 00:00:00 2001 From: Istvan Stefan Date: Thu, 13 Apr 2023 22:51:32 +0200 Subject: [PATCH 3/3] Use QuantizeBlock_NEON for VP8EncQuantizeBlockWHT on Arm Use the Neon implementation instead of falling back to QuantizeBlock_C. Change-Id: Iff6e47eda353cbaa9766f75040fa63aa34607816 --- src/dsp/enc.c | 4 ++-- src/dsp/enc_neon.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/dsp/enc.c b/src/dsp/enc.c index fba68f14..cb44ba6f 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -650,6 +650,7 @@ static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b, // Quantization // +#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC static const uint8_t kZigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; @@ -681,7 +682,6 @@ static int QuantizeBlock_C(int16_t in[16], int16_t out[16], return (last >= 0); } -#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC static int Quantize2Blocks_C(int16_t in[32], int16_t out[32], const VP8Matrix* const mtx) { int nz; @@ -766,6 +766,7 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC VP8EncQuantizeBlock = QuantizeBlock_C; VP8EncQuantize2Blocks = Quantize2Blocks_C; + VP8EncQuantizeBlockWHT = QuantizeBlock_C; #endif #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 @@ -776,7 +777,6 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { VP8FTransform2 = FTransform2_C; VP8EncPredChroma8 = IntraChromaPreds_C; VP8Mean16x4 = Mean16x4_C; - VP8EncQuantizeBlockWHT = QuantizeBlock_C; VP8Copy4x4 = Copy4x4_C; VP8Copy16x8 = Copy16x8_C; diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 57b55df2..3d4090a8 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -1202,6 +1202,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { #if !defined(WORK_AROUND_GCC) VP8EncQuantizeBlock = QuantizeBlock_NEON; VP8EncQuantize2Blocks = Quantize2Blocks_NEON; + VP8EncQuantizeBlockWHT = QuantizeBlock_NEON; #endif }