mirror of
				https://github.com/webmproject/libwebp.git
				synced 2025-10-31 02:15:42 +01:00 
			
		
		
		
	Add AArch64 Neon implementation of Intra4Preds
Add Neon implementation of Intra4Preds for use on 64-bit Arm platforms. (The same implementation cannot be used for 32-bit Arm platforms as it uses a number of AArch64-only Neon instructions.) Change-Id: Id781e7614f4e8e876dfeecd95cfc85e04611d8c6
This commit is contained in:
		
				
					committed by
					
						 Jonathan Wright
						Jonathan Wright
					
				
			
			
				
	
			
			
			
						parent
						
							ff2b5b15ae
						
					
				
				
					commit
					baa93808d9
				
			| @@ -343,6 +343,8 @@ static void Intra16Preds_C(uint8_t* dst, | ||||
| //------------------------------------------------------------------------------ | ||||
| // luma 4x4 prediction | ||||
|  | ||||
| #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 | ||||
|  | ||||
| #define DST(x, y) dst[(x) + (y) * BPS] | ||||
| #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2)) | ||||
| #define AVG2(a, b) (((a) + (b) + 1) >> 1) | ||||
| @@ -529,6 +531,8 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) { | ||||
|   HU4(I4HU4 + dst, top); | ||||
| } | ||||
|  | ||||
| #endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
| // Metric | ||||
|  | ||||
| @@ -762,8 +766,11 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { | ||||
|   VP8EncQuantize2Blocks = Quantize2Blocks_C; | ||||
| #endif | ||||
|  | ||||
|   VP8FTransform2 = FTransform2_C; | ||||
| #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 | ||||
|   VP8EncPredLuma4 = Intra4Preds_C; | ||||
| #endif | ||||
|  | ||||
|   VP8FTransform2 = FTransform2_C; | ||||
|   VP8EncPredLuma16 = Intra16Preds_C; | ||||
|   VP8EncPredChroma8 = IntraChromaPreds_C; | ||||
|   VP8Mean16x4 = Mean16x4_C; | ||||
|   | ||||
| @@ -911,6 +911,121 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32], | ||||
|  | ||||
| #endif   // !WORK_AROUND_GCC | ||||
|  | ||||
| #if WEBP_AARCH64 | ||||
|  | ||||
| #define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane)                              \ | ||||
|   do {                                                                         \ | ||||
|     uint8x16_t r;                                                              \ | ||||
|     r = vqtbl2q_u8(qcombined, tbl);                                            \ | ||||
|     r = vreinterpretq_u8_u32(                                                  \ | ||||
|         vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane),          \ | ||||
|                        vreinterpretq_u32_u8(r), 1));                           \ | ||||
|     vst1q_u8(dst, r);                                                          \ | ||||
|   } while (0) | ||||
|  | ||||
| #define RD4_VR4_LD4_VL4_NEON(dst, tbl)                                         \ | ||||
|   do {                                                                         \ | ||||
|     uint8x16_t r;                                                              \ | ||||
|     r = vqtbl2q_u8(qcombined, tbl);                                            \ | ||||
|     vst1q_u8(dst, r);                                                          \ | ||||
|   } while (0) | ||||
|  | ||||
| static void Intra4Preds_NEON(uint8_t* dst, const uint8_t* top) { | ||||
|   // 0   1   2   3   4   5   6   7   8   9  10  11  12  13 | ||||
|   //     L   K   J   I   X   A   B   C   D   E   F   G   H | ||||
|   //    -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7 | ||||
|   static const uint8_t kLookupTbl1[64] = { | ||||
|     0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 12, 12, | ||||
|     3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0, | ||||
|     4, 20, 21, 22,  3, 18,  2, 17,  3, 19,  4, 20,  2, 17,  1, 16, | ||||
|     2, 18,  3, 19,  1, 16, 31, 31,  1, 17,  2, 18, 31, 31, 31, 31 | ||||
|   }; | ||||
|  | ||||
|   static const uint8_t kLookupTbl2[64] = { | ||||
|     20, 21, 22, 23,  5,  6,  7,  8, 22, 23, 24, 25,  6,  7,  8,  9, | ||||
|     19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25, | ||||
|     18, 19, 20, 21, 19,  5,  6,  7, 24, 25, 26, 27,  7,  8,  9, 26, | ||||
|     17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27 | ||||
|   }; | ||||
|  | ||||
|   static const uint8_t kLookupTbl3[64] = { | ||||
|     30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 19, 19, 19, 19, | ||||
|     30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 18, 18, 18, 18, | ||||
|     30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 17, 17, 17, 17, | ||||
|     30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 16, 16, 16, 16 | ||||
|   }; | ||||
|  | ||||
|   const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1); | ||||
|   const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2); | ||||
|   const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3); | ||||
|  | ||||
|   const uint8x16_t preload = vld1q_u8(top - 5); | ||||
|   uint8x16x2_t qcombined; | ||||
|   uint8x16_t result0, result1; | ||||
|  | ||||
|   uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]); | ||||
|   uint8x16_t b = preload; | ||||
|   uint8x16_t c = vextq_u8(a, a, 2); | ||||
|  | ||||
|   uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b); | ||||
|   uint8x16_t avg2_all = vrhaddq_u8(a, b); | ||||
|  | ||||
|   uint8x8_t preload_x8, sub_a, sub_c; | ||||
|   uint8_t result_u8; | ||||
|   uint8x8_t res_lo, res_hi; | ||||
|   uint8x16_t full_b; | ||||
|   uint16x8_t sub, sum_lo, sum_hi; | ||||
|  | ||||
|   preload_x8 = vget_low_u8(c); | ||||
|   preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3); | ||||
|  | ||||
|   result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3; | ||||
|  | ||||
|   avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15); | ||||
|   avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14); | ||||
|  | ||||
|   qcombined.val[0] = avg2_all; | ||||
|   qcombined.val[1] = avg3_all; | ||||
|  | ||||
|   sub_a = vdup_laneq_u8(preload, 4); | ||||
|  | ||||
|   // preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a} | ||||
|   full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]); | ||||
|   // preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d} | ||||
|   sub_c = vreinterpret_u8_u32(vdup_n_u32( | ||||
|       vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0))); | ||||
|  | ||||
|   sub = vsubl_u8(sub_c, sub_a); | ||||
|   sum_lo = vaddw_u8(sub, vget_low_u8(full_b)); | ||||
|   res_lo = vqmovn_u16(sum_lo); | ||||
|  | ||||
|   sum_hi = vaddw_u8(sub, vget_high_u8(full_b)); | ||||
|   res_hi = vqmovn_u16(sum_hi); | ||||
|  | ||||
|   // DC4, VE4, HE4, TM4 | ||||
|   DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0); | ||||
|   DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1); | ||||
|   DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0); | ||||
|   DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1); | ||||
|  | ||||
|   // RD4, VR4, LD4, VL4 | ||||
|   RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]); | ||||
|   RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]); | ||||
|   RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]); | ||||
|   RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]); | ||||
|  | ||||
|   // HD4, HU4 | ||||
|   result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]); | ||||
|   result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]); | ||||
|  | ||||
|   vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0)); | ||||
|   vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0)); | ||||
|   vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1)); | ||||
|   vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1)); | ||||
| } | ||||
|  | ||||
| #endif // WEBP_AARCH64 | ||||
|  | ||||
| //------------------------------------------------------------------------------ | ||||
| // Entry point | ||||
|  | ||||
| @@ -931,6 +1046,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { | ||||
|   VP8SSE8x8 = SSE8x8_NEON; | ||||
|   VP8SSE4x4 = SSE4x4_NEON; | ||||
|  | ||||
| #if WEBP_AARCH64 && (BPS == 32) | ||||
|   VP8EncPredLuma4 = Intra4Preds_NEON; | ||||
| #endif | ||||
|  | ||||
| #if !defined(WORK_AROUND_GCC) | ||||
|   VP8EncQuantizeBlock = QuantizeBlock_NEON; | ||||
|   VP8EncQuantize2Blocks = Quantize2Blocks_NEON; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user