cpu.h: add WEBP_AARCH64

Define WEBP_AARCH64 to 1 for __aarch64__ and for Windows Arm64 builds with Visual Studio.

Microsoft's compiler (cl.exe) does not define __aarch64__; it uses
_M_ARM64 and _M_ARM64EC to identify Arm64 targets instead.

Bug: b/277254922
Change-Id: I20e4fa07a4031599db69e3d7ba9050345315ef51
This commit is contained in:
James Zern 2023-05-01 21:55:05 -07:00
parent 8151f388eb
commit 0c496a4ff9
11 changed files with 24 additions and 16 deletions

View File

@ -12,10 +12,11 @@
// Author: Skal (pascal.massimino@gmail.com) // Author: Skal (pascal.massimino@gmail.com)
#include "src/dec/vp8i_dec.h" #include "src/dec/vp8i_dec.h"
#include "src/dsp/cpu.h"
#include "src/utils/bit_reader_inl_utils.h" #include "src/utils/bit_reader_inl_utils.h"
#if !defined(USE_GENERIC_TREE) #if !defined(USE_GENERIC_TREE)
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) #if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64
// using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then. // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
#define USE_GENERIC_TREE 1 // ALTERNATE_CODE #define USE_GENERIC_TREE 1 // ALTERNATE_CODE
#else #else

View File

@ -29,7 +29,7 @@ static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1)); const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1));
const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position)); const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position));
#ifdef __aarch64__ #if WEBP_AARCH64
res->last = vmaxvq_u8(masked) - 1; res->last = vmaxvq_u8(masked) - 1;
#else #else
const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked)); const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked));
@ -43,7 +43,7 @@ static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0); vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0);
--res->last; --res->last;
#endif // __aarch64__ #endif // WEBP_AARCH64
res->coeffs = coeffs; res->coeffs = coeffs;
} }

View File

@ -105,6 +105,12 @@
#define WEBP_USE_INTRINSICS #define WEBP_USE_INTRINSICS
#endif #endif
// WEBP_AARCH64 is 1 when building for 64-bit Arm and 0 otherwise.
// __aarch64__ is checked alongside _M_ARM64 / _M_ARM64EC because Microsoft's
// compiler (cl.exe) does not define __aarch64__.
// The macro is always defined, so test it with '#if WEBP_AARCH64' rather than
// '#ifdef' / 'defined()'.
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define WEBP_AARCH64 1
#else
#define WEBP_AARCH64 0
#endif
#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON) #if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
#define WEBP_HAVE_NEON #define WEBP_HAVE_NEON
#endif #endif
@ -134,8 +140,7 @@
#define WEBP_NEON_OMIT_C_CODE 0 #define WEBP_NEON_OMIT_C_CODE 0
#endif #endif
#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || \ #if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64)
defined(__aarch64__))
#define WEBP_NEON_WORK_AROUND_GCC 1 #define WEBP_NEON_WORK_AROUND_GCC 1
#else #else
#define WEBP_NEON_WORK_AROUND_GCC 0 #define WEBP_NEON_WORK_AROUND_GCC 0

View File

@ -1428,7 +1428,7 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) { if (do_top) {
const uint8x8_t A = vld1_u8(dst - BPS); // top row const uint8x8_t A = vld1_u8(dst - BPS); // top row
#if defined(__aarch64__) #if WEBP_AARCH64
const uint16_t p2 = vaddlv_u8(A); const uint16_t p2 = vaddlv_u8(A);
sum_top = vdupq_n_u16(p2); sum_top = vdupq_n_u16(p2);
#else #else
@ -1511,7 +1511,7 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) { if (do_top) {
const uint8x16_t A = vld1q_u8(dst - BPS); // top row const uint8x16_t A = vld1q_u8(dst - BPS); // top row
#if defined(__aarch64__) #if WEBP_AARCH64
const uint16_t p3 = vaddlvq_u8(A); const uint16_t p3 = vaddlvq_u8(A);
sum_top = vdupq_n_u16(p3); sum_top = vdupq_n_u16(p3);
#else #else

View File

@ -764,7 +764,7 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
// Horizontal sum of all four uint32_t values in 'sum'. // Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) { static int SumToInt_NEON(uint32x4_t sum) {
#if defined(__aarch64__) #if WEBP_AARCH64
return (int)vaddvq_u32(sum); return (int)vaddvq_u32(sum);
#else #else
const uint64x2_t sum2 = vpaddlq_u32(sum); const uint64x2_t sum2 = vpaddlq_u32(sum);
@ -865,7 +865,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
uint8x8x4_t shuffles; uint8x8x4_t shuffles;
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there. // non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \ #if defined(__APPLE__) && WEBP_AARCH64 && \
defined(__apple_build_version__) && (__apple_build_version__< 6020037) defined(__apple_build_version__) && (__apple_build_version__< 6020037)
uint8x16x2_t all_out; uint8x16x2_t all_out;
INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1)); INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));

View File

@ -25,7 +25,7 @@
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there. // non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \ #if defined(__APPLE__) && WEBP_AARCH64 && \
defined(__apple_build_version__) && (__apple_build_version__< 6020037) defined(__apple_build_version__) && (__apple_build_version__< 6020037)
#define USE_VTBLQ #define USE_VTBLQ
#endif #endif

View File

@ -498,7 +498,7 @@ static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there. // non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \ #if defined(__APPLE__) && WEBP_AARCH64 && \
defined(__apple_build_version__) && (__apple_build_version__< 6020037) defined(__apple_build_version__) && (__apple_build_version__< 6020037)
#define USE_VTBLQ #define USE_VTBLQ
#endif #endif

View File

@ -21,7 +21,7 @@
// Right now, some intrinsics functions seem slower, so we disable them // Right now, some intrinsics functions seem slower, so we disable them
// everywhere except newer clang/gcc or aarch64 where the inline assembly is // everywhere except newer clang/gcc or aarch64 where the inline assembly is
// incompatible. // incompatible.
#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__) #if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || WEBP_AARCH64
#define WEBP_USE_INTRINSICS // use intrinsics when possible #define WEBP_USE_INTRINSICS // use intrinsics when possible
#endif #endif
@ -46,7 +46,7 @@
// if using intrinsics, this flag avoids some functions that make gcc-4.6.3 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
// crash ("internal compiler error: in immed_double_const, at emit-rtl."). // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) #if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || WEBP_AARCH64)
#define WORK_AROUND_GCC #define WORK_AROUND_GCC
#endif #endif

View File

@ -22,7 +22,7 @@
#define IsFlat IsFlat_NEON #define IsFlat IsFlat_NEON
static uint32_t horizontal_add_uint32x4(const uint32x4_t a) { static uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
#if defined(__aarch64__) #if WEBP_AARCH64
return vaddvq_u32(a); return vaddvq_u32(a);
#else #else
const uint64x2_t b = vpaddlq_u32(a); const uint64x2_t b = vpaddlq_u32(a);

View File

@ -15,6 +15,7 @@
#include "src/webp/config.h" #include "src/webp/config.h"
#endif #endif
#include "src/dsp/cpu.h"
#include "src/utils/bit_reader_inl_utils.h" #include "src/utils/bit_reader_inl_utils.h"
#include "src/utils/utils.h" #include "src/utils/utils.h"
@ -121,7 +122,7 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits,
#define VP8L_LOG8_WBITS 4 // Number of bytes needed to store VP8L_WBITS bits. #define VP8L_LOG8_WBITS 4 // Number of bytes needed to store VP8L_WBITS bits.
#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ #if defined(__arm__) || defined(_M_ARM) || WEBP_AARCH64 || \
defined(__i386__) || defined(_M_IX86) || \ defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64__) || defined(_M_X64) defined(__x86_64__) || defined(_M_X64)
#define VP8L_USE_FAST_LOAD #define VP8L_USE_FAST_LOAD

View File

@ -19,6 +19,7 @@
#ifdef _MSC_VER #ifdef _MSC_VER
#include <stdlib.h> // _byteswap_ulong #include <stdlib.h> // _byteswap_ulong
#endif #endif
#include "src/dsp/cpu.h"
#include "src/webp/types.h" #include "src/webp/types.h"
// Warning! This macro triggers quite some MACRO wizardry around func signature! // Warning! This macro triggers quite some MACRO wizardry around func signature!
@ -64,7 +65,7 @@ extern "C" {
#define BITS 56 #define BITS 56
#elif defined(__arm__) || defined(_M_ARM) // ARM #elif defined(__arm__) || defined(_M_ARM) // ARM
#define BITS 24 #define BITS 24
#elif defined(__aarch64__) // ARM 64bit #elif WEBP_AARCH64 // ARM 64bit
#define BITS 56 #define BITS 56
#elif defined(__mips__) // MIPS #elif defined(__mips__) // MIPS
#define BITS 24 #define BITS 24