mirror of
https://github.com/webmproject/libwebp.git
synced 2024-11-20 12:28:26 +01:00
enc_neon: enable intrinsics-only functions
CollectHistogram / SSE* / QuantizeBlock have no inline-assembly equivalents, so enable them wherever possible. Use USE_INTRINSICS to control the borderline cases; it is left undefined for now. Change-Id: I62235bc4ddb8aa0769d1ce18a90e0d7da1e18155
This commit is contained in:
parent
f937e01261
commit
42b35e086b
@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
#if defined(WEBP_USE_NEON)
|
#if defined(WEBP_USE_NEON)
|
||||||
|
|
||||||
#define USE_INTRINSICS // use intrinsics when possible
|
// #define USE_INTRINSICS // use intrinsics when possible
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
@ -36,7 +36,7 @@ static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
|
|||||||
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
|
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
|
||||||
// USE_INTRINSICS define.
|
// USE_INTRINSICS define.
|
||||||
// With gcc-4.8, it's a little faster than inlined-assembly.
|
// With gcc-4.8, it's a little faster than inlined-assembly.
|
||||||
#if 0 // defined(USE_INTRINSICS)
|
#if defined(USE_INTRINSICS)
|
||||||
|
|
||||||
// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
|
// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
|
||||||
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
|
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
|
||||||
@ -264,7 +264,7 @@ static uint8x16_t Load4x4(const uint8_t* src) {
|
|||||||
|
|
||||||
// Forward transform.
|
// Forward transform.
|
||||||
|
|
||||||
#if 0 // #ifdef USE_INTRINSICS
|
#if defined(USE_INTRINSICS)
|
||||||
|
|
||||||
static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
|
static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
|
||||||
const int16x4_t C, const int16x4_t D,
|
const int16x4_t C, const int16x4_t D,
|
||||||
@ -583,7 +583,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
|
|||||||
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
|
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
|
||||||
// USE_INTRINSICS define.
|
// USE_INTRINSICS define.
|
||||||
// With gcc-4.8, it's only slightly slower than the inlined assembly.
|
// With gcc-4.8, it's only slightly slower than the inlined assembly.
|
||||||
#if 0 // #ifdef USE_INTRINSICS
|
#if defined(USE_INTRINSICS)
|
||||||
|
|
||||||
// Zero extend an uint16x4_t 'v' to an int32x4_t.
|
// Zero extend an uint16x4_t 'v' to an int32x4_t.
|
||||||
static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
|
static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
|
||||||
@ -909,9 +909,6 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
|||||||
return D;
|
return D;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#if defined(USE_INTRINSICS)
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
||||||
@ -1002,13 +999,11 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
|||||||
return SumToInt(vpaddlq_u16(prod));
|
return SumToInt(vpaddlq_u16(prod));
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // USE_INTRINSICS
|
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
// Compilation with gcc-4.6.x is problematic for now. Disable this function
|
// Compilation with gcc-4.6.x is problematic for now. Disable this function
|
||||||
// in this case.
|
// in this case.
|
||||||
#if (__GNUC__ <= 4 && __GNUC_MINOR__ < 8)
|
#if !LOCAL_GCC_PREREQ(4,8)
|
||||||
#define SKIP_QUANTIZE
|
#define SKIP_QUANTIZE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -1088,15 +1083,11 @@ void VP8EncDspInitNEON(void) {
|
|||||||
|
|
||||||
VP8TDisto4x4 = Disto4x4;
|
VP8TDisto4x4 = Disto4x4;
|
||||||
VP8TDisto16x16 = Disto16x16;
|
VP8TDisto16x16 = Disto16x16;
|
||||||
#if defined(USE_INTRINSICS)
|
|
||||||
VP8CollectHistogram = CollectHistogram;
|
VP8CollectHistogram = CollectHistogram;
|
||||||
VP8SSE16x16 = SSE16x16;
|
VP8SSE16x16 = SSE16x16;
|
||||||
VP8SSE16x8 = SSE16x8;
|
VP8SSE16x8 = SSE16x8;
|
||||||
VP8SSE8x8 = SSE8x8;
|
VP8SSE8x8 = SSE8x8;
|
||||||
VP8SSE4x4 = SSE4x4;
|
VP8SSE4x4 = SSE4x4;
|
||||||
#else
|
|
||||||
(void)Load4x4; // to avoid a warning
|
|
||||||
#endif
|
|
||||||
#if !defined(SKIP_QUANTIZE)
|
#if !defined(SKIP_QUANTIZE)
|
||||||
VP8EncQuantizeBlock = QuantizeBlock;
|
VP8EncQuantizeBlock = QuantizeBlock;
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user