add light filtering NEON intrinsics

disabled for now (but tested OK), thanks to the USE_INTRINSICS #define
We'll activate the code when we're on par with non-intrinsics

Change-Id: Idbfb9cb01f4c7c9f5131b270f8c11b70d0d485ff
This commit is contained in:
Pascal Massimino 2014-03-30 09:12:41 -07:00
parent af44460880
commit daccbf400d

View File

@ -16,6 +16,8 @@
#if defined(WEBP_USE_NEON)
// #define USE_INTRINSICS // use intrinsics when possible
#include <arm_neon.h>
#include "../dec/vp8i.h"
@ -68,6 +70,65 @@
DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
FLIP_SIGN_BIT2(p0, q0, q10)
#if defined(USE_INTRINSICS)
static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
int thresh) {
const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)
const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)
const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2
const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
return mask;
}
static int8x16_t FlipSign(const uint8x16_t v) {
const uint8x16_t sign_bit = vdupq_n_u8(0x80);
return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
}
static uint8x16_t FlipSignBack(const int8x16_t v) {
const int8x16_t sign_bit = vdupq_n_s8(0x80);
return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
}
static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
const int8x16_t q0, const int8x16_t q1) {
const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)
const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)
const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)
return s3;
}
static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
uint8x16_t* const op0, uint8x16_t* const oq0,
int thresh) {
const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
const int8x16_t p1s = FlipSign(p1);
const int8x16_t p0s = FlipSign(p0);
const int8x16_t q0s = FlipSign(q0);
const int8x16_t q1s = FlipSign(q1);
const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
// DoSimpleFilter:
const int8x16_t kCst3 = vdupq_n_s8(0x03);
const int8x16_t kCst4 = vdupq_n_s8(0x04);
const int8x16_t delta_p3 = vqaddq_s8(delta1, kCst3);
const int8x16_t delta_p4 = vqaddq_s8(delta1, kCst4);
const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
*op0 = FlipSignBack(sp0);
*oq0 = FlipSignBack(sq0);
}
#endif // USE_INTRINSICS
// Load/Store vertical edge
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
"vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
@ -113,6 +174,7 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
// Simple In-loop filtering (Paragraph 15.2)
static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
#if !defined(USE_INTRINSICS)
__asm__ volatile (
"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
@ -131,6 +193,16 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
: [stride] "r"(stride), [thresh] "r"(thresh)
: "memory", QRegs
);
#else
const uint8x16_t p1 = vld1q_u8(p - 2 * stride);
const uint8x16_t p0 = vld1q_u8(p - 1 * stride);
const uint8x16_t q0 = vld1q_u8(p + 0 * stride);
const uint8x16_t q1 = vld1q_u8(p + 1 * stride);
uint8x16_t oq0, op0;
DoFilter2(p1, p0, q0, q1, &op0, &oq0, thresh);
vst1q_u8(p - stride, op0);
vst1q_u8(p, oq0);
#endif
}
static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {