NEON speed up

Add a TransformDC special case, and make the switch function (DoTransform) inline.
Recovers some of the CPU time lost when TransformAC3 was added
(only on ARM).

Change-Id: I21c1f0c6a9cb9d1dfc1e307b4f473a2791273bd6
This commit is contained in:
skal 2013-12-18 22:32:58 +01:00
parent d49345533f
commit 26d842eb8f
2 changed files with 39 additions and 6 deletions

View File

@ -671,7 +671,7 @@ static void Copy32b(uint8_t* dst, uint8_t* src) {
memcpy(dst, src, 4); memcpy(dst, src, 4);
} }
static void DoTransform(uint32_t bits, const int16_t* const src, static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
uint8_t* const dst) { uint8_t* const dst) {
switch (bits >> 30) { switch (bits >> 30) {
case 3: case 3:

View File

@ -160,7 +160,7 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4) // Inverse transforms (Paragraph 14.4)
static void TransformOneNEON(const int16_t *in, uint8_t *dst) { static void TransformOne(const int16_t* in, uint8_t* dst) {
const int kBPS = BPS; const int kBPS = BPS;
const int16_t constants[] = {20091, 17734, 0, 0}; const int16_t constants[] = {20091, 17734, 0, 0};
/* kC1, kC2. Padded because vld1.16 loads 8 bytes /* kC1, kC2. Padded because vld1.16 loads 8 bytes
@ -309,13 +309,44 @@ static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
); );
} }
static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOneNEON(in, dst); TransformOne(in, dst);
if (do_two) { if (do_two) {
TransformOneNEON(in + 16, dst + 4); TransformOne(in + 16, dst + 4);
} }
} }
// DC-only inverse transform: adds a single rounded value derived from in[0]
// to four rows of four bytes at 'dst' and writes them back with unsigned
// saturation. Only in[0] is read from 'in'.
//   in:  coefficient array; just the first element is used.
//   dst: first of four rows, each 4 bytes wide and kBPS bytes apart; updated
//        in place.
static void TransformDC(const int16_t* in, uint8_t* dst) {
// Value added to every byte: in[0] plus 4, shifted right by 3.
const int DC = (in[0] + 4) >> 3;
// BPS is defined outside this function; it is used below as the byte step
// between consecutive rows (presumably the row stride of 'dst' -- confirm).
const int kBPS = BPS;
__asm__ volatile (
// Broadcast DC into the eight 16-bit lanes of q1.
"vdup.16 q1, %[DC] \n"
// Load four 32-bit rows, post-incrementing dst by kBPS after each one.
// Rows 0 and 2 land in d0 (lanes 0 and 1), rows 1 and 3 in d1.
"vld1.32 d0[0], [%[dst]], %[kBPS] \n"
"vld1.32 d1[0], [%[dst]], %[kBPS] \n"
"vld1.32 d0[1], [%[dst]], %[kBPS] \n"
"vld1.32 d1[1], [%[dst]], %[kBPS] \n"
// Rewind dst by 4 * kBPS so the stores below start at the first row again.
"sub %[dst], %[dst], %[kBPS], lsl #2 \n"
// add DC and convert to s16.
"vaddw.u8 q2, q1, d0 \n"
"vaddw.u8 q3, q1, d1 \n"
// convert back to u8 with saturation
"vqmovun.s16 d0, q2 \n"
"vqmovun.s16 d1, q3 \n"
// Store the rows back in the same lane order used by the loads. The last
// store has no post-increment, so dst ends up pointing at the fourth row.
"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
"vst1.32 d1[1], [%[dst]] \n"
// NOTE(review): [in] is declared read-write here, but the template above
// never references %[in]; only [dst] is actually modified by the asm.
: [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
: [kBPS] "r"(kBPS), /* constants */
[DC] "r"(DC)
// d0/d1 are the two halves of q0, hence q0 in the clobber list.
: "memory", "q0", "q1", "q2", "q3" /* clobbered */
);
}
static void TransformWHT(const int16_t* in, int16_t* out) { static void TransformWHT(const int16_t* in, int16_t* out) {
const int kStep = 32; // The store is only incrementing the pointer as if we const int kStep = 32; // The store is only incrementing the pointer as if we
// had stored a single byte. // had stored a single byte.
@ -392,7 +423,9 @@ extern void VP8DspInitNEON(void);
void VP8DspInitNEON(void) { void VP8DspInitNEON(void) {
#if defined(WEBP_USE_NEON) #if defined(WEBP_USE_NEON)
VP8Transform = TransformTwoNEON; VP8Transform = TransformTwo;
VP8TransformAC3 = TransformOne; // no special code here
VP8TransformDC = TransformDC;
VP8TransformWHT = TransformWHT; VP8TransformWHT = TransformWHT;
VP8SimpleVFilter16 = SimpleVFilter16NEON; VP8SimpleVFilter16 = SimpleVFilter16NEON;