mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-27 06:08:21 +01:00
NEON speed up
Add a TransformDC special case, and make the switch function (DoTransform) inlined. Recovers some of the CPU time lost during the addition of TransformAC3 (only on ARM). Change-Id: I21c1f0c6a9cb9d1dfc1e307b4f473a2791273bd6
This commit is contained in:
parent
d49345533f
commit
26d842eb8f
@ -671,7 +671,7 @@ static void Copy32b(uint8_t* dst, uint8_t* src) {
|
|||||||
memcpy(dst, src, 4);
|
memcpy(dst, src, 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void DoTransform(uint32_t bits, const int16_t* const src,
|
static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
|
||||||
uint8_t* const dst) {
|
uint8_t* const dst) {
|
||||||
switch (bits >> 30) {
|
switch (bits >> 30) {
|
||||||
case 3:
|
case 3:
|
||||||
|
@ -160,7 +160,7 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
|
|||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
// Inverse transforms (Paragraph 14.4)
|
// Inverse transforms (Paragraph 14.4)
|
||||||
|
|
||||||
static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
|
static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||||
const int kBPS = BPS;
|
const int kBPS = BPS;
|
||||||
const int16_t constants[] = {20091, 17734, 0, 0};
|
const int16_t constants[] = {20091, 17734, 0, 0};
|
||||||
/* kC1, kC2. Padded because vld1.16 loads 8 bytes
|
/* kC1, kC2. Padded because vld1.16 loads 8 bytes
|
||||||
@ -309,13 +309,44 @@ static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
|
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
|
||||||
TransformOneNEON(in, dst);
|
TransformOne(in, dst);
|
||||||
if (do_two) {
|
if (do_two) {
|
||||||
TransformOneNEON(in + 16, dst + 4);
|
TransformOne(in + 16, dst + 4);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void TransformDC(const int16_t* in, uint8_t* dst) {
|
||||||
|
const int DC = (in[0] + 4) >> 3;
|
||||||
|
const int kBPS = BPS;
|
||||||
|
__asm__ volatile (
|
||||||
|
"vdup.16 q1, %[DC] \n"
|
||||||
|
|
||||||
|
"vld1.32 d0[0], [%[dst]], %[kBPS] \n"
|
||||||
|
"vld1.32 d1[0], [%[dst]], %[kBPS] \n"
|
||||||
|
"vld1.32 d0[1], [%[dst]], %[kBPS] \n"
|
||||||
|
"vld1.32 d1[1], [%[dst]], %[kBPS] \n"
|
||||||
|
|
||||||
|
"sub %[dst], %[dst], %[kBPS], lsl #2 \n"
|
||||||
|
|
||||||
|
// add DC and convert to s16.
|
||||||
|
"vaddw.u8 q2, q1, d0 \n"
|
||||||
|
"vaddw.u8 q3, q1, d1 \n"
|
||||||
|
// convert back to u8 with saturation
|
||||||
|
"vqmovun.s16 d0, q2 \n"
|
||||||
|
"vqmovun.s16 d1, q3 \n"
|
||||||
|
|
||||||
|
"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
|
||||||
|
"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
|
||||||
|
"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
|
||||||
|
"vst1.32 d1[1], [%[dst]] \n"
|
||||||
|
: [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
|
||||||
|
: [kBPS] "r"(kBPS), /* constants */
|
||||||
|
[DC] "r"(DC)
|
||||||
|
: "memory", "q0", "q1", "q2", "q3" /* clobbered */
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
static void TransformWHT(const int16_t* in, int16_t* out) {
|
static void TransformWHT(const int16_t* in, int16_t* out) {
|
||||||
const int kStep = 32; // The store is only incrementing the pointer as if we
|
const int kStep = 32; // The store is only incrementing the pointer as if we
|
||||||
// had stored a single byte.
|
// had stored a single byte.
|
||||||
@ -392,7 +423,9 @@ extern void VP8DspInitNEON(void);
|
|||||||
|
|
||||||
void VP8DspInitNEON(void) {
|
void VP8DspInitNEON(void) {
|
||||||
#if defined(WEBP_USE_NEON)
|
#if defined(WEBP_USE_NEON)
|
||||||
VP8Transform = TransformTwoNEON;
|
VP8Transform = TransformTwo;
|
||||||
|
VP8TransformAC3 = TransformOne; // no special code here
|
||||||
|
VP8TransformDC = TransformDC;
|
||||||
VP8TransformWHT = TransformWHT;
|
VP8TransformWHT = TransformWHT;
|
||||||
|
|
||||||
VP8SimpleVFilter16 = SimpleVFilter16NEON;
|
VP8SimpleVFilter16 = SimpleVFilter16NEON;
|
||||||
|
Loading…
Reference in New Issue
Block a user