mirror of
https://github.com/webmproject/libwebp.git
synced 2024-11-20 12:28:26 +01:00
dec_neon: add DC8uvNoTop / DC8uvNoLeft
adds do_top/do_left flags to DC8uv; ~88% / ~92% faster respectively no change in DC8uv speed. Change-Id: I109ec4d9ad13c9db64516e98ed4693a21a3e9b54
This commit is contained in:
parent
d8340da756
commit
14108d7878
@ -1389,35 +1389,59 @@ static void LD4(uint8_t* dst) { // Down-left
|
|||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Chroma
|
// Chroma
|
||||||
|
|
||||||
static void DC8uv(uint8_t* dst) { // DC
|
static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
|
||||||
const uint8x8_t A = vld1_u8(dst - BPS); // top row
|
uint16x8_t sum_top;
|
||||||
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
|
uint16x8_t sum_left;
|
||||||
const uint16x4_t p1 = vpadd_u16(p0, p0);
|
uint8x8_t dc0;
|
||||||
const uint16x4_t p2 = vpadd_u16(p1, p1);
|
|
||||||
const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
|
if (do_top) {
|
||||||
const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
|
const uint8x8_t A = vld1_u8(dst - BPS); // top row
|
||||||
const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
|
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
|
||||||
const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
|
const uint16x4_t p1 = vpadd_u16(p0, p0);
|
||||||
const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
|
const uint16x4_t p2 = vpadd_u16(p1, p1);
|
||||||
const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
|
sum_top = vcombine_u16(p2, p2);
|
||||||
const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
|
}
|
||||||
const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
|
|
||||||
const uint16x8_t s0 = vaddq_u16(L0, L1);
|
if (do_left) {
|
||||||
const uint16x8_t s1 = vaddq_u16(L2, L3);
|
const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
|
||||||
const uint16x8_t s2 = vaddq_u16(L4, L5);
|
const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
|
||||||
const uint16x8_t s3 = vaddq_u16(L6, L7);
|
const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
|
||||||
const uint16x8_t s01 = vaddq_u16(s0, s1);
|
const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
|
||||||
const uint16x8_t s23 = vaddq_u16(s2, s3);
|
const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
|
||||||
const uint16x8_t s0123 = vaddq_u16(s01, s23);
|
const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
|
||||||
const uint16x8_t sum = vaddq_u16(s0123, vcombine_u16(p2, p2));
|
const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
|
||||||
const uint8x8_t dc0 = vrshrn_n_u16(sum, 4); // (sum + 8) >> 4
|
const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
|
||||||
const uint8x8_t dc = vdup_lane_u8(dc0, 0);
|
const uint16x8_t s0 = vaddq_u16(L0, L1);
|
||||||
int i;
|
const uint16x8_t s1 = vaddq_u16(L2, L3);
|
||||||
for (i = 0; i < 8; ++i) {
|
const uint16x8_t s2 = vaddq_u16(L4, L5);
|
||||||
vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
|
const uint16x8_t s3 = vaddq_u16(L6, L7);
|
||||||
|
const uint16x8_t s01 = vaddq_u16(s0, s1);
|
||||||
|
const uint16x8_t s23 = vaddq_u16(s2, s3);
|
||||||
|
sum_left = vaddq_u16(s01, s23);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (do_top && do_left) {
|
||||||
|
const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
|
||||||
|
dc0 = vrshrn_n_u16(sum, 4);
|
||||||
|
} else if (do_top) {
|
||||||
|
dc0 = vrshrn_n_u16(sum_top, 3);
|
||||||
|
} else {
|
||||||
|
dc0 = vrshrn_n_u16(sum_left, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const uint8x8_t dc = vdup_lane_u8(dc0, 0);
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < 8; ++i) {
|
||||||
|
vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void DC8uv(uint8_t* dst) { return DC8(dst, 1, 1); }
|
||||||
|
static void DC8uvNoTop(uint8_t* dst) { return DC8(dst, 0, 1); }
|
||||||
|
static void DC8uvNoLeft(uint8_t* dst) { return DC8(dst, 1, 0); }
|
||||||
|
|
||||||
static void TM8uv(uint8_t* dst) { return TrueMotion(dst, 8); }
|
static void TM8uv(uint8_t* dst) { return TrueMotion(dst, 8); }
|
||||||
|
|
||||||
#endif // WEBP_USE_NEON
|
#endif // WEBP_USE_NEON
|
||||||
@ -1459,5 +1483,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
|
|||||||
|
|
||||||
VP8PredChroma8[0] = DC8uv;
|
VP8PredChroma8[0] = DC8uv;
|
||||||
VP8PredChroma8[1] = TM8uv;
|
VP8PredChroma8[1] = TM8uv;
|
||||||
|
VP8PredChroma8[4] = DC8uvNoTop;
|
||||||
|
VP8PredChroma8[5] = DC8uvNoLeft;
|
||||||
#endif // WEBP_USE_NEON
|
#endif // WEBP_USE_NEON
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user