dec_neon/TrueMotion: simplify left border load

Use vld1_dup_u8() to load and broadcast each left-border byte in one step,
rather than a separate load followed by a dup after the values were
zero-extended; mildly faster at the function level.

Change-Id: I1b3666a6aeb465722a1214dbc6d71c27689a7f89
This commit is contained in:
James Zern 2015-05-15 12:48:13 -07:00
parent bf46d0acff
commit ea95b305ca

View File

@ -1289,14 +1289,10 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
int y;
for (y = 0; y < size; y += 4) {
// left edge
const int16x8_t l0 = ConvertU8ToS16(vld1_u8(dst + 0 * BPS - 1));
const int16x8_t l1 = ConvertU8ToS16(vld1_u8(dst + 1 * BPS - 1));
const int16x8_t l2 = ConvertU8ToS16(vld1_u8(dst + 2 * BPS - 1));
const int16x8_t l3 = ConvertU8ToS16(vld1_u8(dst + 3 * BPS - 1));
const int16x8_t L0 = vdupq_lane_s16(vget_low_s16(l0), 0);
const int16x8_t L1 = vdupq_lane_s16(vget_low_s16(l1), 0);
const int16x8_t L2 = vdupq_lane_s16(vget_low_s16(l2), 0);
const int16x8_t L3 = vdupq_lane_s16(vget_low_s16(l3), 0);
const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1]
const int16x8_t r1 = vaddq_s16(L1, d);
const int16x8_t r2 = vaddq_s16(L2, d);