From fa52d7525ffe090335939b70c53e3b4acf94f21b Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 2 Apr 2014 23:03:18 -0700
Subject: [PATCH] dec_neon: use vld?_lane instead of vset?_lane

results in fewer instructions, small speed improvement

Change-Id: I61ab48d09a5ce7c5158eac8244d28287457edc7a
---
 src/dsp/dec_neon.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
index 75628b16..2d1f623f 100644
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -114,12 +114,12 @@ static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
 #else
 
 #define LOAD_LANE_32b(VALUE, LANE) do { \
-  (VALUE) = vset_lane_u32(*(const uint32_t*)src, (VALUE), (LANE)); \
+  (VALUE) = vld1_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \
   src += stride; \
 } while (0)
 
 #define LOADQ_LANE_32b(VALUE, LANE) do { \
-  (VALUE) = vsetq_lane_u32(*(const uint32_t*)src, (VALUE), (LANE)); \
+  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \
   src += stride; \
 } while (0)
 
@@ -1142,10 +1142,10 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
   uint32x2_t dst23 = {0, 0};
 
   // Load the source pixels.
-  dst01 = vset_lane_u32(*(uint32_t*)(dst + 0 * BPS), dst01, 0);
-  dst23 = vset_lane_u32(*(uint32_t*)(dst + 2 * BPS), dst23, 0);
-  dst01 = vset_lane_u32(*(uint32_t*)(dst + 1 * BPS), dst01, 1);
-  dst23 = vset_lane_u32(*(uint32_t*)(dst + 3 * BPS), dst23, 1);
+  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
+  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
+  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
+  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
 
   {
     // Convert to 16b.
@@ -1248,10 +1248,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   uint32x2_t dst23 = {0, 0};
 
   // Load the source pixels.
-  dst01 = vset_lane_u32(*(uint32_t*)(dst + 0 * BPS), dst01, 0);
-  dst23 = vset_lane_u32(*(uint32_t*)(dst + 2 * BPS), dst23, 0);
-  dst01 = vset_lane_u32(*(uint32_t*)(dst + 1 * BPS), dst01, 1);
-  dst23 = vset_lane_u32(*(uint32_t*)(dst + 3 * BPS), dst23, 1);
+  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
+  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
+  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
+  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
 
   {
     // Convert to 16b.