From bc03670f014d370011301008a9cbc4758619803d Mon Sep 17 00:00:00 2001 From: James Zern Date: Sun, 29 Jun 2014 13:40:45 -0700 Subject: [PATCH] neon: add INIT_VECTOR4 used to initialize NxMx4 vector types replaces initialization via '{{ }}' gnu-ism. Change-Id: I0da7b3d321f3d48579b7863fb2e4d3f449ae7f5e --- src/dsp/dec_neon.c | 30 ++++++++++++++++++++---------- src/dsp/enc_neon.c | 17 ++++++++++------- src/dsp/lossless_neon.c | 22 ++++++++++++---------- src/dsp/neon.h | 7 +++++++ src/dsp/upsampling_neon.c | 6 ++++-- 5 files changed, 53 insertions(+), 29 deletions(-) diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c index b820eae1..c4c48bc2 100644 --- a/src/dsp/dec_neon.c +++ b/src/dsp/dec_neon.c @@ -49,7 +49,9 @@ // (register alloc, probably). The variants somewhat mitigate the problem, but // not quite. HFilter16i() remains problematic. static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) { - uint8x8x4_t out = {{{0}, {0}, {0}, {0}}}; + const uint8x8_t zero = vdup_n_u8(0); + uint8x8x4_t out; + INIT_VECTOR4(out, zero, zero, zero, zero); out = vld4_lane_u8(src + 0 * stride, out, 0); out = vld4_lane_u8(src + 1 * stride, out, 1); out = vld4_lane_u8(src + 2 * stride, out, 2); @@ -84,7 +86,9 @@ static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride, static WEBP_INLINE void Load4x16(const uint8_t* src, int stride, uint8x16_t* const p1, uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1) { - uint32x4x4_t in = {{{0}, {0}, {0}, {0}}}; + const uint32x4_t zero = vdupq_n_u32(0); + uint32x4x4_t in; + INIT_VECTOR4(in, zero, zero, zero, zero); src -= 2; LOADQ_LANE_32b(in.val[0], 0); LOADQ_LANE_32b(in.val[1], 0); @@ -273,10 +277,13 @@ static WEBP_INLINE void Store4x8(const uint8x8x4_t v, static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, uint8_t* const dst, int stride) { - const uint8x8x4_t lo = {{ vget_low_u8(p1), vget_low_u8(p0), - vget_low_u8(q0), vget_low_u8(q1) }}; - const uint8x8x4_t hi = {{ vget_high_u8(p1), vget_high_u8(p0), - vget_high_u8(q0), vget_high_u8(q1) }}; + uint8x8x4_t lo, hi; + INIT_VECTOR4(lo, + vget_low_u8(p1), vget_low_u8(p0), + vget_low_u8(q0), vget_low_u8(q1)); + INIT_VECTOR4(hi, + vget_high_u8(p1), vget_high_u8(p0), + vget_high_u8(q0), vget_high_u8(q1)); Store4x8(lo, dst - 2 + 0 * stride, stride); Store4x8(hi, dst - 2 + 8 * stride, stride); } @@ -355,10 +362,13 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, uint8_t* const u, uint8_t* const v, int stride) { - const uint8x8x4_t u0 = {{ vget_low_u8(p1), vget_low_u8(p0), - vget_low_u8(q0), vget_low_u8(q1) }}; - const uint8x8x4_t v0 = {{ vget_high_u8(p1), vget_high_u8(p0), - vget_high_u8(q0), vget_high_u8(q1) }}; + uint8x8x4_t u0, v0; + INIT_VECTOR4(u0, + vget_low_u8(p1), vget_low_u8(p0), + vget_low_u8(q0), vget_low_u8(q1)); + INIT_VECTOR4(v0, + vget_high_u8(p1), vget_high_u8(p0), + vget_high_u8(q0), vget_high_u8(q1)); vst4_lane_u8(u - 2 + 0 * stride, u0, 0); vst4_lane_u8(u - 2 + 1 * stride, u0, 1); vst4_lane_u8(u - 2 + 2 * stride, u0, 2); diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 53e0a6c8..ca7c44a1 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -479,9 +479,11 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, } while (0) static void FTransformWHT(const int16_t* src, int16_t* out) { - int32x4x4_t tmp0; const int stride = 16; - int16x4x4_t in = {{{0}, {0}, {0}, {0}}}; + const int16x4_t zero = vdup_n_s16(0); + int32x4x4_t tmp0; + int16x4x4_t in; + INIT_VECTOR4(in, zero, zero, zero, zero); LOAD_LANE_16b(in.val[0], 0); LOAD_LANE_16b(in.val[1], 0); LOAD_LANE_16b(in.val[2], 0); @@ -1010,11 +1012,12 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], const VP8Matrix* const mtx) { const int16x8_t out0 = Quantize(in, mtx, 0); const int16x8_t out1 = Quantize(in, mtx, 8); - const uint8x8x4_t all_out = {{ - vreinterpret_u8_s16(vget_low_s16(out0)), - vreinterpret_u8_s16(vget_high_s16(out0)), - vreinterpret_u8_s16(vget_low_s16(out1)), - vreinterpret_u8_s16(vget_high_s16(out1)) }}; + uint8x8x4_t all_out; + INIT_VECTOR4(all_out, + vreinterpret_u8_s16(vget_low_s16(out0)), + vreinterpret_u8_s16(vget_high_s16(out0)), + vreinterpret_u8_s16(vget_low_s16(out1)), + vreinterpret_u8_s16(vget_high_s16(out1))); // Zigzag reordering vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0]))); vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1]))); diff --git a/src/dsp/lossless_neon.c b/src/dsp/lossless_neon.c index 954e3877..987767b5 100644 --- a/src/dsp/lossless_neon.c +++ b/src/dsp/lossless_neon.c @@ -96,11 +96,12 @@ static void ConvertBGRAToBGR(const uint32_t* src, const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]); const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]); for (; src < end; src += 8) { - const uint8x8x4_t pixels = {{ - vld1_u8((const uint8_t*)(src + 0)), - vld1_u8((const uint8_t*)(src + 2)), - vld1_u8((const uint8_t*)(src + 4)), - vld1_u8((const uint8_t*)(src + 6)) }}; + uint8x8x4_t pixels; + INIT_VECTOR4(pixels, + vld1_u8((const uint8_t*)(src + 0)), + vld1_u8((const uint8_t*)(src + 2)), + vld1_u8((const uint8_t*)(src + 4)), + vld1_u8((const uint8_t*)(src + 6))); vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0)); vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); @@ -122,11 +123,12 @@ static void ConvertBGRAToRGB(const uint32_t* src, const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]); const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]); for (; src < end; src += 8) { - const uint8x8x4_t pixels = {{ - vld1_u8((const uint8_t*)(src + 0)), - vld1_u8((const uint8_t*)(src + 2)), - vld1_u8((const uint8_t*)(src + 4)), - vld1_u8((const uint8_t*)(src + 6)) }}; + uint8x8x4_t pixels; + INIT_VECTOR4(pixels, + vld1_u8((const uint8_t*)(src + 0)), + vld1_u8((const uint8_t*)(src + 2)), + vld1_u8((const uint8_t*)(src + 4)), + vld1_u8((const uint8_t*)(src + 6))); vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0)); vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); diff --git a/src/dsp/neon.h b/src/dsp/neon.h index 47a91669..7e06eaee 100644 --- a/src/dsp/neon.h +++ b/src/dsp/neon.h @@ -33,6 +33,13 @@ v.val[2] = c; \ } while (0) +#define INIT_VECTOR4(v, a, b, c, d) do { \ + v.val[0] = a; \ + v.val[1] = b; \ + v.val[2] = c; \ + v.val[3] = d; \ +} while (0) + // if using intrinsics, this flag avoids some functions that make gcc-4.6.3 // crash ("internal compiler error: in immed_double_const, at emit-rtl."). // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) diff --git a/src/dsp/upsampling_neon.c b/src/dsp/upsampling_neon.c index b607f83b..c3db70dc 100644 --- a/src/dsp/upsampling_neon.c +++ b/src/dsp/upsampling_neon.c @@ -106,12 +106,14 @@ static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG }; } while (0) #define STORE_Rgba(out, r, g, b) do { \ - const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }}; \ + uint8x8x4_t r_g_b_v255; \ + INIT_VECTOR4(r_g_b_v255, r, g, b, v255); \ vst4_u8(out, r_g_b_v255); \ } while (0) #define STORE_Bgra(out, r, g, b) do { \ - const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }}; \ + uint8x8x4_t b_g_r_v255; \ + INIT_VECTOR4(b_g_r_v255, b, g, r, v255); \ vst4_u8(out, b_g_r_v255); \ } while (0)