Merge changes I0da7b3d3,Idad2f278,I4accc305

* changes:
  neon: add INIT_VECTOR4
  neon: add INIT_VECTOR3
  neon: add INIT_VECTOR2
commit 6ee7160dd2
Author:    James Zern
Date:      2014-07-01 00:48:38 -07:00
Committer: Gerrit Code Review
5 changed files with 81 additions and 41 deletions
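
For context, the pattern the three patches apply is the same everywhere: a NEON multi-vector struct (e.g. int16x8x2_t) is no longer brace-initialized at its declaration, but declared first and then filled member by member through the new INIT_VECTOR* macros from neon.h. The sketch below is illustration only, not part of the commit; it assumes an ARM/AArch64 target with NEON enabled, and the helpers LoadRowsOld/LoadRowsNew are hypothetical names used just to contrast the two forms.

// Illustration only: requires an ARM/AArch64 compiler with NEON support.
#include <arm_neon.h>

// INIT_VECTOR2 as introduced in neon.h by this change.
#define INIT_VECTOR2(v, a, b) do {  \
  v.val[0] = a;                     \
  v.val[1] = b;                     \
} while (0)

// Old style: aggregate-initialize the multi-vector struct in its declaration.
static int16x8x2_t LoadRowsOld(const int16_t* in) {
  const int16x8x2_t rows = {{ vld1q_s16(in + 0), vld1q_s16(in + 8) }};
  return rows;
}

// New style: declare first, then assign each .val[] member via the macro.
static int16x8x2_t LoadRowsNew(const int16_t* in) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  return rows;
}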

View File

@@ -49,7 +49,9 @@
 // (register alloc, probably). The variants somewhat mitigate the problem, but
 // not quite. HFilter16i() remains problematic.
 static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
-  uint8x8x4_t out = {{{0}, {0}, {0}, {0}}};
+  const uint8x8_t zero = vdup_n_u8(0);
+  uint8x8x4_t out;
+  INIT_VECTOR4(out, zero, zero, zero, zero);
   out = vld4_lane_u8(src + 0 * stride, out, 0);
   out = vld4_lane_u8(src + 1 * stride, out, 1);
   out = vld4_lane_u8(src + 2 * stride, out, 2);
@@ -84,7 +86,9 @@ static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
 static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
                                  uint8x16_t* const p1, uint8x16_t* const p0,
                                  uint8x16_t* const q0, uint8x16_t* const q1) {
-  uint32x4x4_t in = {{{0}, {0}, {0}, {0}}};
+  const uint32x4_t zero = vdupq_n_u32(0);
+  uint32x4x4_t in;
+  INIT_VECTOR4(in, zero, zero, zero, zero);
   src -= 2;
   LOADQ_LANE_32b(in.val[0], 0);
   LOADQ_LANE_32b(in.val[1], 0);
@@ -273,10 +277,13 @@ static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
 static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   uint8_t* const dst, int stride) {
-  const uint8x8x4_t lo = {{ vget_low_u8(p1), vget_low_u8(p0),
-                            vget_low_u8(q0), vget_low_u8(q1) }};
-  const uint8x8x4_t hi = {{ vget_high_u8(p1), vget_high_u8(p0),
-                            vget_high_u8(q0), vget_high_u8(q1) }};
+  uint8x8x4_t lo, hi;
+  INIT_VECTOR4(lo,
+               vget_low_u8(p1), vget_low_u8(p0),
+               vget_low_u8(q0), vget_low_u8(q1));
+  INIT_VECTOR4(hi,
+               vget_high_u8(p1), vget_high_u8(p0),
+               vget_high_u8(q0), vget_high_u8(q1));
   Store4x8(lo, dst - 2 + 0 * stride, stride);
   Store4x8(hi, dst - 2 + 8 * stride, stride);
 }
@@ -327,12 +334,11 @@ static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
                                    const uint8x16_t q1, const uint8x16_t q2,
                                    uint8_t* u, uint8_t* v,
                                    int stride) {
-  const uint8x8x3_t u0 = {{vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0)}};
-  const uint8x8x3_t u1 = {{vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2)}};
-  const uint8x8x3_t v0 =
-      {{vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0)}};
-  const uint8x8x3_t v1 =
-      {{vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2)}};
+  uint8x8x3_t u0, u1, v0, v1;
+  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
+  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
+  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
+  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
   STORE6_LANE(u, u0, u1, 0);
   STORE6_LANE(u, u0, u1, 1);
   STORE6_LANE(u, u0, u1, 2);
@@ -356,10 +362,13 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
                                    const uint8x16_t q0, const uint8x16_t q1,
                                    uint8_t* const u, uint8_t* const v,
                                    int stride) {
-  const uint8x8x4_t u0 = {{ vget_low_u8(p1), vget_low_u8(p0),
-                            vget_low_u8(q0), vget_low_u8(q1) }};
-  const uint8x8x4_t v0 = {{ vget_high_u8(p1), vget_high_u8(p0),
-                            vget_high_u8(q0), vget_high_u8(q1) }};
+  uint8x8x4_t u0, v0;
+  INIT_VECTOR4(u0,
+               vget_low_u8(p1), vget_low_u8(p0),
+               vget_low_u8(q0), vget_low_u8(q1));
+  INIT_VECTOR4(v0,
+               vget_high_u8(p1), vget_high_u8(p0),
+               vget_high_u8(q0), vget_high_u8(q1));
   vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
   vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
   vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
@@ -1014,7 +1023,8 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
 }
 
 static void TransformOne(const int16_t* in, uint8_t* dst) {
-  int16x8x2_t rows = {{ vld1q_s16(in + 0), vld1q_s16(in + 8) }};
+  int16x8x2_t rows;
+  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   TransformPass(&rows);
   TransformPass(&rows);
   Add4x4(rows.val[0], rows.val[1], dst);

View File

@@ -118,7 +118,8 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
 
 static void ITransformOne(const uint8_t* ref,
                           const int16_t* in, uint8_t* dst) {
-  int16x8x2_t rows = {{ vld1q_s16(in + 0), vld1q_s16(in + 8) }};
+  int16x8x2_t rows;
+  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   TransformPass(&rows);
   TransformPass(&rows);
   Add4x4(rows.val[0], rows.val[1], ref, dst);
@@ -478,9 +479,11 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
 } while (0)
 
 static void FTransformWHT(const int16_t* src, int16_t* out) {
-  int32x4x4_t tmp0;
   const int stride = 16;
-  int16x4x4_t in = {{{0}, {0}, {0}, {0}}};
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4x4_t tmp0;
+  int16x4x4_t in;
+  INIT_VECTOR4(in, zero, zero, zero, zero);
   LOAD_LANE_16b(in.val[0], 0);
   LOAD_LANE_16b(in.val[1], 0);
   LOAD_LANE_16b(in.val[2], 0);
@@ -1009,11 +1012,12 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
                          const VP8Matrix* const mtx) {
   const int16x8_t out0 = Quantize(in, mtx, 0);
   const int16x8_t out1 = Quantize(in, mtx, 8);
-  const uint8x8x4_t all_out = {{
-      vreinterpret_u8_s16(vget_low_s16(out0)),
-      vreinterpret_u8_s16(vget_high_s16(out0)),
-      vreinterpret_u8_s16(vget_low_s16(out1)),
-      vreinterpret_u8_s16(vget_high_s16(out1)) }};
+  uint8x8x4_t all_out;
+  INIT_VECTOR4(all_out,
+               vreinterpret_u8_s16(vget_low_s16(out0)),
+               vreinterpret_u8_s16(vget_high_s16(out0)),
+               vreinterpret_u8_s16(vget_low_s16(out1)),
+               vreinterpret_u8_s16(vget_high_s16(out1)));
   // Zigzag reordering
   vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0])));
   vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1])));

View File

@@ -96,11 +96,12 @@ static void ConvertBGRAToBGR(const uint32_t* src,
   const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
   const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
   for (; src < end; src += 8) {
-    const uint8x8x4_t pixels = {{
-        vld1_u8((const uint8_t*)(src + 0)),
-        vld1_u8((const uint8_t*)(src + 2)),
-        vld1_u8((const uint8_t*)(src + 4)),
-        vld1_u8((const uint8_t*)(src + 6)) }};
+    uint8x8x4_t pixels;
+    INIT_VECTOR4(pixels,
+                 vld1_u8((const uint8_t*)(src + 0)),
+                 vld1_u8((const uint8_t*)(src + 2)),
+                 vld1_u8((const uint8_t*)(src + 4)),
+                 vld1_u8((const uint8_t*)(src + 6)));
     vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
     vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
     vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
@@ -122,11 +123,12 @@ static void ConvertBGRAToRGB(const uint32_t* src,
   const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
   const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
   for (; src < end; src += 8) {
-    const uint8x8x4_t pixels = {{
-        vld1_u8((const uint8_t*)(src + 0)),
-        vld1_u8((const uint8_t*)(src + 2)),
-        vld1_u8((const uint8_t*)(src + 4)),
-        vld1_u8((const uint8_t*)(src + 6)) }};
+    uint8x8x4_t pixels;
+    INIT_VECTOR4(pixels,
+                 vld1_u8((const uint8_t*)(src + 0)),
+                 vld1_u8((const uint8_t*)(src + 2)),
+                 vld1_u8((const uint8_t*)(src + 4)),
+                 vld1_u8((const uint8_t*)(src + 6)));
     vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
     vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
     vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));

View File

@@ -22,6 +22,24 @@
 #define USE_INTRINSICS  // use intrinsics when possible
 #endif
 
+#define INIT_VECTOR2(v, a, b) do {  \
+  v.val[0] = a;                     \
+  v.val[1] = b;                     \
+} while (0)
+
+#define INIT_VECTOR3(v, a, b, c) do {  \
+  v.val[0] = a;                        \
+  v.val[1] = b;                        \
+  v.val[2] = c;                        \
+} while (0)
+
+#define INIT_VECTOR4(v, a, b, c, d) do {  \
+  v.val[0] = a;                           \
+  v.val[1] = b;                           \
+  v.val[2] = c;                           \
+  v.val[3] = d;                           \
+} while (0)
+
 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
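
As a usage note, the call sites in this commit combine INIT_VECTOR4 with a splatted zero before per-lane loads. The standalone sketch below is illustration only, not part of the patch; ZeroVector4 is a hypothetical helper that mirrors the updated Load4x8()/Load4x16() prologues, and it assumes an ARM/AArch64 target with NEON.

#include <arm_neon.h>

// INIT_VECTOR4 as defined in neon.h above.
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  v.val[0] = a;                           \
  v.val[1] = b;                           \
  v.val[2] = c;                           \
  v.val[3] = d;                           \
} while (0)

// Zero-fill a 4-way vector before lane loads, as the updated call sites do.
static uint8x8x4_t ZeroVector4(void) {
  const uint8x8_t zero = vdup_n_u8(0);   // splat 0 across one 8x8 vector
  uint8x8x4_t out;
  INIT_VECTOR4(out, zero, zero, zero, zero);
  return out;                            // ready for vld4_lane_u8() loads
}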

View File

@@ -19,6 +19,7 @@
 #include <assert.h>
 #include <arm_neon.h>
 #include <string.h>
+#include "./neon.h"
 #include "./yuv.h"
 
 #ifdef FANCY_UPSAMPLING
@@ -61,8 +62,9 @@
   d = vrhadd_u8(d, diag1);                \
                                           \
   {                                       \
-    const uint8x8x2_t a_b = {{ a, b }};   \
-    const uint8x8x2_t c_d = {{ c, d }};   \
+    uint8x8x2_t a_b, c_d;                 \
+    INIT_VECTOR2(a_b, a, b);              \
+    INIT_VECTOR2(c_d, c, d);              \
     vst2_u8(out, a_b);                    \
     vst2_u8(out + 32, c_d);               \
   }                                       \
@@ -92,22 +94,26 @@ static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
 
 #define v255 vmov_n_u8(255)
 
 #define STORE_Rgb(out, r, g, b) do {                  \
-  const uint8x8x3_t r_g_b = {{ r, g, b }};            \
+  uint8x8x3_t r_g_b;                                  \
+  INIT_VECTOR3(r_g_b, r, g, b);                       \
   vst3_u8(out, r_g_b);                                \
 } while (0)
 
 #define STORE_Bgr(out, r, g, b) do {                  \
-  const uint8x8x3_t b_g_r = {{ b, g, r }};            \
+  uint8x8x3_t b_g_r;                                  \
+  INIT_VECTOR3(b_g_r, b, g, r);                       \
   vst3_u8(out, b_g_r);                                \
 } while (0)
 
 #define STORE_Rgba(out, r, g, b) do {                 \
-  const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }}; \
+  uint8x8x4_t r_g_b_v255;                             \
+  INIT_VECTOR4(r_g_b_v255, r, g, b, v255);            \
   vst4_u8(out, r_g_b_v255);                           \
 } while (0)
 
 #define STORE_Bgra(out, r, g, b) do {                 \
-  const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }}; \
+  uint8x8x4_t b_g_r_v255;                             \
+  INIT_VECTOR4(b_g_r_v255, b, g, r, v255);            \
   vst4_u8(out, b_g_r_v255);                           \
 } while (0)