mirror of
https://github.com/webmproject/libwebp.git
synced 2025-01-15 17:18:23 +01:00
neon: add INIT_VECTOR4
used to initialize NxMx4 vector types replaces initialization via '{{ }}' gnu-ism. Change-Id: I0da7b3d321f3d48579b7863fb2e4d3f449ae7f5e
This commit is contained in:
parent
6c1c632b03
commit
bc03670f01
@ -49,7 +49,9 @@
|
|||||||
// (register alloc, probably). The variants somewhat mitigate the problem, but
|
// (register alloc, probably). The variants somewhat mitigate the problem, but
|
||||||
// not quite. HFilter16i() remains problematic.
|
// not quite. HFilter16i() remains problematic.
|
||||||
static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
|
static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
|
||||||
uint8x8x4_t out = {{{0}, {0}, {0}, {0}}};
|
const uint8x8_t zero = vdup_n_u8(0);
|
||||||
|
uint8x8x4_t out;
|
||||||
|
INIT_VECTOR4(out, zero, zero, zero, zero);
|
||||||
out = vld4_lane_u8(src + 0 * stride, out, 0);
|
out = vld4_lane_u8(src + 0 * stride, out, 0);
|
||||||
out = vld4_lane_u8(src + 1 * stride, out, 1);
|
out = vld4_lane_u8(src + 1 * stride, out, 1);
|
||||||
out = vld4_lane_u8(src + 2 * stride, out, 2);
|
out = vld4_lane_u8(src + 2 * stride, out, 2);
|
||||||
@ -84,7 +86,9 @@ static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
|
|||||||
static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
|
static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
|
||||||
uint8x16_t* const p1, uint8x16_t* const p0,
|
uint8x16_t* const p1, uint8x16_t* const p0,
|
||||||
uint8x16_t* const q0, uint8x16_t* const q1) {
|
uint8x16_t* const q0, uint8x16_t* const q1) {
|
||||||
uint32x4x4_t in = {{{0}, {0}, {0}, {0}}};
|
const uint32x4_t zero = vdupq_n_u32(0);
|
||||||
|
uint32x4x4_t in;
|
||||||
|
INIT_VECTOR4(in, zero, zero, zero, zero);
|
||||||
src -= 2;
|
src -= 2;
|
||||||
LOADQ_LANE_32b(in.val[0], 0);
|
LOADQ_LANE_32b(in.val[0], 0);
|
||||||
LOADQ_LANE_32b(in.val[1], 0);
|
LOADQ_LANE_32b(in.val[1], 0);
|
||||||
@ -273,10 +277,13 @@ static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
|
|||||||
static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
|
static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
|
||||||
const uint8x16_t q0, const uint8x16_t q1,
|
const uint8x16_t q0, const uint8x16_t q1,
|
||||||
uint8_t* const dst, int stride) {
|
uint8_t* const dst, int stride) {
|
||||||
const uint8x8x4_t lo = {{ vget_low_u8(p1), vget_low_u8(p0),
|
uint8x8x4_t lo, hi;
|
||||||
vget_low_u8(q0), vget_low_u8(q1) }};
|
INIT_VECTOR4(lo,
|
||||||
const uint8x8x4_t hi = {{ vget_high_u8(p1), vget_high_u8(p0),
|
vget_low_u8(p1), vget_low_u8(p0),
|
||||||
vget_high_u8(q0), vget_high_u8(q1) }};
|
vget_low_u8(q0), vget_low_u8(q1));
|
||||||
|
INIT_VECTOR4(hi,
|
||||||
|
vget_high_u8(p1), vget_high_u8(p0),
|
||||||
|
vget_high_u8(q0), vget_high_u8(q1));
|
||||||
Store4x8(lo, dst - 2 + 0 * stride, stride);
|
Store4x8(lo, dst - 2 + 0 * stride, stride);
|
||||||
Store4x8(hi, dst - 2 + 8 * stride, stride);
|
Store4x8(hi, dst - 2 + 8 * stride, stride);
|
||||||
}
|
}
|
||||||
@ -355,10 +362,13 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
|
|||||||
const uint8x16_t q0, const uint8x16_t q1,
|
const uint8x16_t q0, const uint8x16_t q1,
|
||||||
uint8_t* const u, uint8_t* const v,
|
uint8_t* const u, uint8_t* const v,
|
||||||
int stride) {
|
int stride) {
|
||||||
const uint8x8x4_t u0 = {{ vget_low_u8(p1), vget_low_u8(p0),
|
uint8x8x4_t u0, v0;
|
||||||
vget_low_u8(q0), vget_low_u8(q1) }};
|
INIT_VECTOR4(u0,
|
||||||
const uint8x8x4_t v0 = {{ vget_high_u8(p1), vget_high_u8(p0),
|
vget_low_u8(p1), vget_low_u8(p0),
|
||||||
vget_high_u8(q0), vget_high_u8(q1) }};
|
vget_low_u8(q0), vget_low_u8(q1));
|
||||||
|
INIT_VECTOR4(v0,
|
||||||
|
vget_high_u8(p1), vget_high_u8(p0),
|
||||||
|
vget_high_u8(q0), vget_high_u8(q1));
|
||||||
vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
|
vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
|
||||||
vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
|
vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
|
||||||
vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
|
vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
|
||||||
|
@ -479,9 +479,11 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
|
|||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
static void FTransformWHT(const int16_t* src, int16_t* out) {
|
static void FTransformWHT(const int16_t* src, int16_t* out) {
|
||||||
int32x4x4_t tmp0;
|
|
||||||
const int stride = 16;
|
const int stride = 16;
|
||||||
int16x4x4_t in = {{{0}, {0}, {0}, {0}}};
|
const int16x4_t zero = vdup_n_s16(0);
|
||||||
|
int32x4x4_t tmp0;
|
||||||
|
int16x4x4_t in;
|
||||||
|
INIT_VECTOR4(in, zero, zero, zero, zero);
|
||||||
LOAD_LANE_16b(in.val[0], 0);
|
LOAD_LANE_16b(in.val[0], 0);
|
||||||
LOAD_LANE_16b(in.val[1], 0);
|
LOAD_LANE_16b(in.val[1], 0);
|
||||||
LOAD_LANE_16b(in.val[2], 0);
|
LOAD_LANE_16b(in.val[2], 0);
|
||||||
@ -1010,11 +1012,12 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
|||||||
const VP8Matrix* const mtx) {
|
const VP8Matrix* const mtx) {
|
||||||
const int16x8_t out0 = Quantize(in, mtx, 0);
|
const int16x8_t out0 = Quantize(in, mtx, 0);
|
||||||
const int16x8_t out1 = Quantize(in, mtx, 8);
|
const int16x8_t out1 = Quantize(in, mtx, 8);
|
||||||
const uint8x8x4_t all_out = {{
|
uint8x8x4_t all_out;
|
||||||
vreinterpret_u8_s16(vget_low_s16(out0)),
|
INIT_VECTOR4(all_out,
|
||||||
vreinterpret_u8_s16(vget_high_s16(out0)),
|
vreinterpret_u8_s16(vget_low_s16(out0)),
|
||||||
vreinterpret_u8_s16(vget_low_s16(out1)),
|
vreinterpret_u8_s16(vget_high_s16(out0)),
|
||||||
vreinterpret_u8_s16(vget_high_s16(out1)) }};
|
vreinterpret_u8_s16(vget_low_s16(out1)),
|
||||||
|
vreinterpret_u8_s16(vget_high_s16(out1)));
|
||||||
// Zigzag reordering
|
// Zigzag reordering
|
||||||
vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0])));
|
vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0])));
|
||||||
vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1])));
|
vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1])));
|
||||||
|
@ -96,11 +96,12 @@ static void ConvertBGRAToBGR(const uint32_t* src,
|
|||||||
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
|
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
|
||||||
const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
|
const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
|
||||||
for (; src < end; src += 8) {
|
for (; src < end; src += 8) {
|
||||||
const uint8x8x4_t pixels = {{
|
uint8x8x4_t pixels;
|
||||||
vld1_u8((const uint8_t*)(src + 0)),
|
INIT_VECTOR4(pixels,
|
||||||
vld1_u8((const uint8_t*)(src + 2)),
|
vld1_u8((const uint8_t*)(src + 0)),
|
||||||
vld1_u8((const uint8_t*)(src + 4)),
|
vld1_u8((const uint8_t*)(src + 2)),
|
||||||
vld1_u8((const uint8_t*)(src + 6)) }};
|
vld1_u8((const uint8_t*)(src + 4)),
|
||||||
|
vld1_u8((const uint8_t*)(src + 6)));
|
||||||
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
|
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
|
||||||
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
|
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
|
||||||
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
|
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
|
||||||
@ -122,11 +123,12 @@ static void ConvertBGRAToRGB(const uint32_t* src,
|
|||||||
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
|
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
|
||||||
const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
|
const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
|
||||||
for (; src < end; src += 8) {
|
for (; src < end; src += 8) {
|
||||||
const uint8x8x4_t pixels = {{
|
uint8x8x4_t pixels;
|
||||||
vld1_u8((const uint8_t*)(src + 0)),
|
INIT_VECTOR4(pixels,
|
||||||
vld1_u8((const uint8_t*)(src + 2)),
|
vld1_u8((const uint8_t*)(src + 0)),
|
||||||
vld1_u8((const uint8_t*)(src + 4)),
|
vld1_u8((const uint8_t*)(src + 2)),
|
||||||
vld1_u8((const uint8_t*)(src + 6)) }};
|
vld1_u8((const uint8_t*)(src + 4)),
|
||||||
|
vld1_u8((const uint8_t*)(src + 6)));
|
||||||
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
|
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
|
||||||
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
|
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
|
||||||
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
|
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
|
||||||
|
@ -33,6 +33,13 @@
|
|||||||
v.val[2] = c; \
|
v.val[2] = c; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#define INIT_VECTOR4(v, a, b, c, d) do { \
|
||||||
|
v.val[0] = a; \
|
||||||
|
v.val[1] = b; \
|
||||||
|
v.val[2] = c; \
|
||||||
|
v.val[3] = d; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
|
// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
|
||||||
// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
|
// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
|
||||||
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
|
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
|
||||||
|
@ -106,12 +106,14 @@ static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
|
|||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define STORE_Rgba(out, r, g, b) do { \
|
#define STORE_Rgba(out, r, g, b) do { \
|
||||||
const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }}; \
|
uint8x8x4_t r_g_b_v255; \
|
||||||
|
INIT_VECTOR4(r_g_b_v255, r, g, b, v255); \
|
||||||
vst4_u8(out, r_g_b_v255); \
|
vst4_u8(out, r_g_b_v255); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define STORE_Bgra(out, r, g, b) do { \
|
#define STORE_Bgra(out, r, g, b) do { \
|
||||||
const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }}; \
|
uint8x8x4_t b_g_r_v255; \
|
||||||
|
INIT_VECTOR4(b_g_r_v255, b, g, r, v255); \
|
||||||
vst4_u8(out, b_g_r_v255); \
|
vst4_u8(out, b_g_r_v255); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user