wasm: Add VE4 and RD4

BUG=webp:352

Change-Id: I24286685d7c002ec55534a98bcb88ecd82562f79
This commit is contained in:
Scott LaVarnway 2017-06-30 05:12:35 -07:00
parent 440945ca57
commit b4cefba731

View File

@ -11,8 +11,9 @@
// //
// Based on dec_sse2.c // Based on dec_sse2.c
#include "./dsp.h"
#include "../dec/vp8i_dec.h" #include "../dec/vp8i_dec.h"
#include "../utils/utils.h"
#include "./dsp.h"
#if defined(WEBP_USE_WASM) #if defined(WEBP_USE_WASM)
@ -20,8 +21,8 @@ typedef int32_t int32x4 __attribute__((__vector_size__(16)));
typedef uint32_t uint32x4 __attribute__((__vector_size__(16))); typedef uint32_t uint32x4 __attribute__((__vector_size__(16)));
typedef int16_t int16x8 __attribute__((__vector_size__(16))); typedef int16_t int16x8 __attribute__((__vector_size__(16)));
typedef uint16_t uint16x8 __attribute__((__vector_size__(16))); typedef uint16_t uint16x8 __attribute__((__vector_size__(16)));
typedef int8_t int8x16 __attribute__((__vector_size__(16))); typedef int8_t int8x16 __attribute__((__vector_size__(16)));
typedef uint8_t uint8x16 __attribute__((__vector_size__(16))); typedef uint8_t uint8x16 __attribute__((__vector_size__(16)));
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// //
@ -41,11 +42,80 @@ static WEBP_INLINE uint8x16 get_8_bytes(uint8_t* dst) {
static WEBP_INLINE uint8x16 splat_uint8(uint32_t val) { static WEBP_INLINE uint8x16 splat_uint8(uint32_t val) {
uint8x16 a; uint8x16 a;
a[0] = val; a[0] = val;
a = (uint8x16)__builtin_shufflevector( a = (uint8x16)__builtin_shufflevector(a, a, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
a, a, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); 0, 0, 0, 0, 0);
return a; return a;
} }
static WEBP_INLINE uint32x4 cvt32_to_128(uint32_t x) {
uint32x4 value = (uint32x4){0};
value[0] = x;
return value;
}
//------------------------------------------------------------------------------
// 4x4 predictions
static void VE4(uint8_t* dst) { // vertical
const uint8x16 zero = (uint8x16){0};
const uint16x8 two = (uint16x8){2, 2, 2, 2, 2, 2, 2, 2};
const uint8x16 top = get_8_bytes(dst - BPS - 1);
const uint16x8 ABCDEFGH = (uint16x8)__builtin_shufflevector(
top, zero, 0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16);
const uint16x8 BCDEFGHX = (uint16x8)__builtin_shufflevector(
top, zero, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16, 16, 16);
const uint16x8 CDEFGHXX = (uint16x8)__builtin_shufflevector(
top, zero, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16, 16, 16, 16, 16);
const uint16x8 avg3 =
(ABCDEFGH + BCDEFGHX + BCDEFGHX + CDEFGHXX + two) >> two;
const uint32x4 vals = (uint32x4)__builtin_shufflevector(
(uint8x16)avg3, zero, 0, 2, 4, 6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16);
int i;
for (i = 0; i < 4; ++i) {
WebPUint32ToMem(dst + i * BPS, vals[0]);
}
}
static void RD4(uint8_t* dst) { // Down-right
const uint8x16 zero = (uint8x16){0};
const uint16x8 two = (uint16x8){2, 2, 2, 2, 2, 2, 2, 2};
const uint8x16 top = get_8_bytes(dst - BPS - 1);
const uint32_t I = dst[-1 + 0 * BPS];
const uint32_t J = dst[-1 + 1 * BPS];
const uint32_t K = dst[-1 + 2 * BPS];
const uint32_t L = dst[-1 + 3 * BPS];
const uint8x16 LKJI_____ =
(uint8x16)cvt32_to_128(L | (K << 8) | (J << 16) | (I << 24));
const uint8x16 lkjixabcd = (uint8x16)__builtin_shufflevector(
(uint8x16)LKJI_____, top, 0, 1, 2, 3, 16, 17, 18, 19, 20, 31, 31, 31, 31,
31, 31, 31);
const uint16x8 LKJIXABC = (uint16x8)__builtin_shufflevector(
lkjixabcd, zero, 0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16);
const uint16x8 KJIXABCD_ = (uint16x8)__builtin_shufflevector(
lkjixabcd, zero, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16, 8, 16);
const uint16x8 JIXABCD__ = (uint16x8)__builtin_shufflevector(
lkjixabcd, zero, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16, 8, 16, 9, 16);
const uint16x8 avg3 =
(LKJIXABC + KJIXABCD_ + KJIXABCD_ + JIXABCD__ + two) >> two;
const uint32x4 vals0 = (uint32x4)__builtin_shufflevector(
(uint8x16)avg3, zero, 6, 8, 10, 12, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16);
const uint32x4 vals1 = (uint32x4)__builtin_shufflevector(
(uint8x16)avg3, zero, 4, 6, 8, 10, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16);
const uint32x4 vals2 = (uint32x4)__builtin_shufflevector(
(uint8x16)avg3, zero, 2, 4, 6, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16);
const uint32x4 vals3 = (uint32x4)__builtin_shufflevector(
(uint8x16)avg3, zero, 0, 2, 4, 6, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16);
WebPUint32ToMem(dst + 0 * BPS, vals0[0]);
WebPUint32ToMem(dst + 1 * BPS, vals1[0]);
WebPUint32ToMem(dst + 2 * BPS, vals2[0]);
WebPUint32ToMem(dst + 3 * BPS, vals3[0]);
}
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Luma 16x16 // Luma 16x16
@ -66,7 +136,7 @@ static void VE16(uint8_t* dst) {
} }
} }
static void HE16(uint8_t* dst) { // horizontal static void HE16(uint8_t* dst) { // horizontal
int j; int j;
for (j = 16; j > 0; --j) { for (j = 16; j > 0; --j) {
const uint8x16 values = splat_uint8(dst[-1]); const uint8x16 values = splat_uint8(dst[-1]);
@ -83,19 +153,19 @@ static WEBP_INLINE uint32_t add_horizontal_16(uint8_t* dst) {
const uint16x8 _a_hbw = (uint16x8)__builtin_shufflevector( const uint16x8 _a_hbw = (uint16x8)__builtin_shufflevector(
a, zero, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15, 16); a, zero, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15, 16);
const uint16x8 sum_a = _a_lbw + _a_hbw; const uint16x8 sum_a = _a_lbw + _a_hbw;
const uint16x8 sum_b = (uint16x8)__builtin_shufflevector( const uint16x8 sum_b =
sum_a, sum_a, 4, 5, 6, 7, 4, 5, 6, 7); (uint16x8)__builtin_shufflevector(sum_a, sum_a, 4, 5, 6, 7, 4, 5, 6, 7);
const uint16x8 sum_c = sum_a + sum_b; const uint16x8 sum_c = sum_a + sum_b;
const uint16x8 sum_d = (uint16x8)__builtin_shufflevector( const uint16x8 sum_d =
sum_c, sum_c, 2, 3, 2, 3, 2, 3, 2, 3); (uint16x8)__builtin_shufflevector(sum_c, sum_c, 2, 3, 2, 3, 2, 3, 2, 3);
const uint16x8 sum_e = sum_c + sum_d; const uint16x8 sum_e = sum_c + sum_d;
const uint16x8 sum_f = (uint16x8)__builtin_shufflevector( const uint16x8 sum_f =
sum_e, sum_e, 1, 1, 1, 1, 1, 1, 1, 1); (uint16x8)__builtin_shufflevector(sum_e, sum_e, 1, 1, 1, 1, 1, 1, 1, 1);
const uint16x8 sum_g = sum_e + sum_f; const uint16x8 sum_g = sum_e + sum_f;
return sum_g[0] & 0xffff; return sum_g[0] & 0xffff;
} }
static void DC16(uint8_t* dst) { // DC static void DC16(uint8_t* dst) { // DC
const uint32_t sum = add_horizontal_16(dst - BPS); const uint32_t sum = add_horizontal_16(dst - BPS);
int left = 0; int left = 0;
int j; int j;
@ -108,7 +178,7 @@ static void DC16(uint8_t* dst) { // DC
} }
} }
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
int DC = 8; int DC = 8;
int j; int j;
for (j = 0; j < 16; ++j) { for (j = 0; j < 16; ++j) {
@ -137,16 +207,16 @@ static WEBP_INLINE uint32_t add_horizontal_8(uint8_t* dst) {
const uint16x8 _a_hbw = (uint16x8)__builtin_shufflevector( const uint16x8 _a_hbw = (uint16x8)__builtin_shufflevector(
a, zero, 4, 16, 5, 16, 6, 16, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16); a, zero, 4, 16, 5, 16, 6, 16, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16);
const uint16x8 sum_a = _a_lbw + _a_hbw; const uint16x8 sum_a = _a_lbw + _a_hbw;
const uint16x8 sum_b = (uint16x8)__builtin_shufflevector( const uint16x8 sum_b =
sum_a, sum_a, 2, 3, 2, 3, 2, 3, 2, 3); (uint16x8)__builtin_shufflevector(sum_a, sum_a, 2, 3, 2, 3, 2, 3, 2, 3);
const uint16x8 sum_c = sum_a + sum_b; const uint16x8 sum_c = sum_a + sum_b;
const uint16x8 sum_d = (uint16x8)__builtin_shufflevector( const uint16x8 sum_d =
sum_c, sum_c, 1, 1, 1, 1, 1, 1, 1, 1); (uint16x8)__builtin_shufflevector(sum_c, sum_c, 1, 1, 1, 1, 1, 1, 1, 1);
const uint16x8 sum_e = sum_c + sum_d; const uint16x8 sum_e = sum_c + sum_d;
return sum_e[0] & 0xffff; return sum_e[0] & 0xffff;
} }
static void VE8uv(uint8_t* dst) { // vertical static void VE8uv(uint8_t* dst) { // vertical
const uint8x16 top = get_8_bytes(dst - BPS); const uint8x16 top = get_8_bytes(dst - BPS);
int j; int j;
for (j = 0; j < 8; ++j) { for (j = 0; j < 8; ++j) {
@ -154,7 +224,7 @@ static void VE8uv(uint8_t* dst) { // vertical
} }
} }
static void HE8uv(uint8_t* dst) { // horizontal static void HE8uv(uint8_t* dst) { // horizontal
int j; int j;
for (j = 8; j > 0; --j) { for (j = 8; j > 0; --j) {
const uint8x16 values = splat_uint8(dst[-1]); const uint8x16 values = splat_uint8(dst[-1]);
@ -172,7 +242,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
} }
} }
static void DC8uv(uint8_t* dst) { // DC static void DC8uv(uint8_t* dst) { // DC
int left = 0; int left = 0;
int j; int j;
const uint32_t sum = add_horizontal_8(dst - BPS); const uint32_t sum = add_horizontal_8(dst - BPS);
@ -185,7 +255,7 @@ static void DC8uv(uint8_t* dst) { // DC
} }
} }
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
const uint32_t DC = 4 + add_horizontal_8(dst - BPS); const uint32_t DC = 4 + add_horizontal_8(dst - BPS);
Put8x8uv(DC >> 3, dst); Put8x8uv(DC >> 3, dst);
} }
@ -199,7 +269,7 @@ static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
Put8x8uv(dc0 >> 3, dst); Put8x8uv(dc0 >> 3, dst);
} }
static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
Put8x8uv(0x80, dst); Put8x8uv(0x80, dst);
} }
@ -209,6 +279,9 @@ static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
extern void VP8DspInitWASM(void); extern void VP8DspInitWASM(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitWASM(void) { WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitWASM(void) {
VP8PredLuma4[2] = VE4;
VP8PredLuma4[4] = RD4;
VP8PredLuma16[0] = DC16; VP8PredLuma16[0] = DC16;
VP8PredLuma16[2] = VE16; VP8PredLuma16[2] = VE16;
VP8PredLuma16[3] = HE16; VP8PredLuma16[3] = HE16;