From 3259571e7d1cf220ffe1bcd034190cb6531de58b Mon Sep 17 00:00:00 2001 From: Urvang Joshi Date: Wed, 18 May 2016 14:27:15 -0700 Subject: [PATCH 1/4] Refactor GetColorPalette method. This was defined (slightly differently) at two places. Created a common method and moved to utils/utils.[hc]. Change-Id: I66c3ac6dea24e0cd2c0eaa5440f3142b4dbbe23b --- src/enc/vp8l.c | 50 ++------------------------------ src/mux/anim_encode.c | 60 +++------------------------------------ src/utils/utils.c | 66 +++++++++++++++++++++++++++++++++++++++++++ src/utils/utils.h | 13 +++++++++ 4 files changed, 85 insertions(+), 104 deletions(-) diff --git a/src/enc/vp8l.c b/src/enc/vp8l.c index 72f50c40..d98a360b 100644 --- a/src/enc/vp8l.c +++ b/src/enc/vp8l.c @@ -126,54 +126,8 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic, int low_effort, uint32_t palette[MAX_PALETTE_SIZE], int* const palette_size) { - int i, x, y, key; - int num_colors = 0; - uint8_t in_use[MAX_PALETTE_SIZE * 4] = { 0 }; - uint32_t colors[MAX_PALETTE_SIZE * 4]; - static const uint32_t kHashMul = 0x1e35a7bd; - const uint32_t* argb = pic->argb; - const int width = pic->width; - const int height = pic->height; - uint32_t last_pix = ~argb[0]; // so we're sure that last_pix != argb[0] - - for (y = 0; y < height; ++y) { - for (x = 0; x < width; ++x) { - if (argb[x] == last_pix) { - continue; - } - last_pix = argb[x]; - key = (kHashMul * last_pix) >> PALETTE_KEY_RIGHT_SHIFT; - while (1) { - if (!in_use[key]) { - colors[key] = last_pix; - in_use[key] = 1; - ++num_colors; - if (num_colors > MAX_PALETTE_SIZE) { - return 0; - } - break; - } else if (colors[key] == last_pix) { - // The color is already there. - break; - } else { - // Some other color sits there. - // Do linear conflict resolution. - ++key; - key &= (MAX_PALETTE_SIZE * 4 - 1); // key mask for 1K buffer. - } - } - } - argb += pic->argb_stride; - } - - // TODO(skal): could we reuse in_use[] to speed up EncodePalette()? - num_colors = 0; - for (i = 0; i < (int)(sizeof(in_use) / sizeof(in_use[0])); ++i) { - if (in_use[i]) { - palette[num_colors] = colors[i]; - ++num_colors; - } - } + const int num_colors = WebPGetColorPalette(pic, palette); + if (num_colors > MAX_PALETTE_SIZE) return 0; *palette_size = num_colors; qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort); if (!low_effort && PaletteHasNonMonotonousDeltas(palette, num_colors)) { diff --git a/src/mux/anim_encode.c b/src/mux/anim_encode.c index adb96955..53e2906a 100644 --- a/src/mux/anim_encode.c +++ b/src/mux/anim_encode.c @@ -646,61 +646,6 @@ static int IsLossyBlendingPossible(const WebPPicture* const src, return 1; } -#define MIN_COLORS_LOSSY 31 // Don't try lossy below this threshold. -#define MAX_COLORS_LOSSLESS 194 // Don't try lossless above this threshold. -#define MAX_COLOR_COUNT 256 // Power of 2 greater than MAX_COLORS_LOSSLESS. -#define HASH_SIZE (MAX_COLOR_COUNT * 4) -#define HASH_RIGHT_SHIFT 22 // 32 - log2(HASH_SIZE). - -// TODO(urvang): Also used in enc/vp8l.c. Move to utils. -// If the number of colors in the 'pic' is at least MAX_COLOR_COUNT, return -// MAX_COLOR_COUNT. Otherwise, return the exact number of colors in the 'pic'. -static int GetColorCount(const WebPPicture* const pic) { - int x, y; - int num_colors = 0; - uint8_t in_use[HASH_SIZE] = { 0 }; - uint32_t colors[HASH_SIZE]; - static const uint32_t kHashMul = 0x1e35a7bd; - const uint32_t* argb = pic->argb; - const int width = pic->width; - const int height = pic->height; - uint32_t last_pix = ~argb[0]; // so we're sure that last_pix != argb[0] - - for (y = 0; y < height; ++y) { - for (x = 0; x < width; ++x) { - int key; - if (argb[x] == last_pix) { - continue; - } - last_pix = argb[x]; - key = (kHashMul * last_pix) >> HASH_RIGHT_SHIFT; - while (1) { - if (!in_use[key]) { - colors[key] = last_pix; - in_use[key] = 1; - ++num_colors; - if (num_colors >= MAX_COLOR_COUNT) { - return MAX_COLOR_COUNT; // Exact count not needed. - } - break; - } else if (colors[key] == last_pix) { - break; // The color is already there. - } else { - // Some other color sits here, so do linear conflict resolution. - ++key; - key &= (HASH_SIZE - 1); // Key mask. - } - } - } - argb += pic->argb_stride; - } - return num_colors; -} - -#undef MAX_COLOR_COUNT -#undef HASH_SIZE -#undef HASH_RIGHT_SHIFT - // For pixels in 'rect', replace those pixels in 'dst' that are same as 'src' by // transparent pixels. // Returns true if at least one pixel gets modified. @@ -864,6 +809,9 @@ enum { CANDIDATE_COUNT }; +#define MIN_COLORS_LOSSY 31 // Don't try lossy below this threshold. +#define MAX_COLORS_LOSSLESS 194 // Don't try lossless above this threshold. + // Generates candidates for a given dispose method given pre-filled sub-frame // 'params'. static WebPEncodingError GenerateCandidates( @@ -898,7 +846,7 @@ static WebPEncodingError GenerateCandidates( candidate_ll->evaluate_ = is_lossless; candidate_lossy->evaluate_ = !is_lossless; } else { // Use a heuristic for trying lossless and/or lossy compression. - const int num_colors = GetColorCount(¶ms->sub_frame_ll_); + const int num_colors = WebPGetColorPalette(¶ms->sub_frame_ll_, NULL); candidate_ll->evaluate_ = (num_colors < MAX_COLORS_LOSSLESS); candidate_lossy->evaluate_ = (num_colors >= MIN_COLORS_LOSSY); } diff --git a/src/utils/utils.c b/src/utils/utils.c index d8e30930..2602ca3c 100644 --- a/src/utils/utils.c +++ b/src/utils/utils.c @@ -15,6 +15,7 @@ #include // for memcpy() #include "../webp/decode.h" #include "../webp/encode.h" +#include "../webp/format_constants.h" // for MAX_PALETTE_SIZE #include "./utils.h" // If PRINT_MEM_INFO is defined, extra info (like total memory used, number of @@ -237,3 +238,68 @@ void WebPCopyPixels(const WebPPicture* const src, WebPPicture* const dst) { } //------------------------------------------------------------------------------ + +#define MAX_COLOR_COUNT MAX_PALETTE_SIZE +#define COLOR_HASH_SIZE (MAX_COLOR_COUNT * 4) +#define COLOR_HASH_RIGHT_SHIFT 22 // 32 - log2(COLOR_HASH_SIZE). + +int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) { + int i; + int x, y; + int num_colors = 0; + uint8_t in_use[COLOR_HASH_SIZE] = { 0 }; + uint32_t colors[COLOR_HASH_SIZE]; + static const uint32_t kHashMul = 0x1e35a7bdU; + const uint32_t* argb = pic->argb; + const int width = pic->width; + const int height = pic->height; + uint32_t last_pix = ~argb[0]; // so we're sure that last_pix != argb[0] + assert(pic != NULL); + assert(pic->use_argb); + + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + int key; + if (argb[x] == last_pix) { + continue; + } + last_pix = argb[x]; + key = (kHashMul * last_pix) >> COLOR_HASH_RIGHT_SHIFT; + while (1) { + if (!in_use[key]) { + colors[key] = last_pix; + in_use[key] = 1; + ++num_colors; + if (num_colors > MAX_COLOR_COUNT) { + return MAX_COLOR_COUNT + 1; // Exact count not needed. + } + break; + } else if (colors[key] == last_pix) { + break; // The color is already there. + } else { + // Some other color sits here, so do linear conflict resolution. + ++key; + key &= (COLOR_HASH_SIZE - 1); // Key mask. + } + } + } + argb += pic->argb_stride; + } + + if (palette != NULL) { // Fill the colors into palette. + num_colors = 0; + for (i = 0; i < COLOR_HASH_SIZE; ++i) { + if (in_use[i]) { + palette[num_colors] = colors[i]; + ++num_colors; + } + } + } + return num_colors; +} + +#undef MAX_COLOR_COUNT +#undef COLOR_HASH_SIZE +#undef COLOR_HASH_RIGHT_SHIFT + +//------------------------------------------------------------------------------ diff --git a/src/utils/utils.h b/src/utils/utils.h index c6c10149..e0a81126 100644 --- a/src/utils/utils.h +++ b/src/utils/utils.h @@ -160,6 +160,19 @@ WEBP_EXTERN(void) WebPCopyPlane(const uint8_t* src, int src_stride, WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src, struct WebPPicture* const dst); +//------------------------------------------------------------------------------ +// Unique colors. + +// Returns count of unique colors in 'pic', assuming pic->use_argb is true. +// If the unique color count is more than MAX_COLOR_COUNT, returns +// MAX_COLOR_COUNT+1. +// If 'palette' is not NULL and number of unique colors is less than or equal to +// MAX_COLOR_COUNT, also outputs the actual unique colors into 'palette'. +// Note: 'palette' is assumed to be an array already allocated with at least +// MAX_COLOR_COUNT elements. +WEBP_EXTERN(int) WebPGetColorPalette(const struct WebPPicture* const pic, + uint32_t* const palette); + //------------------------------------------------------------------------------ #ifdef __cplusplus From 3dc28d7643dba8abc1fc4b056179d7bf6229bae9 Mon Sep 17 00:00:00 2001 From: James Zern Date: Wed, 15 Jun 2016 23:53:48 -0700 Subject: [PATCH 2/4] configure: test for -Wfloat-conversion Change-Id: Ib7d0372dc43994d66228f7dd3ce4dc3b80a5482d --- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index d5658228..36f79b8d 100644 --- a/configure.ac +++ b/configure.ac @@ -68,6 +68,7 @@ TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-fvisibility=hidden]) TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall]) TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement]) TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra]) +TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wfloat-conversion]) TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat -Wformat-nonliteral]) TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat -Wformat-security]) TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-declarations]) From 5e60c42a76174ef1c8ffef5e42548a59174b3c10 Mon Sep 17 00:00:00 2001 From: Parag Salasakar Date: Thu, 9 Jun 2016 19:17:09 +0530 Subject: [PATCH 3/4] Added MSA optimized transform functions 1. TransformWHT 2. TransformTwo 3. TransformDC 4. TransformAC3 Change-Id: Ia3624cb4aed215bcaffce542b28794e643207039 --- Android.mk | 1 + Makefile.vc | 1 + build.gradle | 9 + makefile.unix | 2 + src/dsp/Makefile.am | 9 + src/dsp/dec.c | 6 + src/dsp/dec_msa.c | 172 ++++++++++++++ src/dsp/msa_macro.h | 555 ++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 755 insertions(+) create mode 100644 src/dsp/dec_msa.c create mode 100644 src/dsp/msa_macro.h diff --git a/Android.mk b/Android.mk index 6d60f36a..ef35795b 100644 --- a/Android.mk +++ b/Android.mk @@ -49,6 +49,7 @@ dsp_dec_srcs := \ src/dsp/dec_clip_tables.c \ src/dsp/dec_mips32.c \ src/dsp/dec_mips_dsp_r2.c \ + src/dsp/dec_msa.c \ src/dsp/dec_neon.$(NEON) \ src/dsp/dec_sse2.c \ src/dsp/dec_sse41.c \ diff --git a/Makefile.vc b/Makefile.vc index 06461a1f..5f145e24 100644 --- a/Makefile.vc +++ b/Makefile.vc @@ -194,6 +194,7 @@ DSP_DEC_OBJS = \ $(DIROBJ)\dsp\dec_clip_tables.obj \ $(DIROBJ)\dsp\dec_mips32.obj \ $(DIROBJ)\dsp\dec_mips_dsp_r2.obj \ + $(DIROBJ)\dsp\dec_msa.obj \ $(DIROBJ)\dsp\dec_neon.obj \ $(DIROBJ)\dsp\dec_sse2.obj \ $(DIROBJ)\dsp\dec_sse41.obj \ diff --git a/build.gradle b/build.gradle index a498b2aa..b81c6a82 100644 --- a/build.gradle +++ b/build.gradle @@ -38,12 +38,20 @@ model { architecture "x86_64" } mips32r2 + mips32r5 + mips64r6 } toolChains { gcc(Gcc) { target("mips32r2") { cCompiler.args "-mips32r2" } + target("mips32r5") { + cCompiler.args "-mips32r5" + } + target("mips64r6") { + cCompiler.args "-mips64r6" + } } } binaries { @@ -111,6 +119,7 @@ model { include "dec_clip_tables.c" include "dec_mips32.c" include "dec_mips_dsp_r2.c" + include "dec_msa.c" include "dec_neon.$NEON" include "dec_sse2.c" include "dec_sse41.c" diff --git a/makefile.unix b/makefile.unix index fc278731..fe4571e2 100644 --- a/makefile.unix +++ b/makefile.unix @@ -138,6 +138,7 @@ DSP_DEC_OBJS = \ src/dsp/dec_clip_tables.o \ src/dsp/dec_mips32.o \ src/dsp/dec_mips_dsp_r2.o \ + src/dsp/dec_msa.o \ src/dsp/dec_neon.o \ src/dsp/dec_sse2.o \ src/dsp/dec_sse41.o \ @@ -273,6 +274,7 @@ HDRS = \ src/dsp/dsp.h \ src/dsp/lossless.h \ src/dsp/mips_macro.h \ + src/dsp/msa_macro.h \ src/dsp/neon.h \ src/dsp/yuv.h \ src/enc/backward_references.h \ diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am index c1a9d827..2ef5c380 100644 --- a/src/dsp/Makefile.am +++ b/src/dsp/Makefile.am @@ -2,6 +2,7 @@ noinst_LTLIBRARIES = libwebpdsp.la libwebpdsp_avx2.la noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la +noinst_LTLIBRARIES += libwebpdspdecode_msa.la if BUILD_LIBWEBPDECODER noinst_LTLIBRARIES += libwebpdspdecode.la @@ -80,6 +81,12 @@ libwebpdspdecode_neon_la_SOURCES += upsampling_neon.c libwebpdspdecode_neon_la_CPPFLAGS = $(libwebpdsp_neon_la_CPPFLAGS) libwebpdspdecode_neon_la_CFLAGS = $(libwebpdsp_neon_la_CFLAGS) +libwebpdspdecode_msa_la_SOURCES = +libwebpdspdecode_msa_la_SOURCES += dec_msa.c +libwebpdspdecode_msa_la_SOURCES += msa_macro.h +libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS) +libwebpdspdecode_msa_la_CFLAGS = $(AM_CFLAGS) + libwebpdsp_sse2_la_SOURCES = libwebpdsp_sse2_la_SOURCES += argb_sse2.c libwebpdsp_sse2_la_SOURCES += cost_sse2.c @@ -117,6 +124,7 @@ libwebpdsp_la_LIBADD = libwebpdsp_la_LIBADD += libwebpdsp_avx2.la libwebpdsp_sse2.la libwebpdsp_la_LIBADD += libwebpdsp_sse41.la libwebpdsp_la_LIBADD += libwebpdsp_neon.la +libwebpdsp_la_LIBADD += libwebpdspdecode_msa.la if BUILD_LIBWEBPDECODER libwebpdspdecode_la_SOURCES = $(COMMON_SOURCES) @@ -127,4 +135,5 @@ if BUILD_LIBWEBPDECODER libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse2.la libwebpdspdecode_la_LIBADD += libwebpdspdecode_sse41.la libwebpdspdecode_la_LIBADD += libwebpdspdecode_neon.la + libwebpdspdecode_la_LIBADD += libwebpdspdecode_msa.la endif diff --git a/src/dsp/dec.c b/src/dsp/dec.c index 17647f0a..e92d6933 100644 --- a/src/dsp/dec.c +++ b/src/dsp/dec.c @@ -699,6 +699,7 @@ extern void VP8DspInitSSE41(void); extern void VP8DspInitNEON(void); extern void VP8DspInitMIPS32(void); extern void VP8DspInitMIPSdspR2(void); +extern void VP8DspInitMSA(void); static volatile VP8CPUInfo dec_last_cpuinfo_used = (VP8CPUInfo)&dec_last_cpuinfo_used; @@ -783,6 +784,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { if (VP8GetCPUInfo(kMIPSdspR2)) { VP8DspInitMIPSdspR2(); } +#endif +#if defined(WEBP_USE_MSA) + if (VP8GetCPUInfo(kMSA)) { + VP8DspInitMSA(); + } #endif } dec_last_cpuinfo_used = VP8GetCPUInfo; diff --git a/src/dsp/dec_msa.c b/src/dsp/dec_msa.c new file mode 100644 index 00000000..f76055ca --- /dev/null +++ b/src/dsp/dec_msa.c @@ -0,0 +1,172 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// MSA version of dsp functions +// +// Author(s): Prashant Patil (prashant.patil@imgtec.com) + + +#include "./dsp.h" + +#if defined(WEBP_USE_MSA) + +#include "./msa_macro.h" + +//------------------------------------------------------------------------------ +// Transforms + +#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 a1_m, b1_m, c1_m, d1_m; \ + v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \ + const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \ + \ + a1_m = in0 + in2; \ + b1_m = in0 - in2; \ + c_tmp1_m = (in1 * sinpi8sqrt2) >> 16; \ + c_tmp2_m = in3 + ((in3 * cospi8sqrt2minus1) >> 16); \ + c1_m = c_tmp1_m - c_tmp2_m; \ + d_tmp1_m = in1 + ((in1 * cospi8sqrt2minus1) >> 16); \ + d_tmp2_m = (in3 * sinpi8sqrt2) >> 16; \ + d1_m = d_tmp1_m + d_tmp2_m; \ + BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ +} +#define MULT1(a) ((((a) * 20091) >> 16) + (a)) +#define MULT2(a) (((a) * 35468) >> 16) + +static void TransformOne(const int16_t* in, uint8_t* dst) { + v8i16 input0, input1; + v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; + v4i32 res0, res1, res2, res3; + const v16i8 zero = { 0 }; + v16i8 dest0, dest1, dest2, dest3; + + LD_SH2(in, 8, input0, input1); + UNPCK_SH_SW(input0, in0, in1); + UNPCK_SH_SW(input1, in2, in3); + IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3); + SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); + TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + LD_SB4(dst, BPS, dest0, dest1, dest2, dest3); + ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, + res0, res1, res2, res3); + ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); + CLIP_SW4_0_255(res0, res1, res2, res3); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); +} + +static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { + TransformOne(in, dst); + if (do_two) { + TransformOne(in + 16, dst + 4); + } +} + +static void TransformWHT(const int16_t* in, int16_t* out) { + v8i16 input0, input1; + const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; + const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; + const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1; + + LD_SH2(in, 8, input0, input1); + input1 = SLDI_SH(input1, input1, 8); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + out0 = tmp2 + tmp3; + out1 = tmp2 - tmp3; + VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + tmp0 = tmp2 + tmp3; + tmp1 = tmp2 - tmp3; + ADDVI_H2_SH(tmp0, 3, tmp1, 3, out0, out1); + SRAI_H2_SH(out0, out1, 3); + out[0] = __msa_copy_s_h(out0, 0); + out[16] = __msa_copy_s_h(out0, 4); + out[32] = __msa_copy_s_h(out1, 0); + out[48] = __msa_copy_s_h(out1, 4); + out[64] = __msa_copy_s_h(out0, 1); + out[80] = __msa_copy_s_h(out0, 5); + out[96] = __msa_copy_s_h(out1, 1); + out[112] = __msa_copy_s_h(out1, 5); + out[128] = __msa_copy_s_h(out0, 2); + out[144] = __msa_copy_s_h(out0, 6); + out[160] = __msa_copy_s_h(out1, 2); + out[176] = __msa_copy_s_h(out1, 6); + out[192] = __msa_copy_s_h(out0, 3); + out[208] = __msa_copy_s_h(out0, 7); + out[224] = __msa_copy_s_h(out1, 3); + out[240] = __msa_copy_s_h(out1, 7); +} + +static void TransformDC(const int16_t* in, uint8_t* dst) { + const int DC = (in[0] + 4) >> 3; + const v8i16 tmp0 = __msa_fill_h(DC); + ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS); +} + +static void TransformAC3(const int16_t* in, uint8_t* dst) { + const int a = in[0] + 4; + const int c4 = MULT2(in[4]); + const int d4 = MULT1(in[4]); + const int in2 = MULT2(in[1]); + const int in3 = MULT1(in[1]); + v4i32 tmp0 = { 0 }; + v4i32 out0 = __msa_fill_w(a + d4); + v4i32 out1 = __msa_fill_w(a + c4); + v4i32 out2 = __msa_fill_w(a - c4); + v4i32 out3 = __msa_fill_w(a - d4); + v4i32 res0, res1, res2, res3; + const v4i32 zero = { 0 }; + v16u8 dest0, dest1, dest2, dest3; + + INSERT_W4_SW(in3, in2, -in2, -in3, tmp0); + ADD4(out0, tmp0, out1, tmp0, out2, tmp0, out3, tmp0, + out0, out1, out2, out3); + SRAI_W4_SW(out0, out1, out2, out3, 3); + LD_UB4(dst, BPS, dest0, dest1, dest2, dest3); + ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, + res0, res1, res2, res3); + ADD4(res0, out0, res1, out1, res2, out2, res3, out3, res0, res1, res2, res3); + CLIP_SW4_0_255(res0, res1, res2, res3); + PCKEV_B2_SW(res0, res1, res2, res3, out0, out1); + res0 = (v4i32)__msa_pckev_b((v16i8)out0, (v16i8)out1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); +} + +//------------------------------------------------------------------------------ +// Entry point + +extern void VP8DspInitMSA(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) { + VP8TransformWHT = TransformWHT; + VP8Transform = TransformTwo; + VP8TransformDC = TransformDC; + VP8TransformAC3 = TransformAC3; +} + +#else // !WEBP_USE_MSA + +WEBP_DSP_INIT_STUB(VP8DspInitMSA) + +#endif // WEBP_USE_MSA diff --git a/src/dsp/msa_macro.h b/src/dsp/msa_macro.h new file mode 100644 index 00000000..5c707f47 --- /dev/null +++ b/src/dsp/msa_macro.h @@ -0,0 +1,555 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// MSA common macros +// +// Author(s): Prashant Patil (prashant.patil@imgtec.com) + +#ifndef WEBP_DSP_MSA_MACRO_H_ +#define WEBP_DSP_MSA_MACRO_H_ + +#include +#include + +#if defined(__clang__) + #define CLANG_BUILD +#endif + +#ifdef CLANG_BUILD + #define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b) + #define SRAI_H(a, b) __msa_srai_h((v8i16)a, b) + #define SRAI_W(a, b) __msa_srai_w((v4i32)a, b) +#else + #define ADDVI_H(a, b) (a + b) + #define SRAI_H(a, b) (a >> b) + #define SRAI_W(a, b) (a >> b) +#endif + +#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UW(...) LD_W(v4u32, __VA_ARGS__) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UW(...) ST_W(v4u32, __VA_ARGS__) +#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) + +#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME) \ + static inline TYPE FUNC_NAME(const void* const psrc) { \ + const uint8_t* const psrc_m = (const uint8_t*)psrc; \ + TYPE val_m; \ + asm volatile ( \ + "" #INSTR " %[val_m], %[psrc_m] \n\t" \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m)); \ + return val_m; \ + } + +#define MSA_LOAD(psrc, FUNC_NAME) FUNC_NAME(psrc) + +#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME) \ + static inline void FUNC_NAME(TYPE val, void* const pdst) { \ + uint8_t* const pdst_m = (uint8_t*)pdst; \ + TYPE val_m = val; \ + asm volatile ( \ + " " #INSTR " %[val_m], %[pdst_m] \n\t" \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m)); \ + } + +#define MSA_STORE(val, pdst, FUNC_NAME) FUNC_NAME(val, pdst) + +#if (__mips_isa_rev >= 6) + MSA_LOAD_FUNC(uint16_t, lh, msa_lh); + #define LH(psrc) MSA_LOAD(psrc, msa_lh) + MSA_LOAD_FUNC(uint32_t, lw, msa_lw); + #define LW(psrc) MSA_LOAD(psrc, msa_lw) + #if (__mips == 64) + MSA_LOAD_FUNC(uint64_t, ld, msa_ld); + #define LD(psrc) MSA_LOAD(psrc, msa_ld) + #else // !(__mips == 64) + #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_lw)) << 32) | \ + MSA_LOAD(psrc, msa_lw)) + #endif // (__mips == 64) + + MSA_STORE_FUNC(uint16_t, sh, msa_sh); + #define SH(val, pdst) MSA_STORE(val, pdst, msa_sh) + MSA_STORE_FUNC(uint32_t, sw, msa_sw); + #define SW(val, pdst) MSA_STORE(val, pdst, msa_sw) + MSA_STORE_FUNC(uint64_t, sd, msa_sd); + #define SD(val, pdst) MSA_STORE(val, pdst, msa_sd) +#else // !(__mips_isa_rev >= 6) + MSA_LOAD_FUNC(uint16_t, ulh, msa_ulh); + #define LH(psrc) MSA_LOAD(psrc, msa_ulh) + MSA_LOAD_FUNC(uint32_t, ulw, msa_ulw); + #define LW(psrc) MSA_LOAD(psrc, msa_ulw) + #if (__mips == 64) + MSA_LOAD_FUNC(uint64_t, uld, msa_uld); + #define LD(psrc) MSA_LOAD(psrc, msa_uld) + #else // !(__mips == 64) + #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) | \ + MSA_LOAD(psrc, msa_ulw)) + #endif // (__mips == 64) + + MSA_STORE_FUNC(uint16_t, ush, msa_ush); + #define SH(val, pdst) MSA_STORE(val, pdst, msa_ush) + MSA_STORE_FUNC(uint32_t, usw, msa_usw); + #define SW(val, pdst) MSA_STORE(val, pdst, msa_usw) + #define SD(val, pdst) { \ + uint8_t* const pdst_sd_m = (uint8_t*)(pdst); \ + const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF); \ + const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1, out2, out3 + * Details : Load word in 'out0' from (psrc) + * Load word in 'out1' from (psrc + stride) + * Load word in 'out2' from (psrc + 2 * stride) + * Load word in 'out3' from (psrc + 3 * stride) + */ +#define LW4(psrc, stride, out0, out1, out2, out3) { \ + const uint8_t* ptmp = (const uint8_t*)psrc; \ + out0 = LW(ptmp); \ + ptmp += stride; \ + out1 = LW(ptmp); \ + ptmp += stride; \ + out2 = LW(ptmp); \ + ptmp += stride; \ + out3 = LW(ptmp); \ +} + +/* Description : Store 4 words with stride + * Arguments : Inputs - in0, in1, in2, in3, pdst, stride + * Details : Store word from 'in0' to (pdst) + * Store word from 'in1' to (pdst + stride) + * Store word from 'in2' to (pdst + 2 * stride) + * Store word from 'in3' to (pdst + 3 * stride) + */ +#define SW4(in0, in1, in2, in3, pdst, stride) { \ + uint8_t* ptmp = (uint8_t*)pdst; \ + SW(in0, ptmp); \ + ptmp += stride; \ + SW(in1, ptmp); \ + ptmp += stride; \ + SW(in2, ptmp); \ + ptmp += stride; \ + SW(in3, ptmp); \ +} + +/* Description : Load vectors with 16 byte elements with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Load 16 byte elements in 'out0' from (psrc) + * Load 16 byte elements in 'out1' from (psrc + stride) + */ +#define LD_B2(RTYPE, psrc, stride, out0, out1) { \ + out0 = LD_B(RTYPE, psrc); \ + out1 = LD_B(RTYPE, psrc + stride); \ +} +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ + LD_B2(RTYPE, psrc, stride, out0, out1); \ + LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3); \ +} +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1 + * Details : Load 8 halfword elements in 'out0' from (psrc) + * Load 8 halfword elements in 'out1' from (psrc + stride) + */ +#define LD_H2(RTYPE, psrc, stride, out0, out1) { \ + out0 = LD_H(RTYPE, psrc); \ + out1 = LD_H(RTYPE, psrc + stride); \ +} +#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__) +#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) + +/* Description : Store 4x4 byte block to destination memory from input vector + * Arguments : Inputs - in0, in1, pdst, stride + * Details : 'Idx0' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst) + * 'Idx1' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + stride) + * 'Idx2' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + 2 * stride) + * 'Idx3' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + 3 * stride) + */ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ + uint8_t* const pblk_4x4_m = (uint8_t*)pdst; \ + const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0); \ + const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1); \ + const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2); \ + const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3); \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ +} + +/* Description : Immediate number of elements to slide + * Arguments : Inputs - in0, in1, slide_val + * Outputs - out + * Return Type - as per RTYPE + * Details : Byte elements from 'in1' vector are slid into 'in0' by + * value specified in the 'slide_val' + */ +#define SLDI_B(RTYPE, in0, in1, slide_val) \ + (RTYPE)__msa_sldi_b((v16i8)in0, (v16i8)in1, slide_val) \ + +#define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__) +#define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__) +#define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__) + +/* Description : Shuffle halfword vector elements as per mask vector + * Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : halfword elements from 'in0' & 'in1' are copied selectively to + * 'out0' as per control vector 'mask0' + */ +#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ + out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \ +} +#define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__) +#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + * between 0 & 255 + * Arguments : Input/output - val + * Return Type - signed halfword + */ +#define CLIP_SH_0_255(val) { \ + const v8i16 max_m = __msa_ldi_h(255); \ + val = __msa_maxi_s_h((v8i16)val, 0); \ + val = __msa_min_s_h(max_m, (v8i16)val); \ +} +#define CLIP_SH2_0_255(in0, in1) { \ + CLIP_SH_0_255(in0); \ + CLIP_SH_0_255(in1); \ +} + +/* Description : Clips all signed word elements of input vector + * between 0 & 255 + * Arguments : Input/output - val + * Return Type - signed word + */ +#define CLIP_SW_0_255(val) { \ + const v4i32 max_m = __msa_ldi_w(255); \ + val = __msa_maxi_s_w((v4i32)val, 0); \ + val = __msa_min_s_w(max_m, (v4i32)val); \ +} +#define CLIP_SW4_0_255(in0, in1, in2, in3) { \ + CLIP_SW_0_255(in0); \ + CLIP_SW_0_255(in1); \ + CLIP_SW_0_255(in2); \ + CLIP_SW_0_255(in3); \ +} + +/* Description : Set element n input vector to GPR value + * Arguments : Inputs - in0, in1, in2, in3 + * Output - out + * Return Type - as per RTYPE + * Details : Set element 0 in vector 'out' to value specified in 'in0' + */ +#define INSERT_W2(RTYPE, in0, in1, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ +} +#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__) +#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) + +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ +} +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) +#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of byte elements of 'in0' and 'in1' are interleaved + * and written to out0. + */ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ +} +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) +#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) +#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of halfword elements of 'in0' and 'in1' are + * interleaved and written to 'out0'. + */ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ +} +#define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__) +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__) +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) +#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of double word elements of 'in0' and 'in1' are + * interleaved and written to 'out0'. + */ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3); \ +} +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ +} +#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__) +#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__) +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) +#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ +} +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Even byte elements of 'in0' are copied to the left half of + * 'out0' & even byte elements of 'in1' are copied to the right + * half of 'out0'. + */ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ +} +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) +#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__) + +/* Description : Arithmetic immediate shift right all elements of word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRAI_W2(RTYPE, in0, in1, shift_val) { \ + in0 = (RTYPE)SRAI_W(in0, shift_val); \ + in1 = (RTYPE)SRAI_W(in1, shift_val); \ +} +#define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__) +#define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__) + +#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) { \ + SRAI_W2(RTYPE, in0, in1, shift_val); \ + SRAI_W2(RTYPE, in2, in3, shift_val); \ +} +#define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__) +#define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__) + +/* Description : Arithmetic shift right all elements of half-word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRAI_H2(RTYPE, in0, in1, shift_val) { \ + in0 = (RTYPE)SRAI_H(in0, shift_val); \ + in1 = (RTYPE)SRAI_H(in1, shift_val); \ +} +#define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__) +#define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__) + +/* Description : Arithmetic rounded shift right all elements of word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRARI_W2(RTYPE, in0, in1, shift) { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ +} +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ +} +#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__) +#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__) +#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Addition of 2 pairs of half-word vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element in 'in0' is added to 'in1' and result is written + * to 'out0'. + */ +#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)ADDVI_H(in0, in1); \ + out1 = (RTYPE)ADDVI_H(in2, in3); \ +} +#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__) +#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__) + +/* Description : Addition of 2 pairs of vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element in 'in0' is added to 'in1' and result is written + * to 'out0'. + */ +#define ADD2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Sign extend halfword elements from input vector and return + * the result in pair of vectors + * Arguments : Input - in (halfword vector) + * Outputs - out0, out1 (sign extended word vectors) + * Return Type - signed word + * Details : Sign bit of halfword elements from input vector 'in' is + * extracted and interleaved right with same vector 'in0' to + * generate 4 signed word elements in 'out0' + * Then interleaved left with same vector 'in0' to + * generate 4 signed word elements in 'out1' + */ +#define UNPCK_SH_SW(in, out0, out1) { \ + const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ +} + +/* Description : Butterfly of 4 input vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : Butterfly operation + */ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ +} + +/* Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Return Type - as per RTYPE + */ +#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ +} +#define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__) + +/* Description : Add block 4x4 + * Arguments : Inputs - in0, in1, in2, in3, pdst, stride + * Details : Least significant 4 bytes from each input vector are added to + * the destination bytes, clipped between 0-255 and stored. + */ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + const v16i8 zero_m = { 0 }; \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ +} + +#endif /* WEBP_DSP_MSA_MACRO_H_ */ From 2abfa54f9554125df6327a15c492487f4d2f6524 Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Thu, 16 Jun 2016 22:24:30 -0700 Subject: [PATCH 4/4] DecodeImageData(): change the incorrect assert this function can be called not to decode pixels, but simply to finish processing (through process_func()) the already decoded pixels. Change-Id: I80485e92e3c47f0aa3389476dcb82745a243fc4a --- src/dec/vp8l.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dec/vp8l.c b/src/dec/vp8l.c index 164d2d03..e154a0af 100644 --- a/src/dec/vp8l.c +++ b/src/dec/vp8l.c @@ -1055,7 +1055,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data, const int mask = hdr->huffman_mask_; const HTreeGroup* htree_group = (src < src_last) ? GetHtreeGroupForPos(hdr, col, row) : NULL; - assert(src < src_end); + assert(dec->last_row_ < last_row); assert(src_last <= src_end); while (src < src_last) {