diff --git a/Android.mk b/Android.mk
index a626b4b7..20d1d691 100644
--- a/Android.mk
+++ b/Android.mk
@@ -79,8 +79,9 @@ ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
   # instructions to be generated for armv7a code. Instead target the neon code
   # specifically.
   LOCAL_SRC_FILES += src/dsp/dec_neon.c.neon
-  LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
   LOCAL_SRC_FILES += src/dsp/enc_neon.c.neon
+  LOCAL_SRC_FILES += src/dsp/lossless_neon.c.neon
+  LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
 endif
 
 LOCAL_STATIC_LIBRARIES := cpufeatures
diff --git a/Makefile.vc b/Makefile.vc
index d18fa32d..237d501b 100644
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -174,6 +174,7 @@ DSP_DEC_OBJS = \
     $(DIROBJ)\dsp\dec_neon.obj \
     $(DIROBJ)\dsp\dec_sse2.obj \
     $(DIROBJ)\dsp\lossless.obj \
+    $(DIROBJ)\dsp\lossless_neon.obj \
     $(DIROBJ)\dsp\lossless_sse2.obj \
     $(DIROBJ)\dsp\upsampling.obj \
     $(DIROBJ)\dsp\upsampling_mips32.obj \
diff --git a/makefile.unix b/makefile.unix
index 630b0a43..30b96cc9 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -69,6 +69,10 @@ EXTRA_FLAGS += -Wdeclaration-after-statement
 EXTRA_FLAGS += -Wshadow
 # EXTRA_FLAGS += -Wvla
 
+# NEON-specific flags:
+# EXTRA_FLAGS += -march=armv7-a -mfloat-abi=hard -mfpu=neon -mtune=cortex-a8
+# -> seems to make the overall lib slower: -fno-split-wide-types
+
 #### Nothing should normally be changed below this line ####
 
 AR = ar
@@ -105,6 +109,7 @@ DSP_DEC_OBJS = \
     src/dsp/dec_neon.o \
     src/dsp/dec_sse2.o \
     src/dsp/lossless.o \
+    src/dsp/lossless_neon.o \
     src/dsp/lossless_sse2.o \
     src/dsp/upsampling.o \
     src/dsp/upsampling_mips32.o \
diff --git a/src/dsp/Makefile.am b/src/dsp/Makefile.am
index 7b883dcd..0b329093 100644
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@@ -17,6 +17,7 @@ COMMON_SOURCES += dec_neon.c
 COMMON_SOURCES += dec_sse2.c
 COMMON_SOURCES += dsp.h
 COMMON_SOURCES += lossless.c
+COMMON_SOURCES += lossless_neon.c
 COMMON_SOURCES += lossless_sse2.c
 COMMON_SOURCES += lossless.h
 COMMON_SOURCES += upsampling.c
diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c
index 9542bf80..cde4a8de 100644
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -1475,6 +1475,7 @@ VP8LConvertFunc VP8LConvertBGRAToRGB565;
 VP8LConvertFunc VP8LConvertBGRAToBGR;
 
 extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitNEON(void);
 
 void VP8LDspInit(void) {
   memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
@@ -1494,6 +1495,11 @@ void VP8LDspInit(void) {
     if (VP8GetCPUInfo(kSSE2)) {
       VP8LDspInitSSE2();
     }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8LDspInitNEON();
+    }
 #endif
   }
 }
diff --git a/src/dsp/lossless_neon.c b/src/dsp/lossless_neon.c
new file mode 100644
index 00000000..ed5e1a7d
--- /dev/null
+++ b/src/dsp/lossless_neon.c
@@ -0,0 +1,88 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of methods for lossless decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
+#include "./lossless.h"
+
+//------------------------------------------------------------------------------
+// Colorspace conversion functions
+//
+// Each converter handles full groups of 16 pixels with NEON and hands the
+// remaining (num_pixels & 15) pixels to the matching C implementation.
+// The loop bound 'src + (num_pixels & ~15)' never points outside the input
+// buffer, even when num_pixels < 16. num_pixels must be >= 0.
+
+// Copies pixels from 'src' to 'dst', swapping bytes 0 and 2 of each pixel.
+static void ConvertBGRAToRGBA(const uint32_t* src,
+                              int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + (num_pixels & ~15);
+  for (; src < end; src += 16) {
+    uint8x16x4_t pixel = vld4q_u8((const uint8_t*)src);
+    // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
+    const uint8x16_t tmp = pixel.val[0];
+    pixel.val[0] = pixel.val[2];
+    pixel.val[2] = tmp;
+    vst4q_u8(dst, pixel);
+    dst += 64;   // 16 pixels * 4 bytes
+  }
+  VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
+}
+
+// Writes bytes 0, 1, 2 of each 'src' pixel to 'dst', dropping byte 3.
+static void ConvertBGRAToBGR(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + (num_pixels & ~15);
+  for (; src < end; src += 16) {
+    const uint8x16x4_t pixel = vld4q_u8((const uint8_t*)src);
+    const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
+    vst3q_u8(dst, tmp);
+    dst += 48;   // 16 pixels * 3 bytes
+  }
+  VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
+}
+
+// Writes bytes 2, 1, 0 of each 'src' pixel to 'dst', dropping byte 3.
+static void ConvertBGRAToRGB(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + (num_pixels & ~15);
+  for (; src < end; src += 16) {
+    const uint8x16x4_t pixel = vld4q_u8((const uint8_t*)src);
+    const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
+    vst3q_u8(dst, tmp);
+    dst += 48;   // 16 pixels * 3 bytes
+  }
+  VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst);  // left-overs
+}
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+
+extern void VP8LDspInitNEON(void);
+
+// Installs the NEON converters above into the VP8LConvert* function pointers.
+// Does nothing when WEBP_USE_NEON is not defined.
+void VP8LDspInitNEON(void) {
+#if defined(WEBP_USE_NEON)
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+#endif   // WEBP_USE_NEON
+}
+
+//------------------------------------------------------------------------------