add some colorspace conversion functions in NEON

new file: lossless_neon.c
speedup is ~5%

gcc 4.6.3 seems to be doing some sub-optimal things here,
storing register on stack using 'vstmia' and such.
Looks similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=51509

I've tried adding  -fno-split-wide-types and it does help
the generated assembly. But the overall speed gets worse with
this flag. We should only compile lossless_neon.c with it -> urk.

Change-Id: I2ccc0929f5ef9dfb0105960e65c0b79b5f18d3b0
This commit is contained in:
skal 2014-03-31 16:36:33 +02:00
parent daccbf400d
commit 97e5fac389
6 changed files with 96 additions and 1 deletions

View File

@ -79,8 +79,9 @@ ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
# instructions to be generated for armv7a code. Instead target the neon code
# specifically.
LOCAL_SRC_FILES += src/dsp/dec_neon.c.neon
LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
LOCAL_SRC_FILES += src/dsp/enc_neon.c.neon
LOCAL_SRC_FILES += src/dsp/lossless_neon.c.neon
LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
endif
LOCAL_STATIC_LIBRARIES := cpufeatures

View File

@ -174,6 +174,7 @@ DSP_DEC_OBJS = \
$(DIROBJ)\dsp\dec_neon.obj \
$(DIROBJ)\dsp\dec_sse2.obj \
$(DIROBJ)\dsp\lossless.obj \
$(DIROBJ)\dsp\lossless_neon.obj \
$(DIROBJ)\dsp\lossless_sse2.obj \
$(DIROBJ)\dsp\upsampling.obj \
$(DIROBJ)\dsp\upsampling_mips32.obj \

View File

@ -69,6 +69,10 @@ EXTRA_FLAGS += -Wdeclaration-after-statement
EXTRA_FLAGS += -Wshadow
# EXTRA_FLAGS += -Wvla
# NEON-specific flags:
# EXTRA_FLAGS += -march=armv7-a -mfloat-abi=hard -mfpu=neon -mtune=cortex-a8
# -> seems to make the overall lib slower: -fno-split-wide-types
#### Nothing should normally be changed below this line ####
AR = ar
@ -105,6 +109,7 @@ DSP_DEC_OBJS = \
src/dsp/dec_neon.o \
src/dsp/dec_sse2.o \
src/dsp/lossless.o \
src/dsp/lossless_neon.o \
src/dsp/lossless_sse2.o \
src/dsp/upsampling.o \
src/dsp/upsampling_mips32.o \

View File

@ -17,6 +17,7 @@ COMMON_SOURCES += dec_neon.c
COMMON_SOURCES += dec_sse2.c
COMMON_SOURCES += dsp.h
COMMON_SOURCES += lossless.c
COMMON_SOURCES += lossless_neon.c
COMMON_SOURCES += lossless_sse2.c
COMMON_SOURCES += lossless.h
COMMON_SOURCES += upsampling.c

View File

@ -1475,6 +1475,7 @@ VP8LConvertFunc VP8LConvertBGRAToRGB565;
VP8LConvertFunc VP8LConvertBGRAToBGR;
extern void VP8LDspInitSSE2(void);
extern void VP8LDspInitNEON(void);
void VP8LDspInit(void) {
memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
@ -1494,6 +1495,11 @@ void VP8LDspInit(void) {
if (VP8GetCPUInfo(kSSE2)) {
VP8LDspInitSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8LDspInitNEON();
}
#endif
}
}

81
src/dsp/lossless_neon.c Normal file
View File

@ -0,0 +1,81 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include "./lossless.h"
//------------------------------------------------------------------------------
// Colorspace conversion functions
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + num_pixels - 16;
for (; src <= end; src += 16) {
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
// swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
const uint8x16_t tmp = pixel.val[0];
pixel.val[0] = pixel.val[2];
pixel.val[2] = tmp;
vst4q_u8(dst, pixel);
dst += 64;
}
num_pixels &= 15;
VP8LConvertBGRAToRGBA_C(src, num_pixels, dst); // left-overs
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + num_pixels - 16;
for (; src <= end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
vst3q_u8(dst, tmp);
dst += 48;
}
num_pixels &= 15;
VP8LConvertBGRAToBGR_C(src, num_pixels, dst); // left-overs
}
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + num_pixels - 16;
for (; src <= end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
vst3q_u8(dst, tmp);
dst += 48;
}
num_pixels &= 15;
VP8LConvertBGRAToRGB_C(src, num_pixels, dst); // left-overs
}
#endif // WEBP_USE_NEON
//------------------------------------------------------------------------------
extern void VP8LDspInitNEON(void);
void VP8LDspInitNEON(void) {
#if defined(WEBP_USE_NEON)
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
#endif // WEBP_USE_NEON
}
//------------------------------------------------------------------------------