add some colorspace conversion functions in NEON

new file: lossless_neon.c speedup is ~5% gcc 4.6.3 seems to be doing some sub-optimal things here, storing register on stack using 'vstmia' and such. Looks similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=51509 I've tried adding -fno-split-wide-types and it does help the generated assembly. But the overall speed gets worse with this flag. We should only compile lossless_neon.c with it -> urk. Change-Id: I2ccc0929f5ef9dfb0105960e65c0b79b5f18d3b0
2025-07-26 18:59:48 +02:00 · 2014-03-31 16:36:33 +02:00
parent daccbf400d
commit 97e5fac389
6 changed files with 96 additions and 1 deletions
--- a/Android.mk
+++ b/Android.mk
@ -79,8 +79,9 @@ ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
  # instructions to be generated for armv7a code. Instead target the neon code
  # specifically.
  LOCAL_SRC_FILES += src/dsp/dec_neon.c.neon
  LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
  LOCAL_SRC_FILES += src/dsp/enc_neon.c.neon
  LOCAL_SRC_FILES += src/dsp/lossless_neon.c.neon
  LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
 endif
 LOCAL_STATIC_LIBRARIES := cpufeatures
--- a/Makefile.vc
+++ b/Makefile.vc
@ -174,6 +174,7 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\dec_neon.obj \
    $(DIROBJ)\dsp\dec_sse2.obj \
    $(DIROBJ)\dsp\lossless.obj \
    $(DIROBJ)\dsp\lossless_neon.obj \
    $(DIROBJ)\dsp\lossless_sse2.obj \
    $(DIROBJ)\dsp\upsampling.obj \
    $(DIROBJ)\dsp\upsampling_mips32.obj \
--- a/makefile.unix
+++ b/makefile.unix
@ -69,6 +69,10 @@ EXTRA_FLAGS += -Wdeclaration-after-statement
 EXTRA_FLAGS += -Wshadow
 # EXTRA_FLAGS += -Wvla
 # NEON-specific flags:
 # EXTRA_FLAGS += -march=armv7-a -mfloat-abi=hard -mfpu=neon -mtune=cortex-a8
 # -> seems to make the overall lib slower: -fno-split-wide-types
 #### Nothing should normally be changed below this line ####
 AR = ar
@ -105,6 +109,7 @@ DSP_DEC_OBJS = \
    src/dsp/dec_neon.o \
    src/dsp/dec_sse2.o \
    src/dsp/lossless.o \
    src/dsp/lossless_neon.o \
    src/dsp/lossless_sse2.o \
    src/dsp/upsampling.o \
    src/dsp/upsampling_mips32.o \
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -17,6 +17,7 @@ COMMON_SOURCES += dec_neon.c
 COMMON_SOURCES += dec_sse2.c
 COMMON_SOURCES += dsp.h
 COMMON_SOURCES += lossless.c
 COMMON_SOURCES += lossless_neon.c
 COMMON_SOURCES += lossless_sse2.c
 COMMON_SOURCES += lossless.h
 COMMON_SOURCES += upsampling.c
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@ -1475,6 +1475,7 @@ VP8LConvertFunc VP8LConvertBGRAToRGB565;
 VP8LConvertFunc VP8LConvertBGRAToBGR;
 extern void VP8LDspInitSSE2(void);
 extern void VP8LDspInitNEON(void);
 void VP8LDspInit(void) {
  memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
@ -1494,6 +1495,11 @@ void VP8LDspInit(void) {
    if (VP8GetCPUInfo(kSSE2)) {
      VP8LDspInitSSE2();
    }
 #endif
 #if defined(WEBP_USE_NEON)
    if (VP8GetCPUInfo(kNEON)) {
      VP8LDspInitNEON();
    }
 #endif
  }
 }
--- a/src/dsp/lossless_neon.c
+++ b/src/dsp/lossless_neon.c
@ -0,0 +1,81 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // NEON variant of methods for lossless decoder
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./dsp.h"
 #if defined(WEBP_USE_NEON)
 #include <arm_neon.h>
 #include "./lossless.h"
 //------------------------------------------------------------------------------
 // Colorspace conversion functions
 static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + num_pixels - 16;
  for (; src <= end; src += 16) {
    uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
    const uint8x16_t tmp = pixel.val[0];
    pixel.val[0] = pixel.val[2];
    pixel.val[2] = tmp;
    vst4q_u8(dst, pixel);
    dst += 64;
  }
  num_pixels &= 15;
  VP8LConvertBGRAToRGBA_C(src, num_pixels, dst);  // left-overs
 }
 static void ConvertBGRAToBGR(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + num_pixels - 16;
  for (; src <= end; src += 16) {
    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
    vst3q_u8(dst, tmp);
    dst += 48;
  }
  num_pixels &= 15;
  VP8LConvertBGRAToBGR_C(src, num_pixels, dst);  // left-overs
 }
 static void ConvertBGRAToRGB(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + num_pixels - 16;
  for (; src <= end; src += 16) {
    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
    vst3q_u8(dst, tmp);
    dst += 48;
  }
  num_pixels &= 15;
  VP8LConvertBGRAToRGB_C(src, num_pixels, dst);  // left-overs
 }
 #endif   // WEBP_USE_NEON
 //------------------------------------------------------------------------------
 extern void VP8LDspInitNEON(void);
 void VP8LDspInitNEON(void) {
 #if defined(WEBP_USE_NEON)
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
 #endif   // WEBP_USE_NEON
 }
 //------------------------------------------------------------------------------