add some colorspace conversion functions in NEON

new file: lossless_neon.c speedup is ~5% gcc 4.6.3 seems to be doing some sub-optimal things here, storing register on stack using 'vstmia' and such. Looks similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=51509 I've tried adding -fno-split-wide-types and it does help the generated assembly. But the overall speed gets worse with this flag. We should only compile lossless_neon.c with it -> urk. Change-Id: I2ccc0929f5ef9dfb0105960e65c0b79b5f18d3b0
2026-04-09 14:22:31 +02:00 · 2014-03-31 16:36:33 +02:00
parent daccbf400d
commit 97e5fac389
6 changed files with 96 additions and 1 deletions
--- a/Android.mk
+++ b/Android.mk
@@ -79,8 +79,9 @@ ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
  # instructions to be generated for armv7a code. Instead target the neon code
  # specifically.
  LOCAL_SRC_FILES += src/dsp/dec_neon.c.neon
-  LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
  LOCAL_SRC_FILES += src/dsp/enc_neon.c.neon
+  LOCAL_SRC_FILES += src/dsp/lossless_neon.c.neon
+  LOCAL_SRC_FILES += src/dsp/upsampling_neon.c.neon
 endif
 LOCAL_STATIC_LIBRARIES := cpufeatures

--- a/Makefile.vc
+++ b/Makefile.vc
@@ -174,6 +174,7 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\dec_neon.obj \
    $(DIROBJ)\dsp\dec_sse2.obj \
    $(DIROBJ)\dsp\lossless.obj \
+    $(DIROBJ)\dsp\lossless_neon.obj \
    $(DIROBJ)\dsp\lossless_sse2.obj \
    $(DIROBJ)\dsp\upsampling.obj \
    $(DIROBJ)\dsp\upsampling_mips32.obj \
--- a/makefile.unix
+++ b/makefile.unix
@@ -69,6 +69,10 @@ EXTRA_FLAGS += -Wdeclaration-after-statement
 EXTRA_FLAGS += -Wshadow
 # EXTRA_FLAGS += -Wvla

+# NEON-specific flags:
+# EXTRA_FLAGS += -march=armv7-a -mfloat-abi=hard -mfpu=neon -mtune=cortex-a8
+# -> seems to make the overall lib slower: -fno-split-wide-types
+
 #### Nothing should normally be changed below this line ####

 AR = ar
@@ -105,6 +109,7 @@ DSP_DEC_OBJS = \
    src/dsp/dec_neon.o \
    src/dsp/dec_sse2.o \
    src/dsp/lossless.o \
+    src/dsp/lossless_neon.o \
    src/dsp/lossless_sse2.o \
    src/dsp/upsampling.o \
    src/dsp/upsampling_mips32.o \
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@@ -17,6 +17,7 @@ COMMON_SOURCES += dec_neon.c
 COMMON_SOURCES += dec_sse2.c
 COMMON_SOURCES += dsp.h
 COMMON_SOURCES += lossless.c
+COMMON_SOURCES += lossless_neon.c
 COMMON_SOURCES += lossless_sse2.c
 COMMON_SOURCES += lossless.h
 COMMON_SOURCES += upsampling.c
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -1475,6 +1475,7 @@ VP8LConvertFunc VP8LConvertBGRAToRGB565;
 VP8LConvertFunc VP8LConvertBGRAToBGR;

 extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitNEON(void);

 void VP8LDspInit(void) {
  memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
@@ -1494,6 +1495,11 @@ void VP8LDspInit(void) {
    if (VP8GetCPUInfo(kSSE2)) {
      VP8LDspInitSSE2();
    }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8LDspInitNEON();
+    }
 #endif
  }
 }
--- a/src/dsp/lossless_neon.c
+++ b/src/dsp/lossless_neon.c
@@ -0,0 +1,81 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of methods for lossless decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
+#include "./lossless.h"
+
+//------------------------------------------------------------------------------
+// Colorspace conversion functions
+
+static void ConvertBGRAToRGBA(const uint32_t* src,
+                              int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + num_pixels - 16;
+  for (; src <= end; src += 16) {
+    uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
+    // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
+    const uint8x16_t tmp = pixel.val[0];
+    pixel.val[0] = pixel.val[2];
+    pixel.val[2] = tmp;
+    vst4q_u8(dst, pixel);
+    dst += 64;
+  }
+  num_pixels &= 15;
+  VP8LConvertBGRAToRGBA_C(src, num_pixels, dst);  // left-overs
+}
+
+static void ConvertBGRAToBGR(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + num_pixels - 16;
+  for (; src <= end; src += 16) {
+    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
+    const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
+    vst3q_u8(dst, tmp);
+    dst += 48;
+  }
+  num_pixels &= 15;
+  VP8LConvertBGRAToBGR_C(src, num_pixels, dst);  // left-overs
+}
+
+static void ConvertBGRAToRGB(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const end = src + num_pixels - 16;
+  for (; src <= end; src += 16) {
+    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
+    const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
+    vst3q_u8(dst, tmp);
+    dst += 48;
+  }
+  num_pixels &= 15;
+  VP8LConvertBGRAToRGB_C(src, num_pixels, dst);  // left-overs
+}
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+
+extern void VP8LDspInitNEON(void);
+
+void VP8LDspInitNEON(void) {
+#if defined(WEBP_USE_NEON)
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+#endif   // WEBP_USE_NEON
+}
+
+//------------------------------------------------------------------------------