libwebp/src/dsp/neon.h

// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
//  NEON common code.

#ifndef WEBP_DSP_NEON_H_
#define WEBP_DSP_NEON_H_

#include <arm_neon.h>

#include "./dsp.h"

// Some of the intrinsics-based functions currently seem slower, so intrinsics
// are left disabled everywhere but aarch64, where the inline assembly is
// incompatible and cannot be used.
#ifdef __aarch64__
#define WEBP_USE_INTRINSICS   // use intrinsics when possible
#endif

// Initializes the two-element vector aggregate 'v' (any object exposing a
// 'val' array member, e.g. an NxMx2 NEON type): val[0] = a, val[1] = b.
// Every parameter is parenthesized so that arguments such as '*ptr' or
// 'x + y' expand with the intended precedence.
// Note: 'v' is expanded once per element, so it must not have side effects.
#define INIT_VECTOR2(v, a, b) do {  \
  (v).val[0] = (a);                 \
  (v).val[1] = (b);                 \
} while (0)

// Initializes the three-element vector aggregate 'v' (any object exposing a
// 'val' array member, e.g. an NxMx3 NEON type): val[0] = a, val[1] = b,
// val[2] = c.
// Every parameter is parenthesized so that arguments such as '*ptr' or
// 'x + y' expand with the intended precedence.
// Note: 'v' is expanded once per element, so it must not have side effects.
#define INIT_VECTOR3(v, a, b, c) do {  \
  (v).val[0] = (a);                    \
  (v).val[1] = (b);                    \
  (v).val[2] = (c);                    \
} while (0)

// Initializes the four-element vector aggregate 'v' (any object exposing a
// 'val' array member, e.g. an NxMx4 NEON type): val[0] = a, val[1] = b,
// val[2] = c, val[3] = d.
// Every parameter is parenthesized so that arguments such as '*ptr' or
// 'x + y' expand with the intended precedence.
// Note: 'v' is expanded once per element, so it must not have side effects.
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
} while (0)

// When intrinsics are in use, WORK_AROUND_GCC flags the functions to avoid
// because they make gcc-4.6.3 crash
// ("internal compiler error: in immed_double_const, at emit-rtl.").
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
// The flag is set unless LOCAL_GCC_PREREQ(4,8) holds or the target is aarch64.
#if !(LOCAL_GCC_PREREQ(4,8)) && !defined(__aarch64__)
#define WORK_AROUND_GCC
#endif

// Returns the transpose of the 4x4 matrix of 32-bit lanes held in 'rows':
// lane j of output vector i is lane i of input vector j.
// Done in two stages: swap the off-diagonal 64-bit halves between rows
// (0,2) and (1,3), then interleave 32-bit lanes pairwise with vtrnq_s32.
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
  // View each row as two 64-bit lanes so halves can be moved as single units.
  const uint64x2_t r0 = vreinterpretq_u64_s32(rows.val[0]);
  const uint64x2_t r1 = vreinterpretq_u64_s32(rows.val[1]);
  const uint64x2_t r2 = vreinterpretq_u64_s32(rows.val[2]);
  const uint64x2_t r3 = vreinterpretq_u64_s32(rows.val[3]);
  // Transpose the 64-bit values (there's no vswp equivalent): pair up the low
  // halves of rows 0/2 and 1/3, and likewise their high halves.
  const uint64x2_t lo02 = vcombine_u64(vget_low_u64(r0), vget_low_u64(r2));
  const uint64x2_t hi02 = vcombine_u64(vget_high_u64(r0), vget_high_u64(r2));
  const uint64x2_t lo13 = vcombine_u64(vget_low_u64(r1), vget_low_u64(r3));
  const uint64x2_t hi13 = vcombine_u64(vget_high_u64(r1), vget_high_u64(r3));
  // Transpose the 32-bit lanes within each pair of combined vectors.
  const int32x4x2_t cols01 = vtrnq_s32(vreinterpretq_s32_u64(lo02),
                                       vreinterpretq_s32_u64(lo13));
  const int32x4x2_t cols23 = vtrnq_s32(vreinterpretq_s32_u64(hi02),
                                       vreinterpretq_s32_u64(hi13));
  int32x4x4_t out;
  out.val[0] = cols01.val[0];
  out.val[1] = cols01.val[1];
  out.val[2] = cols23.val[0];
  out.val[3] = cols23.val[1];
  return out;
}

#endif  // WEBP_DSP_NEON_H_
enc_neon: move Transpose4x4 to dsp/neon.h + reuse it in TransformWHT() Change-Id: Idfbd0f9b58d6253ac3d65ba55b58989c427ee989 2014-04-26 21:11:00 +02:00			`// Copyright 2014 Google Inc. All Rights Reserved.`
			`//`
			`// Use of this source code is governed by a BSD-style license`
			`// that can be found in the COPYING file in the root of the source`
			`// tree. An additional intellectual property rights grant can be found`
			`// in the file PATENTS. All contributing project authors may`
			`// be found in the AUTHORS file in the root of the source tree.`
			`// -----------------------------------------------------------------------------`
			`//`
			`// NEON common code.`

			`#ifndef WEBP_DSP_NEON_H_`
			`#define WEBP_DSP_NEON_H_`

			`#include <arm_neon.h>`

dsp/neon.h: coalesce intrinsics-related defines Change-Id: Ifadd41a5bbf7f99eeb6d75d2b67daa25e0544946 2014-05-03 20:34:07 +02:00			`#include "./dsp.h"`

enable NEON intrinsics in aarch64 builds avoids functions that use vtbl? as in iOS builds these are marked unavailable Change-Id: I17aedc3c7dc8f1d5be0941205de0b22c3772ef1b 2014-05-03 05:38:21 +02:00			`// Right now, some intrinsics functions seem slower, so we disable them`
			`// everywhere except aarch64 where the inline assembly is incompatible.`
			`#if defined(__aarch64__)`
dsp: s/USE_INTRINSICS/WEBP_USE_INTRINSICS/ for consistency with other defines shared across modules Change-Id: I30cdb9f892e9ea48265883f560500ffb1d6799ee 2015-01-12 23:26:08 +01:00			`#define WEBP_USE_INTRINSICS // use intrinsics when possible`
enable NEON intrinsics in aarch64 builds avoids functions that use vtbl? as in iOS builds these are marked unavailable Change-Id: I17aedc3c7dc8f1d5be0941205de0b22c3772ef1b 2014-05-03 05:38:21 +02:00			`#endif`
dsp/neon.h: coalesce intrinsics-related defines Change-Id: Ifadd41a5bbf7f99eeb6d75d2b67daa25e0544946 2014-05-03 20:34:07 +02:00
neon: add INIT_VECTOR2 used to initialize NxMx2 vector types replaces initialization via '{{ }}' gnu-ism. Change-Id: I4accc305c7dd4c886b63c22e38890b629bffb139 2014-06-29 22:40:45 +02:00			`#define INIT_VECTOR2(v, a, b) do { \`
			`v.val[0] = a; \`
			`v.val[1] = b; \`
			`} while (0)`

neon: add INIT_VECTOR3 used to initialize NxMx3 vector types replaces initialization via '{{ }}' gnu-ism. Change-Id: Idad2f278ab104cf2cc650517194258ce3cfb37b4 2014-06-29 22:40:45 +02:00			`#define INIT_VECTOR3(v, a, b, c) do { \`
			`v.val[0] = a; \`
			`v.val[1] = b; \`
			`v.val[2] = c; \`
			`} while (0)`

neon: add INIT_VECTOR4 used to initialize NxMx4 vector types replaces initialization via '{{ }}' gnu-ism. Change-Id: I0da7b3d321f3d48579b7863fb2e4d3f449ae7f5e 2014-06-29 22:40:45 +02:00			`#define INIT_VECTOR4(v, a, b, c, d) do { \`
			`v.val[0] = a; \`
			`v.val[1] = b; \`
			`v.val[2] = c; \`
			`v.val[3] = d; \`
			`} while (0)`

dsp/neon.h: coalesce intrinsics-related defines Change-Id: Ifadd41a5bbf7f99eeb6d75d2b67daa25e0544946 2014-05-03 20:34:07 +02:00			`// if using intrinsics, this flag avoids some functions that make gcc-4.6.3`
			`// crash ("internal compiler error: in immed_double_const, at emit-rtl.").`
			`// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)`
enable NEON intrinsics in aarch64 builds avoids functions that use vtbl? as in iOS builds these are marked unavailable Change-Id: I17aedc3c7dc8f1d5be0941205de0b22c3772ef1b 2014-05-03 05:38:21 +02:00			`#if !(LOCAL_GCC_PREREQ(4,8) \|\| defined(__aarch64__))`
dsp/neon.h: coalesce intrinsics-related defines Change-Id: Ifadd41a5bbf7f99eeb6d75d2b67daa25e0544946 2014-05-03 20:34:07 +02:00			`#define WORK_AROUND_GCC`
			`#endif`

enc_neon: move Transpose4x4 to dsp/neon.h + reuse it in TransformWHT() Change-Id: Idfbd0f9b58d6253ac3d65ba55b58989c427ee989 2014-04-26 21:11:00 +02:00			`static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {`
			`uint64x2x2_t row01, row23;`

			`row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);`
			`row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);`
			`row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);`
			`row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);`
			`// Transpose 64-bit values (there's no vswp equivalent)`
			`{`
			`const uint64x1_t row0h = vget_high_u64(row01.val[0]);`
			`const uint64x1_t row2l = vget_low_u64(row23.val[0]);`
			`const uint64x1_t row1h = vget_high_u64(row01.val[1]);`
			`const uint64x1_t row3l = vget_low_u64(row23.val[1]);`
			`row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);`
			`row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));`
			`row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);`
			`row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));`
			`}`
			`{`
			`const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),`
			`vreinterpretq_s32_u64(row01.val[1]));`
			`const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),`
			`vreinterpretq_s32_u64(row23.val[1]));`
			`int32x4x4_t out;`
			`out.val[0] = out01.val[0];`
			`out.val[1] = out01.val[1];`
			`out.val[2] = out23.val[0];`
			`out.val[3] = out23.val[1];`
			`return out;`
			`}`
			`}`

			`#endif // WEBP_DSP_NEON_H_`