Files
libwebp/src/dsp/enc.c
James Zern b7971d0e22 dsp: avoid defining _C functions w/NEON builds
when targeting NEON C functions with NEON equivalents won't be used, but
will contribute to binary size. the same goes for sse2, etc., but this
change is primarily concerned with binary sizes for android arm targets.

note '-noasm' or otherwise modifying VP8GetCPUInfo will have no effect
on the use of NEON functions.

this decision can be overridden by defining WEBP_DSP_OMIT_C_CODE to 0.

Change-Id: I47bd453c84a3d341ca39bc986a39eb9c785aface
2017-10-27 10:54:56 -07:00

844 lines
24 KiB
C

// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical encoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h> // for abs()
#include "src/dsp/dsp.h"
#include "src/enc/vp8i_enc.h"
static WEBP_INLINE uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
}
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int clip_max(int v, int max) {
return (v > max) ? max : v;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
const int VP8DspScan[16 + 4 + 4] = {
// Luma
0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
};
// general-purpose util function
void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
VP8Histogram* const histo) {
int max_value = 0, last_non_zero = 1;
int k;
for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
const int value = distribution[k];
if (value > 0) {
if (value > max_value) max_value = value;
last_non_zero = k;
}
}
histo->max_value = max_value;
histo->last_non_zero = last_non_zero;
}
#if !WEBP_NEON_OMIT_C_CODE
static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int k;
int16_t out[16];
VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
// Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
const int v = abs(out[k]) >> 3;
const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
++distribution[clipped_value];
}
}
VP8SetHistogramData(distribution, histo);
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// run-time tables (~4k)
static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
// We declare this variable 'volatile' to prevent instruction reordering
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
if (!tables_ok) {
int i;
for (i = -255; i <= 255 + 255; ++i) {
clip1[255 + i] = clip_8b(i);
}
tables_ok = 1;
}
}
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
#if !WEBP_NEON_OMIT_C_CODE
#define STORE(x, y, v) \
dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
#define MUL(a, b) (((a) * (b)) >> 16)
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
for (i = 0; i < 4; ++i) { // vertical pass
const int a = in[0] + in[8];
const int b = in[0] - in[8];
const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
tmp[0] = a + d;
tmp[1] = b + c;
tmp[2] = b - c;
tmp[3] = a - d;
tmp += 4;
in++;
}
tmp = C;
for (i = 0; i < 4; ++i) { // horizontal pass
const int dc = tmp[0] + 4;
const int a = dc + tmp[8];
const int b = dc - tmp[8];
const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
STORE(0, i, a + d);
STORE(1, i, b + c);
STORE(2, i, b - c);
STORE(3, i, a - d);
tmp++;
}
}
static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
int i;
int tmp[16];
for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255])
const int d1 = src[1] - ref[1];
const int d2 = src[2] - ref[2];
const int d3 = src[3] - ref[3];
const int a0 = (d0 + d3); // 10b [-510,510]
const int a1 = (d1 + d2);
const int a2 = (d1 - d2);
const int a3 = (d0 - d3);
tmp[0 + i * 4] = (a0 + a1) * 8; // 14b [-8160,8160]
tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542]
tmp[2 + i * 4] = (a0 - a1) * 8;
tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
}
for (i = 0; i < 4; ++i) {
const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b
const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
const int a3 = (tmp[0 + i] - tmp[12 + i]);
out[0 + i] = (a0 + a1 + 7) >> 4; // 12b
out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
out[8 + i] = (a0 - a1 + 7) >> 4;
out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
VP8FTransform(src, ref, out);
VP8FTransform(src + 4, ref + 4, out + 16);
}
#if !WEBP_NEON_OMIT_C_CODE
static void FTransformWHT_C(const int16_t* in, int16_t* out) {
// input is 12b signed
int32_t tmp[16];
int i;
for (i = 0; i < 4; ++i, in += 64) {
const int a0 = (in[0 * 16] + in[2 * 16]); // 13b
const int a1 = (in[1 * 16] + in[3 * 16]);
const int a2 = (in[1 * 16] - in[3 * 16]);
const int a3 = (in[0 * 16] - in[2 * 16]);
tmp[0 + i * 4] = a0 + a1; // 14b
tmp[1 + i * 4] = a3 + a2;
tmp[2 + i * 4] = a3 - a2;
tmp[3 + i * 4] = a0 - a1;
}
for (i = 0; i < 4; ++i) {
const int a0 = (tmp[0 + i] + tmp[8 + i]); // 15b
const int a1 = (tmp[4 + i] + tmp[12+ i]);
const int a2 = (tmp[4 + i] - tmp[12+ i]);
const int a3 = (tmp[0 + i] - tmp[8 + i]);
const int b0 = a0 + a1; // 16b
const int b1 = a3 + a2;
const int b2 = a3 - a2;
const int b3 = a0 - a1;
out[ 0 + i] = b0 >> 1; // 15b
out[ 4 + i] = b1 >> 1;
out[ 8 + i] = b2 >> 1;
out[12 + i] = b3 >> 1;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MUL
#undef STORE
//------------------------------------------------------------------------------
// Intra predictions
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
int j;
for (j = 0; j < size; ++j) {
memset(dst + j * BPS, value, size);
}
}
static WEBP_INLINE void VerticalPred(uint8_t* dst,
const uint8_t* top, int size) {
int j;
if (top != NULL) {
for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
} else {
Fill(dst, 127, size);
}
}
static WEBP_INLINE void HorizontalPred(uint8_t* dst,
const uint8_t* left, int size) {
if (left != NULL) {
int j;
for (j = 0; j < size; ++j) {
memset(dst + j * BPS, left[j], size);
}
} else {
Fill(dst, 129, size);
}
}
static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
int y;
if (left != NULL) {
if (top != NULL) {
const uint8_t* const clip = clip1 + 255 - left[-1];
for (y = 0; y < size; ++y) {
const uint8_t* const clip_table = clip + left[y];
int x;
for (x = 0; x < size; ++x) {
dst[x] = clip_table[top[x]];
}
dst += BPS;
}
} else {
HorizontalPred(dst, left, size);
}
} else {
// true motion without left samples (hence: with default 129 value)
// is equivalent to VE prediction where you just copy the top samples.
// Note that if top samples are not available, the default value is
// then 129, and not 127 as in the VerticalPred case.
if (top != NULL) {
VerticalPred(dst, top, size);
} else {
Fill(dst, 129, size);
}
}
}
static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
const uint8_t* top,
int size, int round, int shift) {
int DC = 0;
int j;
if (top != NULL) {
for (j = 0; j < size; ++j) DC += top[j];
if (left != NULL) { // top and left present
for (j = 0; j < size; ++j) DC += left[j];
} else { // top, but no left
DC += DC;
}
DC = (DC + round) >> shift;
} else if (left != NULL) { // left but no top
for (j = 0; j < size; ++j) DC += left[j];
DC += DC;
DC = (DC + round) >> shift;
} else { // no top, no left, nothing.
DC = 0x80;
}
Fill(dst, DC, size);
}
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
TrueMotion(C8TM8 + dst, left, top, 8);
// V block
dst += 8;
if (top != NULL) top += 8;
if (left != NULL) left += 16;
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
TrueMotion(C8TM8 + dst, left, top, 8);
}
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds_C(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode(I16DC16 + dst, left, top, 16, 16, 5);
VerticalPred(I16VE16 + dst, top, 16);
HorizontalPred(I16HE16 + dst, left, 16);
TrueMotion(I16TM16 + dst, left, top, 16);
}
//------------------------------------------------------------------------------
// luma 4x4 prediction
#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
const uint8_t vals[4] = {
AVG3(top[-1], top[0], top[1]),
AVG3(top[ 0], top[1], top[2]),
AVG3(top[ 1], top[2], top[3]),
AVG3(top[ 2], top[3], top[4])
};
int i;
for (i = 0; i < 4; ++i) {
memcpy(dst + i * BPS, vals, 4);
}
}
static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static void DC4(uint8_t* dst, const uint8_t* top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
Fill(dst, dc >> 3, 4);
}
static void RD4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
DST(0, 3) = AVG3(J, K, L);
DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
DST(3, 0) = AVG3(D, C, B);
}
static void LD4(uint8_t* dst, const uint8_t* top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
const int E = top[4];
const int F = top[5];
const int G = top[6];
const int H = top[7];
DST(0, 0) = AVG3(A, B, C);
DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
DST(3, 3) = AVG3(G, H, H);
}
static void VR4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
DST(0, 0) = DST(1, 2) = AVG2(X, A);
DST(1, 0) = DST(2, 2) = AVG2(A, B);
DST(2, 0) = DST(3, 2) = AVG2(B, C);
DST(3, 0) = AVG2(C, D);
DST(0, 3) = AVG3(K, J, I);
DST(0, 2) = AVG3(J, I, X);
DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
DST(3, 1) = AVG3(B, C, D);
}
static void VL4(uint8_t* dst, const uint8_t* top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
const int E = top[4];
const int F = top[5];
const int G = top[6];
const int H = top[7];
DST(0, 0) = AVG2(A, B);
DST(1, 0) = DST(0, 2) = AVG2(B, C);
DST(2, 0) = DST(1, 2) = AVG2(C, D);
DST(3, 0) = DST(2, 2) = AVG2(D, E);
DST(0, 1) = AVG3(A, B, C);
DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
DST(3, 2) = AVG3(E, F, G);
DST(3, 3) = AVG3(F, G, H);
}
static void HU4(uint8_t* dst, const uint8_t* top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
DST(0, 0) = AVG2(I, J);
DST(2, 0) = DST(0, 1) = AVG2(J, K);
DST(2, 1) = DST(0, 2) = AVG2(K, L);
DST(1, 0) = AVG3(I, J, K);
DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
DST(3, 2) = DST(2, 2) =
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static void HD4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
const int A = top[0];
const int B = top[1];
const int C = top[2];
DST(0, 0) = DST(2, 1) = AVG2(I, X);
DST(0, 1) = DST(2, 2) = AVG2(J, I);
DST(0, 2) = DST(2, 3) = AVG2(K, J);
DST(0, 3) = AVG2(L, K);
DST(3, 0) = AVG3(A, B, C);
DST(2, 0) = AVG3(X, A, B);
DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
DST(1, 3) = AVG3(L, K, J);
}
static void TM4(uint8_t* dst, const uint8_t* top) {
int x, y;
const uint8_t* const clip = clip1 + 255 - top[-1];
for (y = 0; y < 4; ++y) {
const uint8_t* const clip_table = clip + top[-2 - y];
for (x = 0; x < 4; ++x) {
dst[x] = clip_table[top[x]];
}
dst += BPS;
}
}
#undef DST
#undef AVG3
#undef AVG2
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
HE4(I4HE4 + dst, top);
RD4(I4RD4 + dst, top);
VR4(I4VR4 + dst, top);
LD4(I4LD4 + dst, top);
VL4(I4VL4 + dst, top);
HD4(I4HD4 + dst, top);
HU4(I4HU4 + dst, top);
}
//------------------------------------------------------------------------------
// Metric
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
int w, int h) {
int count = 0;
int y, x;
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
const int diff = (int)a[x] - b[x];
count += diff * diff;
}
a += BPS;
b += BPS;
}
return count;
}
static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 16);
}
static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 8);
}
static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 8, 8);
}
static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 4, 4);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
int k, x, y;
for (k = 0; k < 4; ++k) {
uint32_t avg = 0;
for (y = 0; y < 4; ++y) {
for (x = 0; x < 4; ++x) {
avg += ref[x + y * BPS];
}
}
dc[k] = avg;
ref += 4; // go to next 4x4 block.
}
}
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
#if !WEBP_NEON_OMIT_C_CODE
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* in, const uint16_t* w) {
int sum = 0;
int tmp[16];
int i;
// horizontal pass
for (i = 0; i < 4; ++i, in += BPS) {
const int a0 = in[0] + in[2];
const int a1 = in[1] + in[3];
const int a2 = in[1] - in[3];
const int a3 = in[0] - in[2];
tmp[0 + i * 4] = a0 + a1;
tmp[1 + i * 4] = a3 + a2;
tmp[2 + i * 4] = a3 - a2;
tmp[3 + i * 4] = a0 - a1;
}
// vertical pass
for (i = 0; i < 4; ++i, ++w) {
const int a0 = tmp[0 + i] + tmp[8 + i];
const int a1 = tmp[4 + i] + tmp[12+ i];
const int a2 = tmp[4 + i] - tmp[12+ i];
const int a3 = tmp[0 + i] - tmp[8 + i];
const int b0 = a0 + a1;
const int b1 = a3 + a2;
const int b2 = a3 - a2;
const int b3 = a0 - a1;
sum += w[ 0] * abs(b0);
sum += w[ 4] * abs(b1);
sum += w[ 8] * abs(b2);
sum += w[12] * abs(b3);
}
return sum;
}
static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
return abs(sum2 - sum1) >> 5;
}
static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_C(a + x + y, b + x + y, w);
}
}
return D;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Quantization
//
static const uint8_t kZigzag[16] = {
0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
// Simple quantization
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int last = -1;
int n;
for (n = 0; n < 16; ++n) {
const int j = kZigzag[n];
const int sign = (in[j] < 0);
const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
if (coeff > mtx->zthresh_[j]) {
const uint32_t Q = mtx->q_[j];
const uint32_t iQ = mtx->iq_[j];
const uint32_t B = mtx->bias_[j];
int level = QUANTDIV(coeff, iQ, B);
if (level > MAX_LEVEL) level = MAX_LEVEL;
if (sign) level = -level;
in[j] = level * (int)Q;
out[n] = level;
if (level) last = n;
} else {
out[n] = 0;
in[j] = 0;
}
}
return (last >= 0);
}
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Block copy
static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
int y;
for (y = 0; y < h; ++y) {
memcpy(dst, src, w);
src += BPS;
dst += BPS;
}
}
static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 4, 4);
}
static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 16, 8);
}
//------------------------------------------------------------------------------
// Initialization
// Speed-critical function pointers. We have to initialize them to the default
// implementations within VP8EncDspInit().
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8Fdct VP8FTransform2;
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;
VP8Metric VP8SSE16x16;
VP8Metric VP8SSE8x8;
VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
VP8MeanMetric VP8Mean16x4;
VP8QuantizeBlock VP8EncQuantizeBlock;
VP8Quantize2Blocks VP8EncQuantize2Blocks;
VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
VP8BlockCopy VP8Copy4x4;
VP8BlockCopy VP8Copy16x8;
extern void VP8EncDspInitSSE2(void);
extern void VP8EncDspInitSSE41(void);
extern void VP8EncDspInitAVX2(void);
extern void VP8EncDspInitNEON(void);
extern void VP8EncDspInitMIPS32(void);
extern void VP8EncDspInitMIPSdspR2(void);
extern void VP8EncDspInitMSA(void);
static volatile VP8CPUInfo enc_last_cpuinfo_used =
(VP8CPUInfo)&enc_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8DspInit(); // common inverse transforms
InitTables();
// default C implementations
#if !WEBP_NEON_OMIT_C_CODE
VP8ITransform = ITransform_C;
VP8FTransform = FTransform_C;
VP8FTransformWHT = FTransformWHT_C;
VP8TDisto4x4 = Disto4x4_C;
VP8TDisto16x16 = Disto16x16_C;
VP8CollectHistogram = CollectHistogram_C;
VP8SSE16x16 = SSE16x16_C;
VP8SSE16x8 = SSE16x8_C;
VP8SSE8x8 = SSE8x8_C;
VP8SSE4x4 = SSE4x4_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
VP8EncQuantizeBlock = QuantizeBlock_C;
VP8EncQuantize2Blocks = Quantize2Blocks_C;
#endif
VP8FTransform2 = FTransform2_C;
VP8EncPredLuma4 = Intra4Preds_C;
VP8EncPredLuma16 = Intra16Preds_C;
VP8EncPredChroma8 = IntraChromaPreds_C;
VP8Mean16x4 = Mean16x4_C;
VP8EncQuantizeBlockWHT = QuantizeBlock_C;
VP8Copy4x4 = Copy4x4_C;
VP8Copy16x8 = Copy16x8_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspInitSSE2();
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8EncDspInitSSE41();
}
#endif
}
#endif
#if defined(WEBP_USE_AVX2)
if (VP8GetCPUInfo(kAVX2)) {
VP8EncDspInitAVX2();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8EncDspInitMIPS32();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8EncDspInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_MSA)
if (VP8GetCPUInfo(kMSA)) {
VP8EncDspInitMSA();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8EncDspInitNEON();
}
#endif
assert(VP8ITransform != NULL);
assert(VP8FTransform != NULL);
assert(VP8FTransformWHT != NULL);
assert(VP8TDisto4x4 != NULL);
assert(VP8TDisto16x16 != NULL);
assert(VP8CollectHistogram != NULL);
assert(VP8SSE16x16 != NULL);
assert(VP8SSE16x8 != NULL);
assert(VP8SSE8x8 != NULL);
assert(VP8SSE4x4 != NULL);
assert(VP8EncQuantizeBlock != NULL);
assert(VP8EncQuantize2Blocks != NULL);
assert(VP8FTransform2 != NULL);
assert(VP8EncPredLuma4 != NULL);
assert(VP8EncPredLuma16 != NULL);
assert(VP8EncPredChroma8 != NULL);
assert(VP8Mean16x4 != NULL);
assert(VP8EncQuantizeBlockWHT != NULL);
assert(VP8Copy4x4 != NULL);
assert(VP8Copy16x8 != NULL);
enc_last_cpuinfo_used = VP8GetCPUInfo;
}