mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-26 13:48:21 +01:00
sse2 version of CollectHistogram()
~3% faster encoding. Patch by Christian Duvivier (cduvivier at google dot com) Change-Id: I8c11d63d0cffb35e145fe0ea74cb66a53f4950d9
This commit is contained in:
parent
c1c728d617
commit
d757523889
@ -20,51 +20,12 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MAX_COEFF_THRESH 64
|
||||
#define MAX_ITERS_K_MEANS 6
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Compute susceptibility based on DCT-coeff histograms:
|
||||
// the higher, the "easier" the macroblock is to compress.
|
||||
|
||||
static int ClipAlpha(int alpha) {
|
||||
return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
|
||||
}
|
||||
|
||||
static int GetAlpha(const int histo[MAX_COEFF_THRESH]) {
|
||||
int num = 0, den = 0, val = 0;
|
||||
int k;
|
||||
int alpha;
|
||||
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
|
||||
if (histo[k]) {
|
||||
val += histo[k];
|
||||
num += val * (k + 1);
|
||||
den += (k + 1) * (k + 1);
|
||||
}
|
||||
}
|
||||
// we scale the value to a usable [0..255] range
|
||||
alpha = den ? 10 * num / den - 5 : 0;
|
||||
return ClipAlpha(alpha);
|
||||
}
|
||||
|
||||
static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block) {
|
||||
int histo[MAX_COEFF_THRESH] = { 0 };
|
||||
int16_t out[16];
|
||||
int j, k;
|
||||
for (j = start_block; j < end_block; ++j) {
|
||||
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
|
||||
for (k = 0; k < 16; ++k) {
|
||||
const int v = abs(out[k]) >> 2;
|
||||
if (v) {
|
||||
const int bin = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
|
||||
histo[bin - 1]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return GetAlpha(histo);
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Smooth the segment map by replacing isolated block by the majority of its
|
||||
// neighbours.
|
||||
@ -278,7 +239,7 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
|
||||
|
||||
VP8MakeLuma16Preds(it);
|
||||
for (mode = 0; mode < max_mode; ++mode) {
|
||||
const int alpha = CollectHistogram(it->yuv_in_ + Y_OFF,
|
||||
const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF,
|
||||
it->yuv_p_ + VP8I16ModeOffsets[mode],
|
||||
0, 16);
|
||||
if (alpha > best_alpha) {
|
||||
@ -303,7 +264,7 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
|
||||
|
||||
VP8MakeIntra4Preds(it);
|
||||
for (mode = 0; mode < max_mode; ++mode) {
|
||||
const int alpha = CollectHistogram(src,
|
||||
const int alpha = VP8CollectHistogram(src,
|
||||
it->yuv_p_ + VP8I4ModeOffsets[mode],
|
||||
0, 1);
|
||||
if (alpha > best_mode_alpha) {
|
||||
@ -329,7 +290,7 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
|
||||
int mode;
|
||||
VP8MakeChroma8Preds(it);
|
||||
for (mode = 0; mode < max_mode; ++mode) {
|
||||
const int alpha = CollectHistogram(it->yuv_in_ + U_OFF,
|
||||
const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF,
|
||||
it->yuv_p_ + VP8UVModeOffsets[mode],
|
||||
16, 16 + 4 + 4);
|
||||
if (alpha > best_alpha) {
|
||||
|
@ -16,6 +16,48 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Compute susceptibility based on DCT-coeff histograms:
|
||||
// the higher, the "easier" the macroblock is to compress.
|
||||
|
||||
static int ClipAlpha(int alpha) {
|
||||
return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
|
||||
}
|
||||
|
||||
static int GetAlpha(const int histo[MAX_COEFF_THRESH]) {
|
||||
int num = 0, den = 0, val = 0;
|
||||
int k;
|
||||
int alpha;
|
||||
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
|
||||
if (histo[k]) {
|
||||
val += histo[k];
|
||||
num += val * (k + 1);
|
||||
den += (k + 1) * (k + 1);
|
||||
}
|
||||
}
|
||||
// we scale the value to a usable [0..255] range
|
||||
alpha = den ? 10 * num / den - 5 : 0;
|
||||
return ClipAlpha(alpha);
|
||||
}
|
||||
|
||||
static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block) {
|
||||
int histo[MAX_COEFF_THRESH] = { 0 };
|
||||
int16_t out[16];
|
||||
int j, k;
|
||||
for (j = start_block; j < end_block; ++j) {
|
||||
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
|
||||
for (k = 0; k < 16; ++k) {
|
||||
const int v = abs(out[k]) >> 2;
|
||||
if (v) {
|
||||
const int bin = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
|
||||
histo[bin - 1]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return GetAlpha(histo);
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// run-time tables (~4k)
|
||||
|
||||
@ -657,6 +699,7 @@ VP8CPUInfo VP8GetCPUInfo = NULL;
|
||||
|
||||
// Speed-critical function pointers. We have to initialize them to the default
|
||||
// implementations within VP8EncDspInit().
|
||||
VP8CHisto VP8CollectHistogram;
|
||||
VP8Idct VP8ITransform;
|
||||
VP8Fdct VP8FTransform;
|
||||
VP8WHT VP8ITransformWHT;
|
||||
@ -681,6 +724,7 @@ void VP8EncDspInit(void) {
|
||||
InitTables();
|
||||
|
||||
// default C implementations
|
||||
VP8CollectHistogram = CollectHistogram;
|
||||
VP8ITransform = ITransform;
|
||||
VP8FTransform = FTransform;
|
||||
VP8ITransformWHT = ITransformWHT;
|
||||
|
@ -18,6 +18,66 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Compute susceptibility based on DCT-coeff histograms:
|
||||
// the higher, the "easier" the macroblock is to compress.
|
||||
|
||||
static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block) {
|
||||
int histo[MAX_COEFF_THRESH + 1] = { 0 };
|
||||
int16_t out[16];
|
||||
int j, k;
|
||||
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
|
||||
for (j = start_block; j < end_block; ++j) {
|
||||
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
|
||||
|
||||
// Convert coefficients to bin (within out[]).
|
||||
{
|
||||
// Load.
|
||||
const __m128i out0 = _mm_loadu_si128((__m128i *)&out[0]);
|
||||
const __m128i out1 = _mm_loadu_si128((__m128i *)&out[8]);
|
||||
// sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative)
|
||||
const __m128i sign0 = _mm_srai_epi16(out0, 15);
|
||||
const __m128i sign1 = _mm_srai_epi16(out1, 15);
|
||||
// abs(out) = (out ^ sign) - sign
|
||||
const __m128i xor0 = _mm_xor_si128(out0, sign0);
|
||||
const __m128i xor1 = _mm_xor_si128(out1, sign1);
|
||||
const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
|
||||
const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
|
||||
// v = abs(out) >> 2
|
||||
const __m128i v0 = _mm_srai_epi16(abs0, 2);
|
||||
const __m128i v1 = _mm_srai_epi16(abs1, 2);
|
||||
// bin = min(v, MAX_COEFF_THRESH)
|
||||
const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
|
||||
const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
|
||||
// Store.
|
||||
_mm_storeu_si128((__m128i *)&out[0], bin0);
|
||||
_mm_storeu_si128((__m128i *)&out[8], bin1);
|
||||
}
|
||||
|
||||
// Use bin to update histogram.
|
||||
for (k = 0; k < 16; ++k) {
|
||||
histo[out[k]]++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int num = 0, den = 0, val = 0;
|
||||
int alpha;
|
||||
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
|
||||
if (histo[k + 1]) {
|
||||
val += histo[k + 1];
|
||||
num += val * (k + 1);
|
||||
den += (k + 1) * (k + 1);
|
||||
}
|
||||
}
|
||||
// we scale the value to a usable [0..255] range
|
||||
alpha = den ? 10 * num / den - 5 : 0;
|
||||
alpha = alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
|
||||
return alpha;
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Transforms (Paragraph 14.4)
|
||||
|
||||
@ -762,6 +822,7 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
||||
|
||||
extern void VP8EncDspInitSSE2(void);
|
||||
void VP8EncDspInitSSE2(void) {
|
||||
VP8CollectHistogram = CollectHistogramSSE2;
|
||||
VP8EncQuantizeBlock = QuantizeBlockSSE2;
|
||||
VP8ITransform = ITransformSSE2;
|
||||
VP8FTransform = FTransformSSE2;
|
||||
|
@ -28,6 +28,9 @@ extern "C" {
|
||||
#define ENC_MIN_VERSION 1
|
||||
#define ENC_REV_VERSION 2
|
||||
|
||||
// size of histogram used by CollectHistogram.
|
||||
#define MAX_COEFF_THRESH 64
|
||||
|
||||
// intra prediction modes
|
||||
enum { B_DC_PRED = 0, // 4x4 modes
|
||||
B_TM_PRED = 1,
|
||||
@ -408,6 +411,11 @@ int VP8EncLoop(VP8Encoder* const enc);
|
||||
int VP8StatLoop(VP8Encoder* const enc);
|
||||
|
||||
// in analysis.c
|
||||
// Compute susceptibility based on DCT-coeff histograms:
|
||||
// the higher, the "easier" the macroblock is to compress.
|
||||
typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block);
|
||||
extern VP8CHisto VP8CollectHistogram;
|
||||
// Main analysis loop. Decides the segmentations and complexity.
|
||||
// Assigns a first guess for Intra16 and uvmode_ prediction modes.
|
||||
int VP8EncAnalyze(VP8Encoder* const enc);
|
||||
|
Loading…
Reference in New Issue
Block a user