mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-27 06:08:21 +01:00
sse2 version of CollectHistogram()
~3% faster encoding. Patch by Christian Duvivier (cduvivier at google dot com) Change-Id: I8c11d63d0cffb35e145fe0ea74cb66a53f4950d9
This commit is contained in:
parent
c1c728d617
commit
d757523889
@ -20,51 +20,12 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MAX_COEFF_THRESH 64
|
|
||||||
#define MAX_ITERS_K_MEANS 6
|
#define MAX_ITERS_K_MEANS 6
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
|
||||||
// Compute susceptibility based on DCT-coeff histograms:
|
|
||||||
// the higher, the "easier" the macroblock is to compress.
|
|
||||||
|
|
||||||
// Clamps 'alpha' to the [0, 255] range: negative inputs yield 0, inputs above
// 255 yield 255, and anything in between is returned unchanged.
static int ClipAlpha(int alpha) {
|
static int ClipAlpha(int alpha) {
|
||||||
return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
|
return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Turns a histogram of MAX_COEFF_THRESH bins into an alpha score.
// Bins are scanned in increasing order; only non-empty bins contribute.
// Returns 10 * num / den - 5 clamped to [0, 255] by ClipAlpha(), or 0 when
// every bin is empty.
static int GetAlpha(const int histo[MAX_COEFF_THRESH]) {
|
|
||||||
int num = 0, den = 0, val = 0;
|
|
||||||
int k;
|
|
||||||
int alpha;
|
|
||||||
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
|
|
||||||
if (histo[k]) {
|
|
||||||
// 'val' is the running total of all counts seen up to and including bin k.
val += histo[k];
|
|
||||||
num += val * (k + 1);
|
|
||||||
den += (k + 1) * (k + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// we scale the value to a usable [0..255] range
// (den == 0 means no bin was populated; alpha is then 0)
|
|
||||||
alpha = den ? 10 * num / den - 5 : 0;
|
|
||||||
return ClipAlpha(alpha);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Builds a histogram of transform-coefficient magnitudes and returns the
// alpha score derived from it by GetAlpha().
// For every block index j in [start_block, end_block), VP8FTransform() is run
// on 'ref' and 'pred' at offset VP8Scan[j], producing 16 coefficients.
static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
|
||||||
int start_block, int end_block) {
|
|
||||||
int histo[MAX_COEFF_THRESH] = { 0 };
|
|
||||||
int16_t out[16];
|
|
||||||
int j, k;
|
|
||||||
for (j = start_block; j < end_block; ++j) {
|
|
||||||
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
|
|
||||||
for (k = 0; k < 16; ++k) {
|
|
||||||
const int v = abs(out[k]) >> 2;
|
|
||||||
// Coefficients whose shifted magnitude is zero are not counted.
if (v) {
|
|
||||||
// Values above MAX_COEFF_THRESH all land in the last bin.
const int bin = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
|
|
||||||
histo[bin - 1]++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return GetAlpha(histo);
|
|
||||||
}
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
// Smooth the segment map by replacing isolated block by the majority of its
|
// Smooth the segment map by replacing isolated block by the majority of its
|
||||||
// neighbours.
|
// neighbours.
|
||||||
@ -278,7 +239,7 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
|
|||||||
|
|
||||||
VP8MakeLuma16Preds(it);
|
VP8MakeLuma16Preds(it);
|
||||||
for (mode = 0; mode < max_mode; ++mode) {
|
for (mode = 0; mode < max_mode; ++mode) {
|
||||||
const int alpha = CollectHistogram(it->yuv_in_ + Y_OFF,
|
const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF,
|
||||||
it->yuv_p_ + VP8I16ModeOffsets[mode],
|
it->yuv_p_ + VP8I16ModeOffsets[mode],
|
||||||
0, 16);
|
0, 16);
|
||||||
if (alpha > best_alpha) {
|
if (alpha > best_alpha) {
|
||||||
@ -303,7 +264,7 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
|
|||||||
|
|
||||||
VP8MakeIntra4Preds(it);
|
VP8MakeIntra4Preds(it);
|
||||||
for (mode = 0; mode < max_mode; ++mode) {
|
for (mode = 0; mode < max_mode; ++mode) {
|
||||||
const int alpha = CollectHistogram(src,
|
const int alpha = VP8CollectHistogram(src,
|
||||||
it->yuv_p_ + VP8I4ModeOffsets[mode],
|
it->yuv_p_ + VP8I4ModeOffsets[mode],
|
||||||
0, 1);
|
0, 1);
|
||||||
if (alpha > best_mode_alpha) {
|
if (alpha > best_mode_alpha) {
|
||||||
@ -329,7 +290,7 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
|
|||||||
int mode;
|
int mode;
|
||||||
VP8MakeChroma8Preds(it);
|
VP8MakeChroma8Preds(it);
|
||||||
for (mode = 0; mode < max_mode; ++mode) {
|
for (mode = 0; mode < max_mode; ++mode) {
|
||||||
const int alpha = CollectHistogram(it->yuv_in_ + U_OFF,
|
const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF,
|
||||||
it->yuv_p_ + VP8UVModeOffsets[mode],
|
it->yuv_p_ + VP8UVModeOffsets[mode],
|
||||||
16, 16 + 4 + 4);
|
16, 16 + 4 + 4);
|
||||||
if (alpha > best_alpha) {
|
if (alpha > best_alpha) {
|
||||||
|
@ -16,6 +16,48 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// Compute susceptibility based on DCT-coeff histograms:
|
||||||
|
// the higher, the "easier" the macroblock is to compress.
|
||||||
|
|
||||||
|
// Clamps 'alpha' to the [0, 255] range: negative inputs yield 0, inputs above
// 255 yield 255, and anything in between is returned unchanged.
static int ClipAlpha(int alpha) {
|
||||||
|
return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Turns a histogram of MAX_COEFF_THRESH bins into an alpha score.
// Bins are scanned in increasing order; only non-empty bins contribute.
// Returns 10 * num / den - 5 clamped to [0, 255] by ClipAlpha(), or 0 when
// every bin is empty.
static int GetAlpha(const int histo[MAX_COEFF_THRESH]) {
|
||||||
|
int num = 0, den = 0, val = 0;
|
||||||
|
int k;
|
||||||
|
int alpha;
|
||||||
|
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
|
||||||
|
if (histo[k]) {
|
||||||
|
// 'val' is the running total of all counts seen up to and including bin k.
val += histo[k];
|
||||||
|
// Weight the running total by the 1-based bin number.
num += val * (k + 1);
|
||||||
|
den += (k + 1) * (k + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// we scale the value to a usable [0..255] range
// (den == 0 means no bin was populated; alpha is then 0)
|
||||||
|
alpha = den ? 10 * num / den - 5 : 0;
|
||||||
|
return ClipAlpha(alpha);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Builds a histogram of transform-coefficient magnitudes and returns the
// alpha score derived from it by GetAlpha().
//   ref, pred:   base pointers; block j is read at offset VP8Scan[j] in both.
//   start_block, end_block: half-open range [start_block, end_block) of
//                block indices to process.
// Each block goes through VP8FTransform(), which fills 16 coefficients.
static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
||||||
|
int start_block, int end_block) {
|
||||||
|
int histo[MAX_COEFF_THRESH] = { 0 };
|
||||||
|
int16_t out[16];
|
||||||
|
int j, k;
|
||||||
|
for (j = start_block; j < end_block; ++j) {
|
||||||
|
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
|
||||||
|
for (k = 0; k < 16; ++k) {
|
||||||
|
// Magnitude of the coefficient with its two low bits dropped.
const int v = abs(out[k]) >> 2;
|
||||||
|
// Coefficients whose shifted magnitude is zero are not counted.
if (v) {
|
||||||
|
// Values above MAX_COEFF_THRESH all land in the last bin.
const int bin = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
|
||||||
|
// Bins are stored 0-based: value v (1..MAX_COEFF_THRESH) -> histo[v - 1].
histo[bin - 1]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return GetAlpha(histo);
|
||||||
|
}
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
// run-time tables (~4k)
|
// run-time tables (~4k)
|
||||||
|
|
||||||
@ -657,6 +699,7 @@ VP8CPUInfo VP8GetCPUInfo = NULL;
|
|||||||
|
|
||||||
// Speed-critical function pointers. We have to initialize them to the default
|
// Speed-critical function pointers. We have to initialize them to the default
|
||||||
// implementations within VP8EncDspInit().
|
// implementations within VP8EncDspInit().
|
||||||
|
VP8CHisto VP8CollectHistogram;
|
||||||
VP8Idct VP8ITransform;
|
VP8Idct VP8ITransform;
|
||||||
VP8Fdct VP8FTransform;
|
VP8Fdct VP8FTransform;
|
||||||
VP8WHT VP8ITransformWHT;
|
VP8WHT VP8ITransformWHT;
|
||||||
@ -681,6 +724,7 @@ void VP8EncDspInit(void) {
|
|||||||
InitTables();
|
InitTables();
|
||||||
|
|
||||||
// default C implementations
|
// default C implementations
|
||||||
|
VP8CollectHistogram = CollectHistogram;
|
||||||
VP8ITransform = ITransform;
|
VP8ITransform = ITransform;
|
||||||
VP8FTransform = FTransform;
|
VP8FTransform = FTransform;
|
||||||
VP8ITransformWHT = ITransformWHT;
|
VP8ITransformWHT = ITransformWHT;
|
||||||
|
@ -18,6 +18,66 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------------
|
||||||
|
// Compute susceptibility based on DCT-coeff histograms:
|
||||||
|
// the higher, the "easier" the macroblock is to compress.
|
||||||
|
|
||||||
|
// SSE2 variant of the histogram collection; it takes the same arguments and
// returns the same kind of alpha score as the plain-C CollectHistogram().
//   ref, pred:   base pointers; block j is read at offset VP8Scan[j] in both.
//   start_block, end_block: half-open range [start_block, end_block) of
//                block indices to process.
// The 16 coefficients written by VP8FTransform() are converted in place to
// bin indices min(abs(coeff) >> 2, MAX_COEFF_THRESH), eight 16-bit lanes at a
// time, and then tallied by a scalar loop.
// histo[] has MAX_COEFF_THRESH + 1 entries: entry 0 receives the coefficients
// whose shifted magnitude is zero and is ignored when alpha is computed.
// Returns alpha clamped to [0, 255].
static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
|
||||||
|
int start_block, int end_block) {
|
||||||
|
int histo[MAX_COEFF_THRESH + 1] = { 0 };
|
||||||
|
int16_t out[16];
|
||||||
|
int j, k;
|
||||||
|
// MAX_COEFF_THRESH replicated in every 16-bit lane, used as the clamp value.
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
|
||||||
|
for (j = start_block; j < end_block; ++j) {
|
||||||
|
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
|
||||||
|
|
||||||
|
// Convert coefficients to bin (within out[]).
|
||||||
|
{
|
||||||
|
// Load (unaligned loads: out[] is a plain local array).
|
||||||
|
const __m128i out0 = _mm_loadu_si128((__m128i *)&out[0]);
|
||||||
|
const __m128i out1 = _mm_loadu_si128((__m128i *)&out[8]);
|
||||||
|
// sign(out) = out >> 15 (0x0000 if non-negative, 0xffff if negative)
|
||||||
|
const __m128i sign0 = _mm_srai_epi16(out0, 15);
|
||||||
|
const __m128i sign1 = _mm_srai_epi16(out1, 15);
|
||||||
|
// abs(out) = (out ^ sign) - sign
|
||||||
|
const __m128i xor0 = _mm_xor_si128(out0, sign0);
|
||||||
|
const __m128i xor1 = _mm_xor_si128(out1, sign1);
|
||||||
|
const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
|
||||||
|
const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
|
||||||
|
// v = abs(out) >> 2
// NOTE(review): for a coefficient of -32768 the 16-bit abs above wraps back
// to -32768, and the arithmetic shift and signed min below keep it negative,
// so histo[out[k]] would index out of range. Presumably VP8FTransform()
// never produces that value - confirm.
|
||||||
|
const __m128i v0 = _mm_srai_epi16(abs0, 2);
|
||||||
|
const __m128i v1 = _mm_srai_epi16(abs1, 2);
|
||||||
|
// bin = min(v, MAX_COEFF_THRESH)
|
||||||
|
const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
|
||||||
|
const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
|
||||||
|
// Store the bin indices back over the coefficients.
|
||||||
|
_mm_storeu_si128((__m128i *)&out[0], bin0);
|
||||||
|
_mm_storeu_si128((__m128i *)&out[8], bin1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use bin to update histogram.
// Zero bins are counted too (in histo[0]) instead of being branched around.
|
||||||
|
for (k = 0; k < 16; ++k) {
|
||||||
|
histo[out[k]]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Alpha computation, written inline here: same formula as GetAlpha() and
// ClipAlpha() in the C path, but reading histo[k + 1] because entry 0 holds
// the zero bin.
{
|
||||||
|
int num = 0, den = 0, val = 0;
|
||||||
|
int alpha;
|
||||||
|
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
|
||||||
|
if (histo[k + 1]) {
|
||||||
|
// 'val' is the running total of all non-zero-bin counts seen so far.
val += histo[k + 1];
|
||||||
|
num += val * (k + 1);
|
||||||
|
den += (k + 1) * (k + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// we scale the value to a usable [0..255] range
// (den == 0 means no non-zero bin was populated; alpha is then 0)
|
||||||
|
alpha = den ? 10 * num / den - 5 : 0;
|
||||||
|
alpha = alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
|
||||||
|
return alpha;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
// Transforms (Paragraph 14.4)
|
// Transforms (Paragraph 14.4)
|
||||||
|
|
||||||
@ -762,6 +822,7 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
|
|||||||
|
|
||||||
extern void VP8EncDspInitSSE2(void);
|
extern void VP8EncDspInitSSE2(void);
|
||||||
void VP8EncDspInitSSE2(void) {
|
void VP8EncDspInitSSE2(void) {
|
||||||
|
VP8CollectHistogram = CollectHistogramSSE2;
|
||||||
VP8EncQuantizeBlock = QuantizeBlockSSE2;
|
VP8EncQuantizeBlock = QuantizeBlockSSE2;
|
||||||
VP8ITransform = ITransformSSE2;
|
VP8ITransform = ITransformSSE2;
|
||||||
VP8FTransform = FTransformSSE2;
|
VP8FTransform = FTransformSSE2;
|
||||||
|
@ -28,6 +28,9 @@ extern "C" {
|
|||||||
#define ENC_MIN_VERSION 1
|
#define ENC_MIN_VERSION 1
|
||||||
#define ENC_REV_VERSION 2
|
#define ENC_REV_VERSION 2
|
||||||
|
|
||||||
|
// size of histogram used by CollectHistogram.
|
||||||
|
#define MAX_COEFF_THRESH 64
|
||||||
|
|
||||||
// intra prediction modes
|
// intra prediction modes
|
||||||
enum { B_DC_PRED = 0, // 4x4 modes
|
enum { B_DC_PRED = 0, // 4x4 modes
|
||||||
B_TM_PRED = 1,
|
B_TM_PRED = 1,
|
||||||
@ -408,6 +411,11 @@ int VP8EncLoop(VP8Encoder* const enc);
|
|||||||
int VP8StatLoop(VP8Encoder* const enc);
|
int VP8StatLoop(VP8Encoder* const enc);
|
||||||
|
|
||||||
// in analysis.c
|
// in analysis.c
|
||||||
|
// Compute susceptibility based on DCT-coeff histograms:
|
||||||
|
// the higher, the "easier" the macroblock is to compress.
|
||||||
|
// Function-pointer type for the histogram collectors: takes the 'ref' and
// 'pred' byte pointers plus a block range, and returns the int alpha score.
// VP8CollectHistogram is the global pointer of this type that callers invoke.
// NOTE(review): this declaration sits under the "in analysis.c" heading, but
// the pointer appears to be defined next to the other speed-critical function
// pointers initialized by VP8EncDspInit() - confirm and regroup if so.
typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
|
||||||
|
int start_block, int end_block);
|
||||||
|
extern VP8CHisto VP8CollectHistogram;
|
||||||
// Main analysis loop. Decides the segmentations and complexity.
|
// Main analysis loop. Decides the segmentations and complexity.
|
||||||
// Assigns a first guess for Intra16 and uvmode_ prediction modes.
|
// Assigns a first guess for Intra16 and uvmode_ prediction modes.
|
||||||
int VP8EncAnalyze(VP8Encoder* const enc);
|
int VP8EncAnalyze(VP8Encoder* const enc);
|
||||||
|
Loading…
Reference in New Issue
Block a user