sse2 version of CollectHistogram()

~3% faster encoding.

Patch by Christian Duvivier (cduvivier at google dot com)

Change-Id: I8c11d63d0cffb35e145fe0ea74cb66a53f4950d9
This commit is contained in:
Pascal Massimino 2011-04-28 17:00:22 -07:00
parent c1c728d617
commit d757523889
4 changed files with 127 additions and 53 deletions

View File

@ -20,51 +20,12 @@
extern "C" {
#endif
#define MAX_COEFF_THRESH 64
#define MAX_ITERS_K_MEANS 6
//-----------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
static int ClipAlpha(int alpha) {
return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
}
static int GetAlpha(const int histo[MAX_COEFF_THRESH]) {
int num = 0, den = 0, val = 0;
int k;
int alpha;
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
if (histo[k]) {
val += histo[k];
num += val * (k + 1);
den += (k + 1) * (k + 1);
}
}
// we scale the value to a usable [0..255] range
alpha = den ? 10 * num / den - 5 : 0;
return ClipAlpha(alpha);
}
static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block) {
int histo[MAX_COEFF_THRESH] = { 0 };
int16_t out[16];
int j, k;
for (j = start_block; j < end_block; ++j) {
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
for (k = 0; k < 16; ++k) {
const int v = abs(out[k]) >> 2;
if (v) {
const int bin = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
histo[bin - 1]++;
}
}
}
return GetAlpha(histo);
}
//-----------------------------------------------------------------------------
// Smooth the segment map by replacing isolated block by the majority of its
// neighbours.
@ -278,7 +239,7 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
VP8MakeLuma16Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
const int alpha = CollectHistogram(it->yuv_in_ + Y_OFF,
const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF,
it->yuv_p_ + VP8I16ModeOffsets[mode],
0, 16);
if (alpha > best_alpha) {
@ -303,7 +264,7 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
VP8MakeIntra4Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
const int alpha = CollectHistogram(src,
const int alpha = VP8CollectHistogram(src,
it->yuv_p_ + VP8I4ModeOffsets[mode],
0, 1);
if (alpha > best_mode_alpha) {
@ -329,7 +290,7 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
int mode;
VP8MakeChroma8Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
const int alpha = CollectHistogram(it->yuv_in_ + U_OFF,
const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF,
it->yuv_p_ + VP8UVModeOffsets[mode],
16, 16 + 4 + 4);
if (alpha > best_alpha) {

View File

@ -16,6 +16,48 @@
extern "C" {
#endif
//-----------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
static int ClipAlpha(int alpha) {
return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
}
static int GetAlpha(const int histo[MAX_COEFF_THRESH]) {
int num = 0, den = 0, val = 0;
int k;
int alpha;
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
if (histo[k]) {
val += histo[k];
num += val * (k + 1);
den += (k + 1) * (k + 1);
}
}
// we scale the value to a usable [0..255] range
alpha = den ? 10 * num / den - 5 : 0;
return ClipAlpha(alpha);
}
static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block) {
int histo[MAX_COEFF_THRESH] = { 0 };
int16_t out[16];
int j, k;
for (j = start_block; j < end_block; ++j) {
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
for (k = 0; k < 16; ++k) {
const int v = abs(out[k]) >> 2;
if (v) {
const int bin = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
histo[bin - 1]++;
}
}
}
return GetAlpha(histo);
}
//-----------------------------------------------------------------------------
// run-time tables (~4k)
@ -657,6 +699,7 @@ VP8CPUInfo VP8GetCPUInfo = NULL;
// Speed-critical function pointers. We have to initialize them to the default
// implementations within VP8EncDspInit().
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8WHT VP8ITransformWHT;
@ -681,6 +724,7 @@ void VP8EncDspInit(void) {
InitTables();
// default C implementations
VP8CollectHistogram = CollectHistogram;
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8ITransformWHT = ITransformWHT;

View File

@ -18,6 +18,66 @@
extern "C" {
#endif
//-----------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block) {
int histo[MAX_COEFF_THRESH + 1] = { 0 };
int16_t out[16];
int j, k;
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
for (j = start_block; j < end_block; ++j) {
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
// Convert coefficients to bin (within out[]).
{
// Load.
const __m128i out0 = _mm_loadu_si128((__m128i *)&out[0]);
const __m128i out1 = _mm_loadu_si128((__m128i *)&out[8]);
// sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative)
const __m128i sign0 = _mm_srai_epi16(out0, 15);
const __m128i sign1 = _mm_srai_epi16(out1, 15);
// abs(out) = (out ^ sign) - sign
const __m128i xor0 = _mm_xor_si128(out0, sign0);
const __m128i xor1 = _mm_xor_si128(out1, sign1);
const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
// v = abs(out) >> 2
const __m128i v0 = _mm_srai_epi16(abs0, 2);
const __m128i v1 = _mm_srai_epi16(abs1, 2);
// bin = min(v, MAX_COEFF_THRESH)
const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
// Store.
_mm_storeu_si128((__m128i *)&out[0], bin0);
_mm_storeu_si128((__m128i *)&out[8], bin1);
}
// Use bin to update histogram.
for (k = 0; k < 16; ++k) {
histo[out[k]]++;
}
}
{
int num = 0, den = 0, val = 0;
int alpha;
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
if (histo[k + 1]) {
val += histo[k + 1];
num += val * (k + 1);
den += (k + 1) * (k + 1);
}
}
// we scale the value to a usable [0..255] range
alpha = den ? 10 * num / den - 5 : 0;
alpha = alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
return alpha;
}
}
//-----------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
@ -762,6 +822,7 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
extern void VP8EncDspInitSSE2(void);
void VP8EncDspInitSSE2(void) {
VP8CollectHistogram = CollectHistogramSSE2;
VP8EncQuantizeBlock = QuantizeBlockSSE2;
VP8ITransform = ITransformSSE2;
VP8FTransform = FTransformSSE2;

View File

@ -28,6 +28,9 @@ extern "C" {
#define ENC_MIN_VERSION 1
#define ENC_REV_VERSION 2
// size of histogram used by CollectHistogram.
#define MAX_COEFF_THRESH 64
// intra prediction modes
enum { B_DC_PRED = 0, // 4x4 modes
B_TM_PRED = 1,
@ -408,6 +411,11 @@ int VP8EncLoop(VP8Encoder* const enc);
int VP8StatLoop(VP8Encoder* const enc);
// in analysis.c
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block);
extern VP8CHisto VP8CollectHistogram;
// Main analysis loop. Decides the segmentations and complexity.
// Assigns a first guess for Intra16 and uvmode_ prediction modes.
int VP8EncAnalyze(VP8Encoder* const enc);