From c16cd99abab37042aa1bc89e10f50aa4f86f348c Mon Sep 17 00:00:00 2001 From: Vikas Arora Date: Fri, 21 Feb 2014 11:41:38 -0800 Subject: [PATCH] Speed up lossless encoder. Speedup lossless encoder by 20-25% by optimizing: - GetBestColorTransformForTile: Use techniques like binary search and local minima search to reduce the search space. - VP8LFastSLog2Slow & VP8LFastLog2Slow: Adding the correction factor for log(1 + x) and increase the threshold for calling the approximate version of log_2 (compared to costly call to log()). Change-Id: Ia2444c914521ac298492aafa458e617028fc2f9d --- src/dsp/lossless.c | 299 ++++++++++++++++++++++++++++++--------------- src/dsp/lossless.h | 2 +- src/enc/vp8l.c | 3 +- 3 files changed, 205 insertions(+), 99 deletions(-) diff --git a/src/dsp/lossless.c b/src/dsp/lossless.c index 0baacb09..d26c8093 100644 --- a/src/dsp/lossless.c +++ b/src/dsp/lossless.c @@ -28,8 +28,6 @@ #define MAX_DIFF_COST (1e30f) // lookup table for small values of log2(int) -#define APPROX_LOG_MAX 4096 -#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086 const float kLog2Table[LOG_LOOKUP_IDX_MAX] = { 0.0000000000000000f, 0.0000000000000000f, 1.0000000000000000f, 1.5849625007211560f, @@ -331,16 +329,34 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = { 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126 }; +// The threshold till approximate version of log_2 can be used. +// Practically, we can get rid of the call to log() as the two values match to +// very high degree (the ratio of these two is 0.99999x). +// Keeping a high threshold for now. +#define APPROX_LOG_WITH_CORRECTION_MAX 65536 +#define APPROX_LOG_MAX 4096 +#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086 float VP8LFastSLog2Slow(int v) { assert(v >= LOG_LOOKUP_IDX_MAX); - if (v < APPROX_LOG_MAX) { + if (v < APPROX_LOG_WITH_CORRECTION_MAX) { int log_cnt = 0; + int y = 1; + int correction = 0; const float v_f = (float)v; - while (v >= LOG_LOOKUP_IDX_MAX) { + const int orig_v = v; + do { ++log_cnt; v = v >> 1; - } - return v_f * (kLog2Table[v] + log_cnt); + y = y << 1; + } while (v >= LOG_LOOKUP_IDX_MAX); + // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256 + // Xf = floor(Xf) * (1 + (v % y) / v) + // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v) + // The correction factor: log(1 + d) ~ d; for very small d values, so + // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v + // LOG_2_RECIPROCAL ~ 23/16 + correction = (23 * (orig_v % y)) >> 4; + return v_f * (kLog2Table[v] + log_cnt) + correction; } else { return (float)(LOG_2_RECIPROCAL * v * log((double)v)); } @@ -348,13 +364,24 @@ float VP8LFastSLog2Slow(int v) { float VP8LFastLog2Slow(int v) { assert(v >= LOG_LOOKUP_IDX_MAX); - if (v < APPROX_LOG_MAX) { + if (v < APPROX_LOG_WITH_CORRECTION_MAX) { int log_cnt = 0; - while (v >= LOG_LOOKUP_IDX_MAX) { + int y = 1; + const int orig_v = v; + double log_2; + do { ++log_cnt; v = v >> 1; + y = y << 1; + } while (v >= LOG_LOOKUP_IDX_MAX); + log_2 = kLog2Table[v] + log_cnt; + if (orig_v >= APPROX_LOG_MAX) { + // Since the division is still expensive, add this correction factor only + // for large values of 'v'. + const int correction = (23 * (orig_v % y)) >> 4; + log_2 += (double)correction / orig_v; } - return kLog2Table[v] + log_cnt; + return (float)log_2; } else { return (float)(LOG_2_RECIPROCAL * log((double)v)); } @@ -881,99 +908,180 @@ static float PredictionCostCrossColor(const int accumulated[256], PredictionCostSpatial(counts, 3, kExpValue); } -static Multipliers GetBestColorTransformForTile( - int tile_x, int tile_y, int bits, - Multipliers prevX, - Multipliers prevY, - int step, int xsize, int ysize, - int* accumulated_red_histo, - int* accumulated_blue_histo, +static float GetPredictionCostCrossColorRed( + int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max, + int xsize, Multipliers prev_x, Multipliers prev_y, int green_to_red, + const int* const accumulated_red_histo, const uint32_t* const argb) { + int all_y; + int histo[256] = { 0 }; + float cur_diff; + for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) { + int ix = all_y * xsize + tile_x_offset; + int all_x; + for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) { + if (SkipRepeatedPixels(argb, ix, xsize)) { + continue; + } + ++histo[TransformColorRed(green_to_red, argb[ix])]; // red. + } + } + cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]); + if ((uint8_t)green_to_red == prev_x.green_to_red_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if ((uint8_t)green_to_red == prev_y.green_to_red_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if (green_to_red == 0) { + cur_diff -= 3; + } + return cur_diff; +} + +static void GetBestGreenToRed( + int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max, + int xsize, Multipliers prev_x, Multipliers prev_y, + const int* const accumulated_red_histo, const uint32_t* const argb, + Multipliers* best_tx) { + int min_green_to_red = -64; + int max_green_to_red = 64; + int green_to_red = 0; + int eval_min = 1; + int eval_max = 1; + float cur_diff_min = MAX_DIFF_COST; + float cur_diff_max = MAX_DIFF_COST; + // Do a binary search to find the optimal green_to_red color transform. + while (max_green_to_red - min_green_to_red > 2) { + if (eval_min) { + cur_diff_min = GetPredictionCostCrossColorRed( + tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize, + prev_x, prev_y, min_green_to_red, &accumulated_red_histo[0], argb); + eval_min = 0; + } + if (eval_max) { + cur_diff_max = GetPredictionCostCrossColorRed( + tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize, + prev_x, prev_y, max_green_to_red, &accumulated_red_histo[0], argb); + eval_max = 0; + } + if (cur_diff_min < cur_diff_max) { + green_to_red = min_green_to_red; + max_green_to_red = (max_green_to_red + min_green_to_red) / 2; + eval_max = 1; + } else { + green_to_red = max_green_to_red; + min_green_to_red = (max_green_to_red + min_green_to_red) / 2; + eval_min = 1; + } + } + best_tx->green_to_red_ = green_to_red; +} + +static float GetPredictionCostCrossColorBlue( + int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max, + int xsize, Multipliers prev_x, Multipliers prev_y, int green_to_blue, + int red_to_blue, const int* const accumulated_blue_histo, const uint32_t* const argb) { + int all_y; + int histo[256] = { 0 }; + float cur_diff; + for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) { + int all_x; + int ix = all_y * xsize + tile_x_offset; + for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) { + if (SkipRepeatedPixels(argb, ix, xsize)) { + continue; + } + ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])]; + } + } + cur_diff = PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]); + if ((uint8_t)green_to_blue == prev_x.green_to_blue_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if ((uint8_t)green_to_blue == prev_y.green_to_blue_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if ((uint8_t)red_to_blue == prev_x.red_to_blue_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if ((uint8_t)red_to_blue == prev_y.red_to_blue_) { + cur_diff -= 3; // favor keeping the areas locally similar + } + if (green_to_blue == 0) { + cur_diff -= 3; + } + if (red_to_blue == 0) { + cur_diff -= 3; + } + return cur_diff; +} + +static void GetBestGreenRedToBlue( + int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max, + int xsize, Multipliers prev_x, Multipliers prev_y, int quality, + const int* const accumulated_blue_histo, const uint32_t* const argb, + Multipliers* best_tx) { float best_diff = MAX_DIFF_COST; float cur_diff; - const int halfstep = step / 2; + const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16; + const int min_green_to_blue = -32; + const int max_green_to_blue = 32; + const int min_red_to_blue = -16; + const int max_red_to_blue = 16; + const int num_iters = + (1 + (max_green_to_blue - min_green_to_blue) / step) * + (1 + (max_red_to_blue - min_red_to_blue) / step); + // Number of tries to get optimal green_to_blue & red_to_blue color transforms + // after finding a local minima. + const int max_tries_after_min = 4 + (num_iters >> 2); + int num_tries_after_min = 0; + int green_to_blue; + for (green_to_blue = min_green_to_blue; + green_to_blue <= max_green_to_blue && + num_tries_after_min < max_tries_after_min; + green_to_blue += step) { + int red_to_blue; + for (red_to_blue = min_red_to_blue; + red_to_blue <= max_red_to_blue && + num_tries_after_min < max_tries_after_min; + red_to_blue += step) { + cur_diff = GetPredictionCostCrossColorBlue( + tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize, prev_x, + prev_y, green_to_blue, red_to_blue, &accumulated_blue_histo[0], argb); + if (cur_diff < best_diff) { + best_diff = cur_diff; + best_tx->green_to_blue_ = green_to_blue; + best_tx->red_to_blue_ = red_to_blue; + num_tries_after_min = 0; + } else { + ++num_tries_after_min; + } + } + } +} + +static Multipliers GetBestColorTransformForTile( + int tile_x, int tile_y, int bits, + Multipliers prev_x, + Multipliers prev_y, + int quality, int xsize, int ysize, + const int* const accumulated_red_histo, + const int* const accumulated_blue_histo, + const uint32_t* const argb) { const int max_tile_size = 1 << bits; const int tile_y_offset = tile_y * max_tile_size; const int tile_x_offset = tile_x * max_tile_size; - int green_to_red; - int green_to_blue; - int red_to_blue; const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize); const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize); Multipliers best_tx; MultipliersClear(&best_tx); - for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) { - int histo[256] = { 0 }; - int all_y; - - for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) { - int ix = all_y * xsize + tile_x_offset; - int all_x; - for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) { - if (SkipRepeatedPixels(argb, ix, xsize)) { - continue; - } - ++histo[TransformColorRed(green_to_red, argb[ix])]; // red. - } - } - cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]); - if ((uint8_t)green_to_red == prevX.green_to_red_) { - cur_diff -= 3; // favor keeping the areas locally similar - } - if ((uint8_t)green_to_red == prevY.green_to_red_) { - cur_diff -= 3; // favor keeping the areas locally similar - } - if (green_to_red == 0) { - cur_diff -= 3; - } - if (cur_diff < best_diff) { - best_diff = cur_diff; - best_tx.green_to_red_ = green_to_red; - } - } - best_diff = MAX_DIFF_COST; - for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) { - for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) { - int all_y; - int histo[256] = { 0 }; - for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) { - int all_x; - int ix = all_y * xsize + tile_x_offset; - for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) { - if (SkipRepeatedPixels(argb, ix, xsize)) { - continue; - } - ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])]; - } - } - cur_diff = - PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]); - if ((uint8_t)green_to_blue == prevX.green_to_blue_) { - cur_diff -= 3; // favor keeping the areas locally similar - } - if ((uint8_t)green_to_blue == prevY.green_to_blue_) { - cur_diff -= 3; // favor keeping the areas locally similar - } - if ((uint8_t)red_to_blue == prevX.red_to_blue_) { - cur_diff -= 3; // favor keeping the areas locally similar - } - if ((uint8_t)red_to_blue == prevY.red_to_blue_) { - cur_diff -= 3; // favor keeping the areas locally similar - } - if (green_to_blue == 0) { - cur_diff -= 3; - } - if (red_to_blue == 0) { - cur_diff -= 3; - } - if (cur_diff < best_diff) { - best_diff = cur_diff; - best_tx.green_to_blue_ = green_to_blue; - best_tx.red_to_blue_ = red_to_blue; - } - } - } + GetBestGreenToRed(tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize, + prev_x, prev_y, &accumulated_red_histo[0], argb, &best_tx); + GetBestGreenRedToBlue(tile_x_offset, tile_y_offset, all_x_max, all_y_max, + xsize, prev_x, prev_y, quality, + &accumulated_blue_histo[0], argb, &best_tx); return best_tx; } @@ -994,7 +1102,7 @@ static void CopyTileWithColorTransform(int xsize, int ysize, } } -void VP8LColorSpaceTransform(int width, int height, int bits, int step, +void VP8LColorSpaceTransform(int width, int height, int bits, int quality, uint32_t* const argb, uint32_t* image) { const int max_tile_size = 1 << bits; const int tile_xsize = VP8LSubSampleSize(width, bits); @@ -1018,14 +1126,13 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int step, } prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits, prev_x, prev_y, - step, width, height, + quality, width, height, &accumulated_red_histo[0], &accumulated_blue_histo[0], argb); image[offset] = MultipliersToColorCode(&prev_x); - CopyTileWithColorTransform(width, height, - tile_x_offset, tile_y_offset, max_tile_size, - prev_x, argb); + CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset, + max_tile_size, prev_x, argb); // Gather accumulated histogram data. for (y = tile_y_offset; y < all_y_max; ++y) { diff --git a/src/dsp/lossless.h b/src/dsp/lossless.h index 0f1d4420..776487ce 100644 --- a/src/dsp/lossless.h +++ b/src/dsp/lossless.h @@ -66,7 +66,7 @@ void VP8LResidualImage(int width, int height, int bits, uint32_t* const argb, uint32_t* const argb_scratch, uint32_t* const image); -void VP8LColorSpaceTransform(int width, int height, int bits, int step, +void VP8LColorSpaceTransform(int width, int height, int bits, int quality, uint32_t* const argb, uint32_t* image); //------------------------------------------------------------------------------ diff --git a/src/enc/vp8l.c b/src/enc/vp8l.c index 07caafa0..81bcd300 100644 --- a/src/enc/vp8l.c +++ b/src/enc/vp8l.c @@ -695,9 +695,8 @@ static int ApplyCrossColorFilter(const VP8LEncoder* const enc, const int ccolor_transform_bits = enc->transform_bits_; const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits); const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits); - const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16; - VP8LColorSpaceTransform(width, height, ccolor_transform_bits, step, + VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality, enc->argb_, enc->transform_data_); VP8LWriteBits(bw, 1, TRANSFORM_PRESENT); VP8LWriteBits(bw, 2, CROSS_COLOR_TRANSFORM);