factorize BPS definition in dsp.h and add VP8Copy16x8

Change-Id: Id73a1e968c96455808755df4d131d74e3e2e135d
This commit is contained in:
Pascal Massimino 2014-12-04 13:45:14 +01:00
parent 57606047ec
commit 66ad372500
5 changed files with 19 additions and 14 deletions

View File

@ -69,7 +69,7 @@ enum { MB_FEATURE_TREE_PROBS = 3,
NUM_PROBAS = 11,
NUM_MV_PROBAS = 19 };
// YUV-cache parameters.
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
// Constraints are: We need to store one 16x16 block of luma samples (y),
// and two 8x8 chroma blocks (u/v). These should preferably be 16-byte aligned,
// in order to be SIMD-friendly. We also need to store the top, left and
@ -91,8 +91,6 @@ enum { MB_FEATURE_TREE_PROBS = 3,
// 'y' = y-samples 'u' = u-samples 'v' = v-samples
// '|' = left sample, '-' = top sample, '+' = top-left sample
// 't' = extra top-right sample for 4x4 modes
// With this layout, BPS (=Bytes Per Scan-line) is one cacheline size.
#define BPS 32 // this is the common stride used by yuv[]
#define YUV_SIZE (BPS * 17 + BPS * 9)
#define Y_SIZE (BPS * 17)
#define Y_OFF (BPS * 1 + 8)

View File

@ -24,6 +24,8 @@
extern "C" {
#endif
#define BPS 32 // this is the common stride for enc/dec
//------------------------------------------------------------------------------
// CPU detection
@ -132,6 +134,7 @@ extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
extern VP8BlockCopy VP8Copy4x4;
extern VP8BlockCopy VP8Copy16x8;
// Quantization
struct VP8Matrix; // forward declaration
typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],

View File

@ -662,16 +662,22 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
//------------------------------------------------------------------------------
// Block copy
// Copies a rectangular block of bytes from 'src' to 'dst', one row at a time.
// Each row is transferred with memcpy(), after which both pointers advance by
// BPS bytes to reach the start of the next row.
// The two signatures and the two loop headers/memcpy lines below are the
// before/after lines of this diff hunk: the earlier form copied 'size' rows of
// 'size' bytes, the later form copies 'h' rows of 'w' bytes.
static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
int y;
for (y = 0; y < size; ++y) {
memcpy(dst, src, size);
for (y = 0; y < h; ++y) {
memcpy(dst, src, w);
src += BPS;  // step to the next source row
dst += BPS;  // step to the next destination row
}
}
// Copies a 4x4 block (4 rows of 4 bytes) from 'src' to 'dst' through Copy().
// The one-line definition is the earlier form passing a single 'size'
// argument; the three-line definition is its replacement passing width and
// height separately.
static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
static void Copy4x4(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 4, 4);
}
// Copies a block of 8 rows of 16 bytes each from 'src' to 'dst' via Copy().
static void Copy16x8(const uint8_t* src, uint8_t* dst) {
  const int width = 16;   // bytes copied per row
  const int height = 8;   // number of rows
  Copy(src, dst, width, height);
}
//------------------------------------------------------------------------------
// Initialization
@ -695,6 +701,7 @@ VP8QuantizeBlock VP8EncQuantizeBlock;
VP8Quantize2Blocks VP8EncQuantize2Blocks;
VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
VP8BlockCopy VP8Copy4x4;
VP8BlockCopy VP8Copy16x8;
extern void VP8EncDspInitSSE2(void);
extern void VP8EncDspInitAVX2(void);
@ -724,6 +731,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
VP8Copy4x4 = Copy4x4;
VP8Copy16x8 = Copy16x8;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {

View File

@ -1067,10 +1067,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
VP8SetIntraUVMode(it, rd->mode_uv);
AddScore(rd, &rd_best);
if (dst != dst0) { // copy 16x8 block if needed
int i;
for (i = 0; i < 8; ++i) {
memcpy(dst0 + i * BPS, dst + i * BPS, 2 * 8 * sizeof(*dst0));
}
// VP8BlockCopy takes (src, dst): the block held in 'dst' is copied to 'dst0'.
VP8Copy16x8(dst, dst0);
}
}

View File

@ -69,7 +69,7 @@ typedef enum { // Rate-distortion optimization levels
RD_OPT_TRELLIS_ALL = 3 // trellis-quant for every scoring (much slower)
} VP8RDLevel;
// YUV-cache parameters. Cache is 32-pixels wide.
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
// The original or reconstructed samples can be accessed using VP8Scan[].
// The predicted blocks can be accessed using offsets to yuv_p_ and
// the arrays VP8*ModeOffsets[].
@ -91,8 +91,7 @@ typedef enum { // Rate-distortion optimization levels
// Intra 4x4 predictions (4x4 block each)
// |I4DC4 I4TM4 I4VE4 I4HE4|I4RD4 I4VR4 I4LD4 I4VL4|
// |I4HD4 I4HU4 I4TMP .....|.......................| <- ~31% wasted
#define BPS 32 // this is the common stride
#define YUV_SIZE (BPS * 16) // 25% lost
#define YUV_SIZE (BPS * 16)
#define PRED_SIZE (32 * BPS + 16 * BPS + 8 * BPS) // I16+Chroma+I4 preds
#define Y_OFF (0)
#define U_OFF (16)