Share the BPS stride between encoder and decoder, and add a 16x8 block copy.

* BPS is now defined once in src/dsp/dsp.h; the duplicate definitions in
  src/dec/vp8i.h and src/enc/vp8enci.h are removed.
* The encoder's Copy() helper takes a width and a height instead of a single
  square size, and a new VP8Copy16x8 function pointer is exported next to
  VP8Copy4x4.
* PickBestUV() calls VP8Copy16x8 instead of an open-coded memcpy loop.

VP8BlockCopy functions take (src, dst). The loop removed from PickBestUV()
copied from 'dst' into 'dst0', so the replacement call passes 'dst' as the
source and 'dst0' as the destination.

Note: line breaks and leading indentation of this patch were restored from a
whitespace-collapsed copy; apply with whitespace-tolerant options if a
context line does not match exactly.

diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h
index 29701be7..383f6a1e 100644
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -69,7 +69,7 @@ enum { MB_FEATURE_TREE_PROBS = 3,
        NUM_PROBAS = 11,
        NUM_MV_PROBAS = 19
      };
-// YUV-cache parameters.
+// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
 // and two 8x8 chroma blocks (u/v). These are better be 16-bytes aligned,
 // in order to be SIMD-friendly. We also need to store the top, left and
@@ -91,8 +91,6 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 // 'y' = y-samples 'u' = u-samples 'v' = u-samples
 // '|' = left sample, '-' = top sample, '+' = top-left sample
 // 't' = extra top-right sample for 4x4 modes
-// With this layout, BPS (=Bytes Per Scan-line) is one cacheline size.
-#define BPS 32 // this is the common stride used by yuv[]
 #define YUV_SIZE (BPS * 17 + BPS * 9)
 #define Y_SIZE (BPS * 17)
 #define Y_OFF (BPS * 1 + 8)
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
index f106b410..4ea304b7 100644
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -24,6 +24,8 @@
 extern "C" {
 #endif
 
+#define BPS 32 // this is the common stride for enc/dec
+
 //------------------------------------------------------------------------------
 // CPU detection
 
@@ -132,6 +134,7 @@ extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
 
 typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
 extern VP8BlockCopy VP8Copy4x4;
+extern VP8BlockCopy VP8Copy16x8;
 // Quantization
 struct VP8Matrix; // forward declaration
 typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
diff --git a/src/dsp/enc.c b/src/dsp/enc.c
index 81c5d4a9..e42cd207 100644
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -662,16 +662,22 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
 //------------------------------------------------------------------------------
 // Block copy
 
-static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
   int y;
-  for (y = 0; y < size; ++y) {
-    memcpy(dst, src, size);
+  for (y = 0; y < h; ++y) {
+    memcpy(dst, src, w);
     src += BPS;
     dst += BPS;
   }
 }
 
-static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
+static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+  Copy(src, dst, 4, 4);
+}
+
+static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+  Copy(src, dst, 16, 8);
+}
 
 //------------------------------------------------------------------------------
 // Initialization
@@ -695,6 +701,7 @@ VP8QuantizeBlock VP8EncQuantizeBlock;
 VP8Quantize2Blocks VP8EncQuantize2Blocks;
 VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy16x8;
 
 extern void VP8EncDspInitSSE2(void);
 extern void VP8EncDspInitAVX2(void);
@@ -724,6 +731,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
   VP8EncQuantize2Blocks = Quantize2Blocks;
   VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
   VP8Copy4x4 = Copy4x4;
+  VP8Copy16x8 = Copy16x8;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
diff --git a/src/enc/quant.c b/src/enc/quant.c
index 03cf26a9..35fef714 100644
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -1067,10 +1067,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
   VP8SetIntraUVMode(it, rd->mode_uv);
   AddScore(rd, &rd_best);
   if (dst != dst0) { // copy 16x8 block if needed
-    int i;
-    for (i = 0; i < 8; ++i) {
-      memcpy(dst0 + i * BPS, dst + i * BPS, 2 * 8 * sizeof(*dst0));
-    }
+    VP8Copy16x8(dst, dst0);
   }
 }
 
diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h
index 2244799a..cd06113e 100644
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -69,7 +69,7 @@ typedef enum { // Rate-distortion optimization levels
   RD_OPT_TRELLIS_ALL = 3 // trellis-quant for every scoring (much slower)
 } VP8RDLevel;
 
-// YUV-cache parameters. Cache is 32-pixels wide.
+// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // The original or reconstructed samples can be accessed using VP8Scan[].
 // The predicted blocks can be accessed using offsets to yuv_p_ and
 // the arrays VP8*ModeOffsets[].
@@ -91,8 +91,7 @@ typedef enum { // Rate-distortion optimization levels
 // Intra 4x4 predictions (4x4 block each)
 // |I4DC4 I4TM4 I4VE4 I4HE4|I4RD4 I4VR4 I4LD4 I4VL4|
 // |I4HD4 I4HU4 I4TMP .....|.......................| <- ~31% wasted
-#define BPS 32 // this is the common stride
-#define YUV_SIZE (BPS * 16) // 25% lost
+#define YUV_SIZE (BPS * 16)
 #define PRED_SIZE (32 * BPS + 16 * BPS + 8 * BPS) // I16+Chroma+I4 preds
 #define Y_OFF (0)
 #define U_OFF (16)