From 57606047ecb05210ef51464473e357c39d9de5bc Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Thu, 4 Dec 2014 09:17:18 +0100 Subject: [PATCH] encoder: switch BPS to 32 instead of 16 this is a first step to unifying encoding/decoding cache stride and possibly sharing the prediction functions in dsp/ With this layout, there's a little (~7%) space lost with unused samples. But no speed change was observed. Change-Id: I016df8cad41bde5088df3579e6ad65d884ee711e --- src/dsp/enc_mips32.c | 25 +++++--- src/dsp/enc_mips_dsp_r2.c | 13 +++-- src/enc/quant.c | 22 +++---- src/enc/vp8enci.h | 118 ++++++++++++++------------------------ 4 files changed, 82 insertions(+), 96 deletions(-) diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c index c8799c93..2f3fe981 100644 --- a/src/dsp/enc_mips32.c +++ b/src/dsp/enc_mips32.c @@ -769,17 +769,26 @@ extern WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) { #if defined(WEBP_USE_MIPS32) - VP8ITransform = ITransform; + // TODO(djordje): fix these to use generic BPS instead of hardcoded value 16 + (void)ITransform; + (void)FTransform; + (void)Disto4x4; + (void)Disto16x16; +// VP8ITransform = ITransform; +// VP8FTransform = FTransform; VP8EncQuantizeBlock = QuantizeBlock; VP8EncQuantize2Blocks = Quantize2Blocks; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8FTransform = FTransform; +// VP8TDisto4x4 = Disto4x4; +// VP8TDisto16x16 = Disto16x16; #if !defined(WORK_AROUND_GCC) - VP8SSE16x16 = SSE16x16; - VP8SSE8x8 = SSE8x8; - VP8SSE16x8 = SSE16x8; - VP8SSE4x4 = SSE4x4; + (void)SSE16x16; + (void)SSE8x8; + (void)SSE16x8; + (void)SSE4x4; +// VP8SSE16x16 = SSE16x16; +// VP8SSE8x8 = SSE8x8; +// VP8SSE16x8 = SSE16x8; +// VP8SSE4x4 = SSE4x4; #endif #endif // WEBP_USE_MIPS32 } diff --git a/src/dsp/enc_mips_dsp_r2.c b/src/dsp/enc_mips_dsp_r2.c index feb8e388..eeac7be4 100644 --- a/src/dsp/enc_mips_dsp_r2.c +++ b/src/dsp/enc_mips_dsp_r2.c @@ -318,9 +318,14 @@ extern WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) { #if defined(WEBP_USE_MIPS_DSP_R2) - VP8FTransform = FTransform; - VP8ITransform = ITransform; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; + // TODO(djordje): fix these to use generic BPS instead of hardcoded value + (void)ITransform; + (void)FTransform; + (void)Disto4x4; + (void)Disto16x16; +// VP8FTransform = FTransform; +// VP8ITransform = ITransform; +// VP8TDisto4x4 = Disto4x4; +// VP8TDisto16x16 = Disto16x16; #endif // WEBP_USE_MIPS_DSP_R2 } diff --git a/src/enc/quant.c b/src/enc/quant.c index b107cba8..03cf26a9 100644 --- a/src/enc/quant.c +++ b/src/enc/quant.c @@ -444,15 +444,12 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) { // Quantize // Layout: -// +----+ -// |YYYY| 0 -// |YYYY| 4 -// |YYYY| 8 -// |YYYY| 12 -// +----+ -// |UUVV| 16 -// |UUVV| 20 -// +----+ +// +----+----+ +// |YYYY|UUVV| 0 +// |YYYY|UUVV| 4 +// |YYYY|....| 8 +// |YYYY|....| 12 +// +----+----+ const int VP8Scan[16] = { // Luma 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, @@ -1069,7 +1066,12 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { } VP8SetIntraUVMode(it, rd->mode_uv); AddScore(rd, &rd_best); - if (dst != dst0) memcpy(dst0, dst, UV_SIZE); + if (dst != dst0) { // copy 16x8 block if needed + int i; + for (i = 0; i < 8; ++i) { + memcpy(dst0 + i * BPS, dst + i * BPS, 2 * 8 * sizeof(*dst0)); + } + } } //------------------------------------------------------------------------------ diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h index 7194aa6f..2244799a 100644 --- a/src/enc/vp8enci.h +++ b/src/enc/vp8enci.h @@ -69,64 +69,34 @@ typedef enum { // Rate-distortion optimization levels RD_OPT_TRELLIS_ALL = 3 // trellis-quant for every scoring (much slower) } VP8RDLevel; -// YUV-cache parameters. Cache is 16-pixels wide. -// The original or reconstructed samples can be accessed using VP8Scan[] +// YUV-cache parameters. Cache is 32-pixels wide. +// The original or reconstructed samples can be accessed using VP8Scan[]. // The predicted blocks can be accessed using offsets to yuv_p_ and -// the arrays VP8*ModeOffsets[]; -// +----+ YUV Samples area. See VP8Scan[] for accessing the blocks. -// Y_OFF |YYYY| <- original samples ('yuv_in_') -// |YYYY| -// |YYYY| -// |YYYY| -// U_OFF |UUVV| V_OFF (=U_OFF + 8) -// |UUVV| -// +----+ -// Y_OFF |YYYY| <- compressed/decoded samples ('yuv_out_') -// |YYYY| There are two buffers like this ('yuv_out_'/'yuv_out2_') -// |YYYY| -// |YYYY| -// U_OFF |UUVV| V_OFF -// |UUVV| -// x2 (for yuv_out2_) -// +----+ Prediction area ('yuv_p_', size = PRED_SIZE) -// I16DC16 |YYYY| Intra16 predictions (16x16 block each) -// |YYYY| -// |YYYY| -// |YYYY| -// I16TM16 |YYYY| -// |YYYY| -// |YYYY| -// |YYYY| -// I16VE16 |YYYY| -// |YYYY| -// |YYYY| -// |YYYY| -// I16HE16 |YYYY| -// |YYYY| -// |YYYY| -// |YYYY| -// +----+ Chroma U/V predictions (16x8 block each) -// C8DC8 |UUVV| -// |UUVV| -// C8TM8 |UUVV| -// |UUVV| -// C8VE8 |UUVV| -// |UUVV| -// C8HE8 |UUVV| -// |UUVV| -// +----+ Intra 4x4 predictions (4x4 block each) -// |YYYY| I4DC4 I4TM4 I4VE4 I4HE4 -// |YYYY| I4RD4 I4VR4 I4LD4 I4VL4 -// |YY..| I4HD4 I4HU4 I4TMP -// +----+ -#define BPS 16 // this is the common stride -#define Y_SIZE (BPS * 16) -#define UV_SIZE (BPS * 8) -#define YUV_SIZE (Y_SIZE + UV_SIZE) -#define PRED_SIZE (6 * 16 * BPS + 12 * BPS) +// the arrays VP8*ModeOffsets[]. +// * YUV Samples area (yuv_in_/yuv_out_/yuv_out2_) +// (see VP8Scan[] for accessing the blocks, along with Y_OFF/U_OFF/V_OFF): +// +----+----+ +// Y_OFF |YYYY|UUVV| +// U_OFF |YYYY|UUVV| +// V_OFF |YYYY|....| <- 25% wasted U/V area +// |YYYY|....| +// +----+----+ +// * Prediction area ('yuv_p_', size = PRED_SIZE) +// Intra16 predictions (16x16 block each, two per row): +// |I16DC16|I16TM16| +// |I16VE16|I16HE16| +// Chroma U/V predictions (16x8 block each, two per row): +// |C8DC8|C8TM8| +// |C8VE8|C8HE8| +// Intra 4x4 predictions (4x4 block each) +// |I4DC4 I4TM4 I4VE4 I4HE4|I4RD4 I4VR4 I4LD4 I4VL4| +// |I4HD4 I4HU4 I4TMP .....|.......................| <- ~31% wasted +#define BPS 32 // this is the common stride +#define YUV_SIZE (BPS * 16) // 25% lost +#define PRED_SIZE (32 * BPS + 16 * BPS + 8 * BPS) // I16+Chroma+I4 preds #define Y_OFF (0) -#define U_OFF (Y_SIZE) -#define V_OFF (U_OFF + 8) +#define U_OFF (16) +#define V_OFF (16 + 8) #define ALIGN_CST 15 #define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST) @@ -138,26 +108,26 @@ extern const int VP8I4ModeOffsets[NUM_BMODES]; // Layout of prediction blocks // intra 16x16 #define I16DC16 (0 * 16 * BPS) -#define I16TM16 (1 * 16 * BPS) -#define I16VE16 (2 * 16 * BPS) -#define I16HE16 (3 * 16 * BPS) +#define I16TM16 (I16DC16 + 16) +#define I16VE16 (1 * 16 * BPS) +#define I16HE16 (I16VE16 + 16) // chroma 8x8, two U/V blocks side by side (hence: 16x8 each) -#define C8DC8 (4 * 16 * BPS) -#define C8TM8 (4 * 16 * BPS + 8 * BPS) -#define C8VE8 (5 * 16 * BPS) -#define C8HE8 (5 * 16 * BPS + 8 * BPS) +#define C8DC8 (2 * 16 * BPS) +#define C8TM8 (C8DC8 + 1 * 16) +#define C8VE8 (2 * 16 * BPS + 8 * BPS) +#define C8HE8 (C8VE8 + 1 * 16) // intra 4x4 -#define I4DC4 (6 * 16 * BPS + 0) -#define I4TM4 (6 * 16 * BPS + 4) -#define I4VE4 (6 * 16 * BPS + 8) -#define I4HE4 (6 * 16 * BPS + 12) -#define I4RD4 (6 * 16 * BPS + 4 * BPS + 0) -#define I4VR4 (6 * 16 * BPS + 4 * BPS + 4) -#define I4LD4 (6 * 16 * BPS + 4 * BPS + 8) -#define I4VL4 (6 * 16 * BPS + 4 * BPS + 12) -#define I4HD4 (6 * 16 * BPS + 8 * BPS + 0) -#define I4HU4 (6 * 16 * BPS + 8 * BPS + 4) -#define I4TMP (6 * 16 * BPS + 8 * BPS + 8) +#define I4DC4 (3 * 16 * BPS + 0) +#define I4TM4 (I4DC4 + 4) +#define I4VE4 (I4DC4 + 8) +#define I4HE4 (I4DC4 + 12) +#define I4RD4 (I4DC4 + 16) +#define I4VR4 (I4DC4 + 20) +#define I4LD4 (I4DC4 + 24) +#define I4VL4 (I4DC4 + 28) +#define I4HD4 (3 * 16 * BPS + 4 * BPS) +#define I4HU4 (I4HD4 + 4) +#define I4TMP (I4HD4 + 8) typedef int64_t score_t; // type used for scores, rate, distortion // Note that MAX_COST is not the maximum allowed by sizeof(score_t),