store top Y/U/V samples in packed fashion

More cache-line friendly than storing them split.

Change-Id: Ifb23cc3518ff1b5c37afe007558d4278868d75ea
This commit is contained in:
skal 2013-06-13 06:01:27 +02:00
parent 068db59e26
commit 86daf77c47
2 changed files with 26 additions and 27 deletions

View File

@ -409,7 +409,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
const int mb_w = dec->mb_w_;
// Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
const size_t top_size = (16 + 8 + 8) * mb_w;
const size_t top_size = sizeof(VP8TopSamples) * mb_w;
const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
const size_t f_info_size =
(dec->filter_type_ > 0) ?
@ -446,12 +446,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
dec->intra_t_ = (uint8_t*)mem;
mem += intra_pred_mode_size;
dec->y_t_ = (uint8_t*)mem;
mem += 16 * mb_w;
dec->u_t_ = (uint8_t*)mem;
mem += 8 * mb_w;
dec->v_t_ = (uint8_t*)mem;
mem += 8 * mb_w;
dec->yuv_t_ = (VP8TopSamples*)mem;
mem += top_size;
dec->mb_info_ = ((VP8MB*)mem) + 1;
mem += mb_info_size;
@ -580,16 +576,14 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) {
}
{
// bring top samples into the cache
uint8_t* const top_y = dec->y_t_ + dec->mb_x_ * 16;
uint8_t* const top_u = dec->u_t_ + dec->mb_x_ * 8;
uint8_t* const top_v = dec->v_t_ + dec->mb_x_ * 8;
VP8TopSamples* const top_yuv = dec->yuv_t_ + dec->mb_x_;
const int16_t* const coeffs = block->coeffs_;
int n;
if (dec->mb_y_ > 0) {
memcpy(y_dst - BPS, top_y, 16);
memcpy(u_dst - BPS, top_u, 8);
memcpy(v_dst - BPS, top_v, 8);
memcpy(y_dst - BPS, top_yuv[0].y, 16);
memcpy(u_dst - BPS, top_yuv[0].u, 8);
memcpy(v_dst - BPS, top_yuv[0].v, 8);
} else if (dec->mb_x_ == 0) {
// we only need to do this init once at block (0,0).
// Afterward, it remains valid for the whole topmost row.
@ -604,9 +598,9 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) {
if (dec->mb_y_ > 0) {
if (dec->mb_x_ >= dec->mb_w_ - 1) { // on rightmost border
top_right[0] = top_y[15] * 0x01010101u;
memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
} else {
memcpy(top_right, top_y + 16, sizeof(*top_right));
memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
}
}
// replicate the top-right pixels below
@ -663,9 +657,9 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) {
// stash away top samples for next block
if (dec->mb_y_ < dec->mb_h_ - 1) {
memcpy(top_y, y_dst + 15 * BPS, 16);
memcpy(top_u, u_dst + 7 * BPS, 8);
memcpy(top_v, v_dst + 7 * BPS, 8);
memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
memcpy(top_yuv[0].u, u_dst + 7 * BPS, 8);
memcpy(top_yuv[0].v, v_dst + 7 * BPS, 8);
}
}
// Transfer reconstructed samples from yuv_b_ cache to final destination.

View File

@ -193,6 +193,11 @@ typedef struct {
VP8Io io_; // copy of the VP8Io to pass to put()
} VP8ThreadContext;
// Saved top samples, per macroblock. Fits into a cache-line.
typedef struct {
uint8_t y[16], u[8], v[8];
} VP8TopSamples;
//------------------------------------------------------------------------------
// VP8Decoder: the main opaque structure handed over to user
@ -250,17 +255,17 @@ struct VP8Decoder {
#endif
// Boundary data cache and persistent buffers.
uint8_t* intra_t_; // top intra modes values: 4 * mb_w_
uint8_t intra_l_[4]; // left intra modes values
uint8_t* y_t_; // top luma samples: 16 * mb_w_
uint8_t* u_t_, *v_t_; // top u/v samples: 8 * mb_w_ each
uint8_t segment_; // segment of the currently parsed block
uint8_t* intra_t_; // top intra modes values: 4 * mb_w_
uint8_t intra_l_[4]; // left intra modes values
VP8MB* mb_info_; // contextual macroblock info (mb_w_ + 1)
VP8FInfo* f_info_; // filter strength info
uint8_t* yuv_b_; // main block for Y/U/V (size = YUV_SIZE)
uint8_t segment_; // segment of the currently parsed block
VP8TopSamples* yuv_t_; // top y/u/v samples
uint8_t* cache_y_; // macroblock row for storing unfiltered samples
VP8MB* mb_info_; // contextual macroblock info (mb_w_ + 1)
VP8FInfo* f_info_; // filter strength info
uint8_t* yuv_b_; // main block for Y/U/V (size = YUV_SIZE)
uint8_t* cache_y_; // macroblock row for storing unfiltered samples
uint8_t* cache_u_;
uint8_t* cache_v_;
int cache_y_stride_;