diff --git a/src/dec/frame.c b/src/dec/frame.c index 92c23185..9baac8e1 100644 --- a/src/dec/frame.c +++ b/src/dec/frame.c @@ -409,7 +409,7 @@ static int AllocateMemory(VP8Decoder* const dec) { const int mb_w = dec->mb_w_; // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise. const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t); - const size_t top_size = (16 + 8 + 8) * mb_w; + const size_t top_size = sizeof(VP8TopSamples) * mb_w; const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB); const size_t f_info_size = (dec->filter_type_ > 0) ? @@ -446,12 +446,8 @@ static int AllocateMemory(VP8Decoder* const dec) { dec->intra_t_ = (uint8_t*)mem; mem += intra_pred_mode_size; - dec->y_t_ = (uint8_t*)mem; - mem += 16 * mb_w; - dec->u_t_ = (uint8_t*)mem; - mem += 8 * mb_w; - dec->v_t_ = (uint8_t*)mem; - mem += 8 * mb_w; + dec->yuv_t_ = (VP8TopSamples*)mem; + mem += top_size; dec->mb_info_ = ((VP8MB*)mem) + 1; mem += mb_info_size; @@ -580,16 +576,14 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) { } { // bring top samples into the cache - uint8_t* const top_y = dec->y_t_ + dec->mb_x_ * 16; - uint8_t* const top_u = dec->u_t_ + dec->mb_x_ * 8; - uint8_t* const top_v = dec->v_t_ + dec->mb_x_ * 8; + VP8TopSamples* const top_yuv = dec->yuv_t_ + dec->mb_x_; const int16_t* const coeffs = block->coeffs_; int n; if (dec->mb_y_ > 0) { - memcpy(y_dst - BPS, top_y, 16); - memcpy(u_dst - BPS, top_u, 8); - memcpy(v_dst - BPS, top_v, 8); + memcpy(y_dst - BPS, top_yuv[0].y, 16); + memcpy(u_dst - BPS, top_yuv[0].u, 8); + memcpy(v_dst - BPS, top_yuv[0].v, 8); } else if (dec->mb_x_ == 0) { // we only need to do this init once at block (0,0). // Afterward, it remains valid for the whole topmost row. @@ -604,9 +598,9 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) { if (dec->mb_y_ > 0) { if (dec->mb_x_ >= dec->mb_w_ - 1) { // on rightmost border - top_right[0] = top_y[15] * 0x01010101u; + memset(top_right, top_yuv[0].y[15], sizeof(*top_right)); } else { - memcpy(top_right, top_y + 16, sizeof(*top_right)); + memcpy(top_right, top_yuv[1].y, sizeof(*top_right)); } } // replicate the top-right pixels below @@ -663,9 +657,9 @@ void VP8ReconstructBlock(const VP8Decoder* const dec) { // stash away top samples for next block if (dec->mb_y_ < dec->mb_h_ - 1) { - memcpy(top_y, y_dst + 15 * BPS, 16); - memcpy(top_u, u_dst + 7 * BPS, 8); - memcpy(top_v, v_dst + 7 * BPS, 8); + memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16); + memcpy(top_yuv[0].u, u_dst + 7 * BPS, 8); + memcpy(top_yuv[0].v, v_dst + 7 * BPS, 8); } } // Transfer reconstructed samples from yuv_b_ cache to final destination. diff --git a/src/dec/vp8i.h b/src/dec/vp8i.h index 35ad36f0..2f97527b 100644 --- a/src/dec/vp8i.h +++ b/src/dec/vp8i.h @@ -193,6 +193,11 @@ typedef struct { VP8Io io_; // copy of the VP8Io to pass to put() } VP8ThreadContext; +// Saved top samples, per macroblock. Fits into a cache-line. +typedef struct { + uint8_t y[16], u[8], v[8]; +} VP8TopSamples; + //------------------------------------------------------------------------------ // VP8Decoder: the main opaque structure handed over to user @@ -250,17 +255,17 @@ struct VP8Decoder { #endif // Boundary data cache and persistent buffers. - uint8_t* intra_t_; // top intra modes values: 4 * mb_w_ - uint8_t intra_l_[4]; // left intra modes values - uint8_t* y_t_; // top luma samples: 16 * mb_w_ - uint8_t* u_t_, *v_t_; // top u/v samples: 8 * mb_w_ each - uint8_t segment_; // segment of the currently parsed block + uint8_t* intra_t_; // top intra modes values: 4 * mb_w_ + uint8_t intra_l_[4]; // left intra modes values - VP8MB* mb_info_; // contextual macroblock info (mb_w_ + 1) - VP8FInfo* f_info_; // filter strength info - uint8_t* yuv_b_; // main block for Y/U/V (size = YUV_SIZE) + uint8_t segment_; // segment of the currently parsed block + VP8TopSamples* yuv_t_; // top y/u/v samples - uint8_t* cache_y_; // macroblock row for storing unfiltered samples + VP8MB* mb_info_; // contextual macroblock info (mb_w_ + 1) + VP8FInfo* f_info_; // filter strength info + uint8_t* yuv_b_; // main block for Y/U/V (size = YUV_SIZE) + + uint8_t* cache_y_; // macroblock row for storing unfiltered samples uint8_t* cache_u_; uint8_t* cache_v_; int cache_y_stride_;