Merge changes I55f8da52,Id73a1e96

* changes:
  cosmetics: add some missing != NULL comparisons
  factorize BPS definition in dsp.h and add VP8Copy16x8
2025-06-06 22:14:23 +02:00 · 2014-12-04 20:46:29 -08:00 · 2014-12-04 20:46:29 -08:00 · 441f273f19
commit 441f273f19
parent 432e5b550e 4a279a680e
5 changed files with 29 additions and 24 deletions
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -69,7 +69,7 @@ enum { MB_FEATURE_TREE_PROBS = 3,
       NUM_PROBAS = 11,
       NUM_MV_PROBAS = 19 };

-// YUV-cache parameters.
+// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
 // and two 8x8 chroma blocks (u/v). These are better be 16-bytes aligned,
 // in order to be SIMD-friendly. We also need to store the top, left and
@ -91,8 +91,6 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 //  'y' = y-samples   'u' = u-samples     'v' = u-samples
 //  '|' = left sample,   '-' = top sample,    '+' = top-left sample
 //  't' = extra top-right sample for 4x4 modes
-// With this layout, BPS (=Bytes Per Scan-line) is one cacheline size.
-#define BPS       32    // this is the common stride used by yuv[]
 #define YUV_SIZE (BPS * 17 + BPS * 9)
 #define Y_SIZE   (BPS * 17)
 #define Y_OFF    (BPS * 1 + 8)
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -24,6 +24,8 @@
 extern "C" {
 #endif

+#define BPS 32   // this is the common stride for enc/dec
+
 //------------------------------------------------------------------------------
 // CPU detection

@ -132,6 +134,7 @@ extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;

 typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
 extern VP8BlockCopy VP8Copy4x4;
+extern VP8BlockCopy VP8Copy16x8;
 // Quantization
 struct VP8Matrix;   // forward declaration
 typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -207,7 +207,7 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
 static WEBP_INLINE void VerticalPred(uint8_t* dst,
                                     const uint8_t* top, int size) {
  int j;
-  if (top) {
+  if (top != NULL) {
    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
  } else {
    Fill(dst, 127, size);
@ -216,7 +216,7 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst,

 static WEBP_INLINE void HorizontalPred(uint8_t* dst,
                                       const uint8_t* left, int size) {
-  if (left) {
+  if (left != NULL) {
    int j;
    for (j = 0; j < size; ++j) {
      memset(dst + j * BPS, left[j], size);
@ -229,8 +229,8 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst,
 static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
                                   const uint8_t* top, int size) {
  int y;
-  if (left) {
-    if (top) {
+  if (left != NULL) {
+    if (top != NULL) {
      const uint8_t* const clip = clip1 + 255 - left[-1];
      for (y = 0; y < size; ++y) {
        const uint8_t* const clip_table = clip + left[y];
@ -248,7 +248,7 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
    // is equivalent to VE prediction where you just copy the top samples.
    // Note that if top samples are not available, the default value is
    // then 129, and not 127 as in the VerticalPred case.
-    if (top) {
+    if (top != NULL) {
      VerticalPred(dst, top, size);
    } else {
      Fill(dst, 129, size);
@ -261,15 +261,15 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
                               int size, int round, int shift) {
  int DC = 0;
  int j;
-  if (top) {
+  if (top != NULL) {
    for (j = 0; j < size; ++j) DC += top[j];
-    if (left) {   // top and left present
+    if (left != NULL) {   // top and left present
      for (j = 0; j < size; ++j) DC += left[j];
    } else {      // top, but no left
      DC += DC;
    }
    DC = (DC + round) >> shift;
-  } else if (left) {   // left but no top
+  } else if (left != NULL) {   // left but no top
    for (j = 0; j < size; ++j) DC += left[j];
    DC += DC;
    DC = (DC + round) >> shift;
@ -291,8 +291,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
  TrueMotion(C8TM8 + dst, left, top, 8);
  // V block
  dst += 8;
-  if (top) top += 8;
-  if (left) left += 16;
+  if (top != NULL) top += 8;
+  if (left != NULL) left += 16;
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
  HorizontalPred(C8HE8 + dst, left, 8);
@ -662,16 +662,22 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
 //------------------------------------------------------------------------------
 // Block copy

-static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
  int y;
-  for (y = 0; y < size; ++y) {
-    memcpy(dst, src, size);
+  for (y = 0; y < h; ++y) {
+    memcpy(dst, src, w);
    src += BPS;
    dst += BPS;
  }
 }

-static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
+static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+  Copy(src, dst, 4, 4);
+}
+
+static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+  Copy(src, dst, 16, 8);
+}

 //------------------------------------------------------------------------------
 // Initialization
@ -695,6 +701,7 @@ VP8QuantizeBlock VP8EncQuantizeBlock;
 VP8Quantize2Blocks VP8EncQuantize2Blocks;
 VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy16x8;

 extern void VP8EncDspInitSSE2(void);
 extern void VP8EncDspInitAVX2(void);
@ -724,6 +731,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
  VP8EncQuantize2Blocks = Quantize2Blocks;
  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
  VP8Copy4x4 = Copy4x4;
+  VP8Copy16x8 = Copy16x8;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@ -1067,10 +1067,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
-    int i;
-    for (i = 0; i < 8; ++i) {
-      memcpy(dst0 + i * BPS, dst + i * BPS, 2 * 8 * sizeof(*dst0));
-    }
+    VP8Copy16x8(dst, dst0);   // args are (src, dst): copy dst into dst0
  }
 }

--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@ -69,7 +69,7 @@ typedef enum {   // Rate-distortion optimization levels
  RD_OPT_TRELLIS_ALL = 3   // trellis-quant for every scoring (much slower)
 } VP8RDLevel;

-// YUV-cache parameters. Cache is 32-pixels wide.
+// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // The original or reconstructed samples can be accessed using VP8Scan[].
 // The predicted blocks can be accessed using offsets to yuv_p_ and
 // the arrays VP8*ModeOffsets[].
@ -91,8 +91,7 @@ typedef enum {   // Rate-distortion optimization levels
 //   Intra 4x4 predictions (4x4 block each)
 //         |I4DC4 I4TM4 I4VE4 I4HE4|I4RD4 I4VR4 I4LD4 I4VL4|
 //         |I4HD4 I4HU4 I4TMP .....|.......................| <- ~31% wasted
-#define BPS       32   // this is the common stride
-#define YUV_SIZE (BPS * 16)  // 25% lost
+#define YUV_SIZE (BPS * 16)
 #define PRED_SIZE (32 * BPS + 16 * BPS + 8 * BPS)   // I16+Chroma+I4 preds
 #define Y_OFF    (0)
 #define U_OFF    (16)