rescaler: add some SSE2 code

The rounding and arithmetic are not the same as before, in order to prevent overflow for large upscale factors. We still rely on 32b x 32b -> 64b multiplies. Raised the fixed-point precision to 32b so that we get convenient shifts from epi64 to epi32. Changed the rescaler_t type to 'uint32_t' in order to squeeze in all the precision required. The MIPS code has been disabled because it is now out of sync; it will be fixed in a subsequent CL when the dust settles. ~30-35% faster. Change-Id: I32e4ddc00933f1b1aa3463403086199fd5dad07b
2025-07-13 14:34:33 +02:00 · 2015-09-25 14:34:02 +02:00
parent 1df1d0eedb
commit 76a7dc39e5
10 changed files with 320 additions and 44 deletions
--- a/src/utils/rescaler.c
+++ b/src/utils/rescaler.c
@ -41,19 +41,20 @@ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add;
  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
  if (!wrk->x_expand) {  // fx_scale is not used otherwise
-    wrk->fx_scale = WEBP_RESCALER_ONE / wrk->x_sub;
+    wrk->fx_scale = WEBP_RESCALER_FRAC(1, wrk->x_sub);
  }
  // vertical scaling parameters
  wrk->y_add = wrk->y_expand ? y_add - 1 : y_add;
-  wrk->y_sub = wrk->y_expand ? y_sub - 1: y_sub;
+  wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
  wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
  if (!wrk->y_expand) {
-    wrk->fy_scale = WEBP_RESCALER_ONE / wrk->y_sub;
-    wrk->fxy_scale = ((uint64_t)dst_height << WEBP_RESCALER_RFIX)
-                   / (wrk->x_add * wrk->y_add);
+    // note the very special case where x_add = y_add = 1 cannot be represented.
+    // We special-case fxy_scale = 0 in this case, in ExportRowShrink
+    wrk->fxy_scale = WEBP_RESCALER_FRAC(dst_height, wrk->x_add * wrk->y_add);
+    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->y_sub);
  } else {
-    wrk->fy_scale = WEBP_RESCALER_ONE / wrk->x_add;
-    wrk->fxy_scale = WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_sub);
+    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->x_add);
+    // wrk->fxy_scale is unused here.
  }
  wrk->irow = work;
  wrk->frow = work + num_channels * dst_width;
--- a/src/utils/rescaler.h
+++ b/src/utils/rescaler.h
@ -20,11 +20,12 @@ extern "C" {

 #include "../webp/types.h"

-#define WEBP_RESCALER_RFIX 30   // fixed-point precision for multiplies
-#define WEBP_RESCALER_ONE (1u << WEBP_RESCALER_RFIX)
+#define WEBP_RESCALER_RFIX 32   // fixed-point precision for multiplies
+#define WEBP_RESCALER_ONE (1ull << WEBP_RESCALER_RFIX)
+#define WEBP_RESCALER_FRAC(x, y) (((uint64_t)(x) << WEBP_RESCALER_RFIX) / (y))

 // Structure used for on-the-fly rescaling
-typedef int32_t rescaler_t;   // type for side-buffer
+typedef uint32_t rescaler_t;   // type for side-buffer
 typedef struct WebPRescaler WebPRescaler;
 struct WebPRescaler {
  int x_expand;               // true if we're expanding in the x direction
@ -32,7 +33,7 @@ struct WebPRescaler {
  int num_channels;           // bytes to jump between pixels
  uint32_t fx_scale;          // fixed-point scaling factors
  uint32_t fy_scale;          // ''
-  uint64_t fxy_scale;         // ''
+  uint32_t fxy_scale;         // ''
  int y_accum;                // vertical accumulator
  int y_add, y_sub;           // vertical increments
  int x_add, x_sub;           // horizontal increments