rescaler: simplify ImportRow logic

Incorporates the loop over 'channel' into the ImportRow functions and removes
one parameter ('channel') from their signatures.

Change-Id: I4e3b33c111ca825fe96461583420413b17326409
2025-08-28 06:42:27 +02:00 · 2015-09-19 09:59:32 -07:00
parent 5ff0079ece
commit 9ba1894b9b
5 changed files with 250 additions and 239 deletions
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -338,8 +338,7 @@ struct WebPRescaler;
 // 'channel' denotes the channel number to be imported. 'Expand' corresponds to
 // the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
 typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk,
-                                          const uint8_t* const src,
-                                          int channel);
+                                          const uint8_t* src);

 extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
 extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
@@ -353,15 +352,15 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;

 // Plain-C implementation, as fall-back.
 extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
-                                         const uint8_t* const src, int channel);
+                                         const uint8_t* src);
 extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
-                                         const uint8_t* const src, int channel);
+                                         const uint8_t* src);
 extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
 extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);

 // Main entry calls:
 extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
-                                  const uint8_t* const src, int channel);
+                                  const uint8_t* src);
 // Export one row (starting at x_out position) from rescaler.
 extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);

--- a/src/dsp/rescaler.c
+++ b/src/dsp/rescaler.c
@@ -23,66 +23,69 @@
 //------------------------------------------------------------------------------
 // Row import

-void WebPRescalerImportRowExpandC(WebPRescaler* const wrk,
-                                  const uint8_t* const src, int channel) {
+void WebPRescalerImportRowExpandC(WebPRescaler* wrk, const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
-  int x_in = channel;
-  int x_out;
-  // simple bilinear interpolation
-  int accum = wrk->x_add;
-  int left = src[x_in];
-  int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
-  x_in += x_stride;
-  x_out = channel;
-
+  int channel;
  assert(!WebPRescalerInputDone(wrk));
  assert(wrk->x_expand);
-  while (1) {
-    wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
-    x_out += x_stride;
-    if (x_out >= x_out_max) break;
-    accum -= wrk->x_sub;
-    if (accum < 0) {
-      left = right;
-      x_in += x_stride;
-      assert(x_in < wrk->src_width * x_stride);
-      right = src[x_in];
-      accum += wrk->x_add;
+  for (channel = 0; channel < x_stride; ++channel) {
+    int x_in = channel;
+    int x_out = channel;
+    // simple bilinear interpolation
+    int accum = wrk->x_add;
+    int left = src[x_in];
+    int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+    x_in += x_stride;
+    while (1) {
+      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
+      x_out += x_stride;
+      if (x_out >= x_out_max) break;
+      accum -= wrk->x_sub;
+      if (accum < 0) {
+        left = right;
+        x_in += x_stride;
+        assert(x_in < wrk->src_width * x_stride);
+        right = src[x_in];
+        accum += wrk->x_add;
+      }
    }
+    assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
  }
-  assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
 }

 void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk,
-                                  const uint8_t* const src, int channel) {
+                                  const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
-  int x_in = channel;
-  int x_out;
-  uint32_t sum = 0;
-  int accum = 0;
-
+  int channel;
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);
-  for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
-    uint32_t base = 0;
-    accum += wrk->x_add;
-    while (accum > 0) {
-      accum -= wrk->x_sub;
-      assert(x_in < wrk->src_width * x_stride);
-      base = src[x_in];
-      sum += base;
-      x_in += x_stride;
-    }
-    {        // Emit next horizontal pixel.
-      const rescaler_t frac = base * (-accum);
-      wrk->frow[x_out] = sum * wrk->x_sub - frac;
-      // fresh fractional start for next pixel
-      sum = (int)MULT_FIX(frac, wrk->fx_scale);
+  for (channel = 0; channel < x_stride; ++channel) {
+    int x_in = channel;
+    int x_out = channel;
+    uint32_t sum = 0;
+    int accum = 0;
+    while (x_out < x_out_max) {
+      uint32_t base = 0;
+      accum += wrk->x_add;
+      while (accum > 0) {
+        accum -= wrk->x_sub;
+        assert(x_in < wrk->src_width * x_stride);
+        base = src[x_in];
+        sum += base;
+        x_in += x_stride;
+      }
+      {        // Emit next horizontal pixel.
+        const rescaler_t frac = base * (-accum);
+        wrk->frow[x_out] = sum * wrk->x_sub - frac;
+        // fresh fractional start for next pixel
+        sum = (int)MULT_FIX(frac, wrk->fx_scale);
+      }
+      x_out += x_stride;
    }
+    assert(accum == 0);
  }
-  assert(accum == 0);
 }

 //------------------------------------------------------------------------------
@@ -145,13 +148,12 @@ void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) {
 //------------------------------------------------------------------------------
 // Main entry calls

-void WebPRescalerImportRow(WebPRescaler* const wrk,
-                           const uint8_t* const src, int channel) {
+void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* const src) {
  assert(!WebPRescalerInputDone(wrk));
  if (!wrk->x_expand) {
-    WebPRescalerImportRowShrink(wrk, src, channel);
+    WebPRescalerImportRowShrink(wrk, src);
  } else {
-    WebPRescalerImportRowExpand(wrk, src, channel);
+    WebPRescalerImportRowExpand(wrk, src);
  }
 }

--- a/src/dsp/rescaler_mips32.c
+++ b/src/dsp/rescaler_mips32.c
@@ -18,117 +18,123 @@
 #include <assert.h>
 #include "../utils/rescaler.h"

-static void ImportRowShrink(WebPRescaler* const wrk,
-                      const uint8_t* const src, int channel) {
+static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const int fx_scale = wrk->fx_scale;
  const int x_add = wrk->x_add;
  const int x_sub = wrk->x_sub;
-  int* frow = wrk->frow + channel;
-  const uint8_t* src1 = src + channel;
-  int temp1, temp2, temp3;
-  int base, frac, sum;
-  int accum, accum1;
  const int x_stride1 = x_stride << 2;
-  int loop_c = x_out_max - channel;
-
+  int channel;
  assert(!wrk->x_expand);
  assert(!WebPRescalerInputDone(wrk));
-  __asm__ volatile (
-    "li     %[temp1],   0x8000                    \n\t"
-    "li     %[temp2],   0x10000                   \n\t"
-    "li     %[sum],     0                         \n\t"
-    "li     %[accum],   0                         \n\t"
-  "1:                                             \n\t"
-    "addu   %[accum],   %[accum],   %[x_add]      \n\t"
-    "li     %[base],    0                         \n\t"
-    "blez   %[accum],   3f                        \n\t"
-  "2:                                             \n\t"
-    "lbu    %[base],    0(%[src1])                \n\t"
-    "subu   %[accum],   %[accum],   %[x_sub]      \n\t"
-    "addu   %[src1],    %[src1],    %[x_stride]   \n\t"
-    "addu   %[sum],     %[sum],     %[base]       \n\t"
-    "bgtz   %[accum],   2b                        \n\t"
-  "3:                                             \n\t"
-    "negu   %[accum1],  %[accum]                  \n\t"
-    "mul    %[frac],    %[base],    %[accum1]     \n\t"
-    "mul    %[temp3],   %[sum],     %[x_sub]      \n\t"
-    "subu   %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
-    "sll    %[accum1],  %[frac],    2             \n\t"
-    "mult   %[temp1],   %[temp2]                  \n\t"
-    "madd   %[accum1],  %[fx_scale]               \n\t"
-    "mfhi   %[sum]                                \n\t"
-    "subu   %[temp3],   %[temp3],   %[frac]       \n\t"
-    "sw     %[temp3],   0(%[frow])                \n\t"
-    "addu   %[frow],    %[frow],    %[x_stride1]  \n\t"
-    "bgtz   %[loop_c],  1b                        \n\t"
-    : [accum] "=&r" (accum), [src1] "+r" (src1), [temp3] "=&r" (temp3),
-      [sum] "=&r" (sum), [base] "=&r" (base), [frac] "=&r" (frac),
-      [frow] "+r" (frow), [accum1] "=&r" (accum1),
-      [temp2] "=&r" (temp2), [temp1] "=&r" (temp1)
-    : [x_stride] "r" (x_stride), [fx_scale] "r" (fx_scale),
-      [x_sub] "r" (x_sub), [x_add] "r" (x_add),
-      [loop_c] "r" (loop_c), [x_stride1] "r" (x_stride1)
-    : "memory", "hi", "lo"
-  );
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    const uint8_t* src1 = src + channel;
+    int* frow = wrk->frow + channel;
+    int temp1, temp2, temp3;
+    int base, frac, sum;
+    int accum, accum1;
+    int loop_c = x_out_max;
+
+    __asm__ volatile (
+      "li     %[temp1],   0x8000                    \n\t"
+      "li     %[temp2],   0x10000                   \n\t"
+      "li     %[sum],     0                         \n\t"
+      "li     %[accum],   0                         \n\t"
+    "1:                                             \n\t"
+      "addu   %[accum],   %[accum],   %[x_add]      \n\t"
+      "li     %[base],    0                         \n\t"
+      "blez   %[accum],   3f                        \n\t"
+    "2:                                             \n\t"
+      "lbu    %[base],    0(%[src1])                \n\t"
+      "subu   %[accum],   %[accum],   %[x_sub]      \n\t"
+      "addu   %[src1],    %[src1],    %[x_stride]   \n\t"
+      "addu   %[sum],     %[sum],     %[base]       \n\t"
+      "bgtz   %[accum],   2b                        \n\t"
+    "3:                                             \n\t"
+      "negu   %[accum1],  %[accum]                  \n\t"
+      "mul    %[frac],    %[base],    %[accum1]     \n\t"
+      "mul    %[temp3],   %[sum],     %[x_sub]      \n\t"
+      "subu   %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
+      "sll    %[accum1],  %[frac],    2             \n\t"
+      "mult   %[temp1],   %[temp2]                  \n\t"
+      "madd   %[accum1],  %[fx_scale]               \n\t"
+      "mfhi   %[sum]                                \n\t"
+      "subu   %[temp3],   %[temp3],   %[frac]       \n\t"
+      "sw     %[temp3],   0(%[frow])                \n\t"
+      "addu   %[frow],    %[frow],    %[x_stride1]  \n\t"
+      "bgtz   %[loop_c],  1b                        \n\t"
+      : [accum] "=&r" (accum), [src1] "+r" (src1), [temp3] "=&r" (temp3),
+        [sum] "=&r" (sum), [base] "=&r" (base), [frac] "=&r" (frac),
+        [frow] "+r" (frow), [accum1] "=&r" (accum1),
+        [temp2] "=&r" (temp2), [temp1] "=&r" (temp1)
+      : [x_stride] "r" (x_stride), [fx_scale] "r" (fx_scale),
+        [x_sub] "r" (x_sub), [x_add] "r" (x_add),
+        [loop_c] "r" (loop_c), [x_stride1] "r" (x_stride1)
+      : "memory", "hi", "lo"
+    );
+  }
 }

-static void ImportRowExpand(WebPRescaler* const wrk,
-                            const uint8_t* const src, int channel) {
+static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const int x_add = wrk->x_add;
  const int x_sub = wrk->x_sub;
  const int src_width = wrk->src_width;
-  int* frow = wrk->frow + channel;
-  const uint8_t* src1 = src + channel;
-  int temp1, temp2, temp3, temp4;
-  int frac;
-  int accum;
  const int x_stride1 = x_stride << 2;
-  int x_out = channel;
-
+  int channel;
  assert(wrk->x_expand);
  assert(!WebPRescalerInputDone(wrk));
-  __asm__ volatile (
-    "addiu  %[temp3],   %[src_width], -1            \n\t"
-    "lbu    %[temp2],   0(%[src1])                  \n\t"
-    "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
-    "bgtz   %[temp3],   0f                          \n\t"
-    "addiu  %[temp1],   %[temp2],     0             \n\t"
-    "b      3f                                      \n\t"
-  "0:                                               \n\t"
-    "lbu    %[temp1],   0(%[src1])                  \n\t"
-  "3:                                               \n\t"
-    "addiu  %[accum],   %[x_add],     0             \n\t"
-  "1:                                               \n\t"
-    "subu   %[temp3],   %[temp2],     %[temp1]      \n\t"
-    "mul    %[temp3],   %[temp3],     %[accum]      \n\t"
-    "mul    %[temp4],   %[temp1],     %[x_add]      \n\t"
-    "addu   %[temp3],   %[temp4],     %[temp3]      \n\t"
-    "sw     %[temp3],   0(%[frow])                  \n\t"
-    "addu   %[frow],    %[frow],      %[x_stride1]  \n\t"
-    "addu   %[x_out],   %[x_out],     %[x_stride]   \n\t"
-    "subu   %[temp3],   %[x_out],     %[x_out_max]  \n\t"
-    "bgez   %[temp3],   2f                          \n\t"
-    "subu   %[accum],   %[accum],     %[x_sub]      \n\t"
-    "bgez   %[accum],   4f                          \n\t"
-    "addiu  %[temp2],   %[temp1],     0             \n\t"
-    "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
-    "lbu    %[temp1],   0(%[src1])                  \n\t"
-    "addu   %[accum],   %[accum],     %[x_add]      \n\t"
-  "4:                                               \n\t"
-    "b      1b                                      \n\t"
-  "2:                                               \n\t"
-    : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1),
-      [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp4] "=&r" (temp4),
-      [x_out] "+r" (x_out), [frac] "=&r" (frac), [frow] "+r" (frow)
-    : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub),
-      [x_stride1] "r" (x_stride1), [src_width] "r" (src_width),
-      [x_out_max] "r" (x_out_max)
-    : "memory", "hi", "lo"
-  );
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    const uint8_t* src1 = src + channel;
+    int* frow = wrk->frow + channel;
+    int temp1, temp2, temp3, temp4;
+    int frac;
+    int accum;
+    int x_out = 0;
+
+    __asm__ volatile (
+      "addiu  %[temp3],   %[src_width], -1            \n\t"
+      "lbu    %[temp2],   0(%[src1])                  \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "bgtz   %[temp3],   0f                          \n\t"
+      "addiu  %[temp1],   %[temp2],     0             \n\t"
+      "b      3f                                      \n\t"
+    "0:                                               \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+    "3:                                               \n\t"
+      "addiu  %[accum],   %[x_add],     0             \n\t"
+    "1:                                               \n\t"
+      "subu   %[temp3],   %[temp2],     %[temp1]      \n\t"
+      "mul    %[temp3],   %[temp3],     %[accum]      \n\t"
+      "mul    %[temp4],   %[temp1],     %[x_add]      \n\t"
+      "addu   %[temp3],   %[temp4],     %[temp3]      \n\t"
+      "sw     %[temp3],   0(%[frow])                  \n\t"
+      "addu   %[frow],    %[frow],      %[x_stride1]  \n\t"
+      "addu   %[x_out],   %[x_out],     %[x_stride]   \n\t"
+      "subu   %[temp3],   %[x_out],     %[x_out_max]  \n\t"
+      "bgez   %[temp3],   2f                          \n\t"
+      "subu   %[accum],   %[accum],     %[x_sub]      \n\t"
+      "bgez   %[accum],   4f                          \n\t"
+      "addiu  %[temp2],   %[temp1],     0             \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+      "addu   %[accum],   %[accum],     %[x_add]      \n\t"
+    "4:                                               \n\t"
+      "b      1b                                      \n\t"
+    "2:                                               \n\t"
+      : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1),
+        [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp4] "=&r" (temp4),
+        [x_out] "+r" (x_out), [frac] "=&r" (frac), [frow] "+r" (frow)
+      : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub),
+        [x_stride1] "r" (x_stride1), [src_width] "r" (src_width),
+        [x_out_max] "r" (x_out_max)
+      : "memory", "hi", "lo"
+    );
+  }
 }

 static void ExportRowShrink(WebPRescaler* const wrk) {
--- a/src/dsp/rescaler_mips_dsp_r2.c
+++ b/src/dsp/rescaler_mips_dsp_r2.c
@@ -18,112 +18,118 @@
 #include <assert.h>
 #include "../utils/rescaler.h"

-static void ImportRowShrink(WebPRescaler* const wrk,
-                            const uint8_t* const src, int channel) {
+static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const int fx_scale = wrk->fx_scale;
  const int x_add = wrk->x_add;
  const int x_sub = wrk->x_sub;
-  int* frow = wrk->frow + channel;
-  const uint8_t* src1 = src + channel;
-  int temp3;
-  int base, frac, sum;
-  int accum, accum1;
  const int x_stride1 = x_stride << 2;
-  int loop_c = x_out_max - channel;
-
+  int channel;
  assert(!wrk->x_expand);
  assert(!WebPRescalerInputDone(wrk));
-  __asm__ volatile (
-      "li         %[sum],     0                         \n\t"
-      "li         %[accum],   0                         \n\t"
-    "1:                                                 \n\t"
-      "addu       %[accum],   %[accum],   %[x_add]      \n\t"
-      "li         %[base],    0                         \n\t"
-      "blez       %[accum],   3f                        \n\t"
-    "2:                                                 \n\t"
-      "lbu        %[base],    0(%[src1])                \n\t"
-      "subu       %[accum],   %[accum],   %[x_sub]      \n\t"
-      "addu       %[src1],    %[src1],    %[x_stride]   \n\t"
-      "addu       %[sum],     %[sum],     %[base]       \n\t"
-      "bgtz       %[accum],   2b                        \n\t"
-    "3:                                                 \n\t"
-      "negu       %[accum1],  %[accum]                  \n\t"
-      "mul        %[frac],    %[base],    %[accum1]     \n\t"
-      "mul        %[temp3],   %[sum],     %[x_sub]      \n\t"
-      "sll        %[accum1],  %[frac],    1             \n\t"
-      "subu       %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
-      "mulq_rs.w  %[sum],     %[accum1],  %[fx_scale]   \n\t"
-      "subu       %[temp3],   %[temp3],   %[frac]       \n\t"
-      "sw         %[temp3],   0(%[frow])                \n\t"
-      "addu       %[frow],    %[frow],    %[x_stride1]  \n\t"
-      "bgtz       %[loop_c],  1b                        \n\t"
-    : [accum]"=&r"(accum), [src1]"+&r"(src1), [temp3]"=&r"(temp3),
-      [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
-      [frow]"+&r"(frow), [accum1]"=&r"(accum1),
-      [loop_c]"+&r"(loop_c)
-    : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale), [x_sub]"r"(x_sub),
-      [x_add] "r" (x_add), [x_stride1] "r" (x_stride1)
-    : "memory", "hi", "lo"
-  );
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    int* frow = wrk->frow + channel;
+    const uint8_t* src1 = src + channel;
+    int temp3;
+    int base, frac, sum;
+    int accum, accum1;
+    int loop_c = x_out_max;
+
+    __asm__ volatile (
+        "li         %[sum],     0                         \n\t"
+        "li         %[accum],   0                         \n\t"
+      "1:                                                 \n\t"
+        "addu       %[accum],   %[accum],   %[x_add]      \n\t"
+        "li         %[base],    0                         \n\t"
+        "blez       %[accum],   3f                        \n\t"
+      "2:                                                 \n\t"
+        "lbu        %[base],    0(%[src1])                \n\t"
+        "subu       %[accum],   %[accum],   %[x_sub]      \n\t"
+        "addu       %[src1],    %[src1],    %[x_stride]   \n\t"
+        "addu       %[sum],     %[sum],     %[base]       \n\t"
+        "bgtz       %[accum],   2b                        \n\t"
+      "3:                                                 \n\t"
+        "negu       %[accum1],  %[accum]                  \n\t"
+        "mul        %[frac],    %[base],    %[accum1]     \n\t"
+        "mul        %[temp3],   %[sum],     %[x_sub]      \n\t"
+        "sll        %[accum1],  %[frac],    1             \n\t"
+        "subu       %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
+        "mulq_rs.w  %[sum],     %[accum1],  %[fx_scale]   \n\t"
+        "subu       %[temp3],   %[temp3],   %[frac]       \n\t"
+        "sw         %[temp3],   0(%[frow])                \n\t"
+        "addu       %[frow],    %[frow],    %[x_stride1]  \n\t"
+        "bgtz       %[loop_c],  1b                        \n\t"
+      : [accum]"=&r"(accum), [src1]"+&r"(src1), [temp3]"=&r"(temp3),
+        [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
+        [frow]"+&r"(frow), [accum1]"=&r"(accum1),
+        [loop_c]"+&r"(loop_c)
+      : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale), [x_sub]"r"(x_sub),
+        [x_add] "r" (x_add), [x_stride1] "r" (x_stride1)
+      : "memory", "hi", "lo"
+    );
+  }
 }

-static void ImportRowExpand(WebPRescaler* const wrk,
-                            const uint8_t* const src, int channel) {
+static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const int x_add = wrk->x_add;
  const int x_sub = wrk->x_sub;
  const int src_width = wrk->src_width;
-  int* frow = wrk->frow + channel;
-  const uint8_t* src1 = src + channel;
-  int temp1, temp2, temp3, temp4;
-  int frac;
-  int accum;
  const int x_stride1 = x_stride << 2;
-  int x_out = channel;
-
+  int channel;
  assert(wrk->x_expand);
  assert(!WebPRescalerInputDone(wrk));
-  __asm__ volatile (
-    "addiu  %[temp3],   %[src_width], -1            \n\t"
-    "lbu    %[temp2],   0(%[src1])                  \n\t"
-    "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
-    "bgtz   %[temp3],   0f                          \n\t"
-    "addiu  %[temp1],   %[temp2],     0             \n\t"
-    "b      3f                                      \n\t"
-  "0:                                               \n\t"
-    "lbu    %[temp1],   0(%[src1])                  \n\t"
-  "3:                                               \n\t"
-    "addiu  %[accum],   %[x_add],     0             \n\t"
-  "1:                                               \n\t"
-    "subu   %[temp3],   %[temp2],     %[temp1]      \n\t"
-    "mul    %[temp3],   %[temp3],     %[accum]      \n\t"
-    "mul    %[temp4],   %[temp1],     %[x_add]      \n\t"
-    "addu   %[temp3],   %[temp4],     %[temp3]      \n\t"
-    "sw     %[temp3],   0(%[frow])                  \n\t"
-    "addu   %[frow],    %[frow],      %[x_stride1]  \n\t"
-    "addu   %[x_out],   %[x_out],     %[x_stride]   \n\t"
-    "subu   %[temp3],   %[x_out],     %[x_out_max]  \n\t"
-    "bgez   %[temp3],   2f                          \n\t"
-    "subu   %[accum],   %[accum],     %[x_sub]      \n\t"
-    "bgez   %[accum],   4f                          \n\t"
-    "addiu  %[temp2],   %[temp1],     0             \n\t"
-    "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
-    "lbu    %[temp1],   0(%[src1])                  \n\t"
-    "addu   %[accum],   %[accum],     %[x_add]      \n\t"
-  "4:                                               \n\t"
-    "b      1b                                      \n\t"
-  "2:                                               \n\t"
-    : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1),
-      [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp4] "=&r" (temp4),
-      [x_out] "+r" (x_out), [frac] "=&r" (frac), [frow] "+r" (frow)
-    : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub),
-      [x_stride1] "r" (x_stride1), [src_width] "r" (src_width),
-      [x_out_max] "r" (x_out_max)
-    : "memory", "hi", "lo"
-  );
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    int* frow = wrk->frow + channel;
+    const uint8_t* src1 = src + channel;
+    int temp1, temp2, temp3, temp4;
+    int frac;
+    int accum;
+    int x_out = channel;
+
+    __asm__ volatile (
+      "addiu  %[temp3],   %[src_width], -1            \n\t"
+      "lbu    %[temp2],   0(%[src1])                  \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "bgtz   %[temp3],   0f                          \n\t"
+      "addiu  %[temp1],   %[temp2],     0             \n\t"
+      "b      3f                                      \n\t"
+    "0:                                               \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+    "3:                                               \n\t"
+      "addiu  %[accum],   %[x_add],     0             \n\t"
+    "1:                                               \n\t"
+      "subu   %[temp3],   %[temp2],     %[temp1]      \n\t"
+      "mul    %[temp3],   %[temp3],     %[accum]      \n\t"
+      "mul    %[temp4],   %[temp1],     %[x_add]      \n\t"
+      "addu   %[temp3],   %[temp4],     %[temp3]      \n\t"
+      "sw     %[temp3],   0(%[frow])                  \n\t"
+      "addu   %[frow],    %[frow],      %[x_stride1]  \n\t"
+      "addu   %[x_out],   %[x_out],     %[x_stride]   \n\t"
+      "subu   %[temp3],   %[x_out],     %[x_out_max]  \n\t"
+      "bgez   %[temp3],   2f                          \n\t"
+      "subu   %[accum],   %[accum],     %[x_sub]      \n\t"
+      "bgez   %[accum],   4f                          \n\t"
+      "addiu  %[temp2],   %[temp1],     0             \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+      "addu   %[accum],   %[accum],     %[x_add]      \n\t"
+    "4:                                               \n\t"
+      "b      1b                                      \n\t"
+    "2:                                               \n\t"
+      : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1),
+        [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [temp4] "=&r" (temp4),
+        [x_out] "+r" (x_out), [frac] "=&r" (frac), [frow] "+r" (frow)
+      : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub),
+        [x_stride1] "r" (x_stride1), [src_width] "r" (src_width),
+        [x_out_max] "r" (x_out_max)
+      : "memory", "hi", "lo"
+    );
+  }
 }

 static void ExportRowShrink(WebPRescaler* const wrk) {
--- a/src/utils/rescaler.c
+++ b/src/utils/rescaler.c
@@ -102,16 +102,14 @@ int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
                       const uint8_t* src, int src_stride) {
  int total_imported = 0;
  while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) {
-    int x, channel;
    if (wrk->y_expand) {
      rescaler_t* const tmp = wrk->irow;
      wrk->irow = wrk->frow;
      wrk->frow = tmp;
    }
-    for (channel = 0; channel < wrk->num_channels; ++channel) {
-      WebPRescalerImportRow(wrk, src, channel);
-    }
+    WebPRescalerImportRow(wrk, src);
    if (!wrk->y_expand) {     // Accumulate the contribution of the new row.
+      int x;
      for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) {
        wrk->irow[x] += wrk->frow[x];
      }