diff --git a/src/dsp/dec_mips_dsp_r2.c b/src/dsp/dec_mips_dsp_r2.c
index 02e59881..9e0ec7e9 100644
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@@ -21,54 +21,194 @@ static const int kC2 = 35468;
 
 #define MUL(a, b) (((a) * (b)) >> 16)
 
+// temp0[31..16 | 15..0] = temp0[31..16 | 15..0] + temp8[31..16 | 15..0]
+// temp0[31..16 | 15..0] = temp0[31..16 <<(s) 7 | 15..0 <<(s) 7]
+// temp1..temp7 same as temp0
+// precrqu_s.qb.ph temp0, temp1, temp0 (each halfword saturated to 0..255):
+//   temp0 = temp1[30..23] | temp1[14..7] | temp0[30..23] | temp0[14..7]
+// store temp0 to dst
+// IO - input/output
+// I - input (macro doesn't change it)
+#define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7,               \
+                         I0, I1, I2, I3, I4, I5, I6, I7)                       \
+  "addq.ph          %["#IO0"],  %["#IO0"],  %["#I0"]          \n\t"            \
+  "addq.ph          %["#IO1"],  %["#IO1"],  %["#I1"]          \n\t"            \
+  "addq.ph          %["#IO2"],  %["#IO2"],  %["#I2"]          \n\t"            \
+  "addq.ph          %["#IO3"],  %["#IO3"],  %["#I3"]          \n\t"            \
+  "addq.ph          %["#IO4"],  %["#IO4"],  %["#I4"]          \n\t"            \
+  "addq.ph          %["#IO5"],  %["#IO5"],  %["#I5"]          \n\t"            \
+  "addq.ph          %["#IO6"],  %["#IO6"],  %["#I6"]          \n\t"            \
+  "addq.ph          %["#IO7"],  %["#IO7"],  %["#I7"]          \n\t"            \
+  "shll_s.ph        %["#IO0"],  %["#IO0"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO1"],  %["#IO1"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO2"],  %["#IO2"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO3"],  %["#IO3"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO4"],  %["#IO4"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO5"],  %["#IO5"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO6"],  %["#IO6"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO7"],  %["#IO7"],  7                 \n\t"            \
+  "precrqu_s.qb.ph  %["#IO0"],  %["#IO1"],  %["#IO0"]         \n\t"            \
+  "precrqu_s.qb.ph  %["#IO2"],  %["#IO3"],  %["#IO2"]         \n\t"            \
+  "precrqu_s.qb.ph  %["#IO4"],  %["#IO5"],  %["#IO4"]         \n\t"            \
+  "precrqu_s.qb.ph  %["#IO6"],  %["#IO7"],  %["#IO6"]         \n\t"            \
+  "usw              %["#IO0"],  0(%[dst])                     \n\t"            \
+  "usw              %["#IO2"],  32(%[dst])                    \n\t"            \
+  "usw              %["#IO4"],  64(%[dst])                    \n\t"            \
+  "usw              %["#IO6"],  96(%[dst])                    \n\t"
+
+// temp0[31..16 | 15..0] = temp8[31..16 | 15..0] + temp12[31..16 | 15..0]
+// temp1[31..16 | 15..0] = temp8[31..16 | 15..0] - temp12[31..16 | 15..0]
+// temp0[31..16 | 15..0] = temp0[31..16 >> 3 | 15..0 >> 3]
+// temp1[31..16 | 15..0] = temp1[31..16 >> 3 | 15..0 >> 3]
+// O - output
+// I - input (macro doesn't change it)
+#define SHIFT_R_SUM_X2(O0, O1, O2, O3, O4, O5, O6, O7,                         \
+                       I0, I1, I2, I3, I4, I5, I6, I7)                         \
+  "addq.ph          %["#O0"],   %["#I0"],   %["#I4"]          \n\t"            \
+  "subq.ph          %["#O1"],   %["#I0"],   %["#I4"]          \n\t"            \
+  "addq.ph          %["#O2"],   %["#I1"],   %["#I5"]          \n\t"            \
+  "subq.ph          %["#O3"],   %["#I1"],   %["#I5"]          \n\t"            \
+  "addq.ph          %["#O4"],   %["#I2"],   %["#I6"]          \n\t"            \
+  "subq.ph          %["#O5"],   %["#I2"],   %["#I6"]          \n\t"            \
+  "addq.ph          %["#O6"],   %["#I3"],   %["#I7"]          \n\t"            \
+  "subq.ph          %["#O7"],   %["#I3"],   %["#I7"]          \n\t"            \
+  "shra.ph          %["#O0"],   %["#O0"],   3                 \n\t"            \
+  "shra.ph          %["#O1"],   %["#O1"],   3                 \n\t"            \
+  "shra.ph          %["#O2"],   %["#O2"],   3                 \n\t"            \
+  "shra.ph          %["#O3"],   %["#O3"],   3                 \n\t"            \
+  "shra.ph          %["#O4"],   %["#O4"],   3                 \n\t"            \
+  "shra.ph          %["#O5"],   %["#O5"],   3                 \n\t"            \
+  "shra.ph          %["#O6"],   %["#O6"],   3                 \n\t"            \
+  "shra.ph          %["#O7"],   %["#O7"],   3                 \n\t"
+
+// preceu.ph.qbr temp0, temp8
+//   temp0 = 0 | temp8[15..8] | 0 | temp8[7..0]
+// preceu.ph.qbl temp1, temp8
+//   temp1 = 0 | temp8[31..24] | 0 | temp8[23..16]
+// O - output
+// I - input (macro doesn't change it)
+#define CONVERT_2_BYTES_TO_HALF(O0, O1, O2, O3, O4, O5, O6, O7,                \
+                                I0, I1, I2, I3)                                \
+  "preceu.ph.qbr    %["#O0"],   %["#I0"]                      \n\t"            \
+  "preceu.ph.qbl    %["#O1"],   %["#I0"]                      \n\t"            \
+  "preceu.ph.qbr    %["#O2"],   %["#I1"]                      \n\t"            \
+  "preceu.ph.qbl    %["#O3"],   %["#I1"]                      \n\t"            \
+  "preceu.ph.qbr    %["#O4"],   %["#I2"]                      \n\t"            \
+  "preceu.ph.qbl    %["#O5"],   %["#I2"]                      \n\t"            \
+  "preceu.ph.qbr    %["#O6"],   %["#I3"]                      \n\t"            \
+  "preceu.ph.qbl    %["#O7"],   %["#I3"]                      \n\t"
+
+// O - output
+#define LOAD_DST(O0, O1, O2, O3)                                               \
+  "ulw              %["#O0"],  0(%[dst])                      \n\t"            \
+  "ulw              %["#O1"],  32(%[dst])                     \n\t"            \
+  "ulw              %["#O2"],  64(%[dst])                     \n\t"            \
+  "ulw              %["#O3"],  96(%[dst])                     \n\t"
+
+// precrq.ph.w temp0, temp8, temp2
+//   temp0 = temp8[31..16] | temp2[31..16]
+// ins temp2, temp8, 16, 16
+//   temp2 = temp8[15..0] | temp2[15..0]
+// O - output
+// IO - input/output
+// I - input (macro doesn't change it)
+#define PACK_2_HALVES_TO_WORD(O0, O1, O2, O3,                                  \
+                              IO0, IO1, IO2, IO3,                              \
+                              I0, I1, I2, I3)                                  \
+  "precrq.ph.w      %["#O0"],    %["#I0"],  %["#IO0"]         \n\t"            \
+  "precrq.ph.w      %["#O1"],    %["#I1"],  %["#IO1"]         \n\t"            \
+  "ins              %["#IO0"],   %["#I0"],  16,    16         \n\t"            \
+  "ins              %["#IO1"],   %["#I1"],  16,    16         \n\t"            \
+  "precrq.ph.w      %["#O2"],    %["#I2"],  %["#IO2"]         \n\t"            \
+  "precrq.ph.w      %["#O3"],    %["#I3"],  %["#IO3"]         \n\t"            \
+  "ins              %["#IO2"],   %["#I2"],  16,    16         \n\t"            \
+  "ins              %["#IO3"],   %["#I3"],  16,    16         \n\t"
+
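+// O  - output: O0 = (I0 * kC2) >> 16, O1 = (I0 * kC1) >> 16, ... (I1..I3)
+// IO - input/output: IO0 += I4, IO1 += I5, IO2 -= I6, IO3 -= I7
+// I  - input (macro doesn't change it)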
+#define MUL_SHIFT_SUM(O0, O1, O2, O3, O4, O5, O6, O7,                          \
+                      IO0, IO1, IO2, IO3,                                      \
+                      I0, I1, I2, I3, I4, I5, I6, I7)                          \
+  "mul              %["#O0"],   %["#I0"],   %[kC2]            \n\t"            \
+  "mul              %["#O1"],   %["#I0"],   %[kC1]            \n\t"            \
+  "mul              %["#O2"],   %["#I1"],   %[kC2]            \n\t"            \
+  "mul              %["#O3"],   %["#I1"],   %[kC1]            \n\t"            \
+  "mul              %["#O4"],   %["#I2"],   %[kC2]            \n\t"            \
+  "mul              %["#O5"],   %["#I2"],   %[kC1]            \n\t"            \
+  "mul              %["#O6"],   %["#I3"],   %[kC2]            \n\t"            \
+  "mul              %["#O7"],   %["#I3"],   %[kC1]            \n\t"            \
+  "sra              %["#O0"],   %["#O0"],   16                \n\t"            \
+  "sra              %["#O1"],   %["#O1"],   16                \n\t"            \
+  "sra              %["#O2"],   %["#O2"],   16                \n\t"            \
+  "sra              %["#O3"],   %["#O3"],   16                \n\t"            \
+  "sra              %["#O4"],   %["#O4"],   16                \n\t"            \
+  "sra              %["#O5"],   %["#O5"],   16                \n\t"            \
+  "sra              %["#O6"],   %["#O6"],   16                \n\t"            \
+  "sra              %["#O7"],   %["#O7"],   16                \n\t"            \
+  "addu             %["#IO0"],  %["#IO0"],  %["#I4"]          \n\t"            \
+  "addu             %["#IO1"],  %["#IO1"],  %["#I5"]          \n\t"            \
+  "subu             %["#IO2"],  %["#IO2"],  %["#I6"]          \n\t"            \
+  "subu             %["#IO3"],  %["#IO3"],  %["#I7"]          \n\t"
+
+// O - output
+// I - input (macro doesn't change it)
+#define ADD_SUB_HALVES(O0, O1,                                                 \
+                       I0, I1)                                                 \
+  "addq.ph          %["#O0"],   %["#I0"],  %["#I1"]           \n\t"            \
+  "subq.ph          %["#O1"],   %["#I0"],  %["#I1"]           \n\t"
+
+// O - output
+// I - input (macro doesn't change it)
+// I[0/1] - offset in bytes
+#define LOAD_IN_X2(O0, O1,                                                     \
+                   I0, I1)                                                     \
+  "lh               %["#O0"],   "#I0"(%[in])                  \n\t"            \
+  "lh               %["#O1"],   "#I1"(%[in])                  \n\t"
+
+// O - output
+// I - input (macro doesn't change it)
+#define SRA_16(O0, O1, O2, O3,                                                 \
+               I0, I1, I2, I3)                                                 \
+  "sra              %["#O0"],  %["#I0"],  16                  \n\t"            \
+  "sra              %["#O1"],  %["#I1"],  16                  \n\t"            \
+  "sra              %["#O2"],  %["#I2"],  16                  \n\t"            \
+  "sra              %["#O3"],  %["#I3"],  16                  \n\t"
+
+// O - output
+// I - input (macro doesn't change it)
+#define INSERT_HALF_X2(O0, O1,                                                 \
+                       I0, I1)                                                 \
+  "ins              %["#O0"],   %["#I0"], 16,    16           \n\t"            \
+  "ins              %["#O1"],   %["#I1"], 16,    16           \n\t"
+
+#define OUTPUT_EARLY_CLOBBER_REGS_10()                                         \
+  : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),             \
+    [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),             \
+    [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9),             \
+    [temp10]"=&r"(temp10)
+
+#define OUTPUT_EARLY_CLOBBER_REGS_18()                                         \
+  OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
+  [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
+  [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
+  [temp17]"=&r"(temp17), [temp18]"=&r"(temp18)
+
 static void TransformDC(const int16_t* in, uint8_t* dst) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
 
   __asm__ volatile (
-    "ulw              %[temp1],  0(%[dst])              \n\t"
-    "ulw              %[temp2],  32(%[dst])             \n\t"
-    "ulw              %[temp3],  64(%[dst])             \n\t"
-    "ulw              %[temp4],  96(%[dst])             \n\t"
+    LOAD_DST(temp1, temp2, temp3, temp4)
     "lh               %[temp5],  0(%[in])               \n\t"
     "addiu            %[temp5],  %[temp5],  4           \n\t"
     "ins              %[temp5],  %[temp5],  16, 16      \n\t"
     "shra.ph          %[temp5],  %[temp5],  3           \n\t"
-    "preceu.ph.qbr    %[temp6],  %[temp1]               \n\t"
-    "preceu.ph.qbl    %[temp7],  %[temp1]               \n\t"
-    "preceu.ph.qbr    %[temp8],  %[temp2]               \n\t"
-    "preceu.ph.qbl    %[temp9],  %[temp2]               \n\t"
-    "preceu.ph.qbr    %[temp10], %[temp3]               \n\t"
-    "preceu.ph.qbl    %[temp1],  %[temp3]               \n\t"
-    "preceu.ph.qbr    %[temp2],  %[temp4]               \n\t"
-    "preceu.ph.qbl    %[temp3],  %[temp4]               \n\t"
-    "addq.ph          %[temp6],  %[temp6],  %[temp5]    \n\t"
-    "addq.ph          %[temp7],  %[temp7],  %[temp5]    \n\t"
-    "addq.ph          %[temp8],  %[temp8],  %[temp5]    \n\t"
-    "addq.ph          %[temp9],  %[temp9],  %[temp5]    \n\t"
-    "addq.ph          %[temp10], %[temp10], %[temp5]    \n\t"
-    "addq.ph          %[temp1],  %[temp1],  %[temp5]    \n\t"
-    "addq.ph          %[temp2],  %[temp2],  %[temp5]    \n\t"
-    "addq.ph          %[temp3],  %[temp3],  %[temp5]    \n\t"
-    "shll_s.ph        %[temp6],  %[temp6],  7           \n\t"
-    "shll_s.ph        %[temp7],  %[temp7],  7           \n\t"
-    "shll_s.ph        %[temp8],  %[temp8],  7           \n\t"
-    "shll_s.ph        %[temp9],  %[temp9],  7           \n\t"
-    "shll_s.ph        %[temp10], %[temp10], 7           \n\t"
-    "shll_s.ph        %[temp1],  %[temp1],  7           \n\t"
-    "shll_s.ph        %[temp2],  %[temp2],  7           \n\t"
-    "shll_s.ph        %[temp3],  %[temp3],  7           \n\t"
-    "precrqu_s.qb.ph  %[temp6],  %[temp7],  %[temp6]    \n\t"
-    "precrqu_s.qb.ph  %[temp8],  %[temp9],  %[temp8]    \n\t"
-    "precrqu_s.qb.ph  %[temp10], %[temp1],  %[temp10]   \n\t"
-    "precrqu_s.qb.ph  %[temp2],  %[temp3],  %[temp2]    \n\t"
-    "usw              %[temp6],  0(%[dst])              \n\t"
-    "usw              %[temp8],  32(%[dst])             \n\t"
-    "usw              %[temp10], 64(%[dst])             \n\t"
-    "usw              %[temp2],  96(%[dst])             \n\t"
-    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
-      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
-      [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9),
-      [temp10]"=&r"(temp10)
+    CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
+                            temp3, temp1, temp2, temp3, temp4)
+    STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
+                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5)
+
+    OUTPUT_EARLY_CLOBBER_REGS_10()
     : [in]"r"(in), [dst]"r"(dst)
     : "memory"
   );
@@ -87,81 +227,110 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
     "ins              %[c4],      %[d4],     16,       16    \n\t"
     "replv.ph         %[temp1],   %[a]                       \n\t"
     "replv.ph         %[temp4],   %[d1]                      \n\t"
-    "addq.ph          %[temp2],   %[temp1],  %[c4]           \n\t"
-    "subq.ph          %[temp3],   %[temp1],  %[c4]           \n\t"
+    ADD_SUB_HALVES(temp2, temp3, temp1, c4)
     "replv.ph         %[temp5],   %[c1]                      \n\t"
-    "addq.ph          %[temp1],   %[temp2],  %[temp4]        \n\t"
-    "subq.ph          %[temp6],   %[temp2],  %[temp4]        \n\t"
-    "addq.ph          %[temp7],   %[temp2],  %[temp5]        \n\t"
-    "subq.ph          %[temp8],   %[temp2],  %[temp5]        \n\t"
-    "addq.ph          %[temp2],   %[temp3],  %[temp4]        \n\t"
-    "subq.ph          %[temp9],   %[temp3],  %[temp4]        \n\t"
-    "addq.ph          %[temp10],  %[temp3],  %[temp5]        \n\t"
-    "subq.ph          %[temp4],   %[temp3],  %[temp5]        \n\t"
-    "shra.ph          %[temp1],   %[temp1],  3               \n\t"
-    "shra.ph          %[temp6],   %[temp6],  3               \n\t"
-    "shra.ph          %[temp7],   %[temp7],  3               \n\t"
-    "shra.ph          %[temp8],   %[temp8],  3               \n\t"
-    "shra.ph          %[temp2],   %[temp2],  3               \n\t"
-    "shra.ph          %[temp9],   %[temp9],  3               \n\t"
-    "shra.ph          %[temp10],  %[temp10], 3               \n\t"
-    "shra.ph          %[temp4],   %[temp4],  3               \n\t"
-    "ulw              %[temp3],   0(%[dst])                  \n\t"
-    "ulw              %[temp5],   32(%[dst])                 \n\t"
-    "ulw              %[temp11],  64(%[dst])                 \n\t"
-    "ulw              %[temp12],  96(%[dst])                 \n\t"
-    "preceu.ph.qbr    %[temp13],  %[temp3]                   \n\t"
-    "preceu.ph.qbl    %[temp14],  %[temp3]                   \n\t"
-    "preceu.ph.qbr    %[temp3],   %[temp5]                   \n\t"
-    "preceu.ph.qbl    %[temp15],  %[temp5]                   \n\t"
-    "preceu.ph.qbr    %[temp5],   %[temp11]                  \n\t"
-    "preceu.ph.qbl    %[temp16],  %[temp11]                  \n\t"
-    "preceu.ph.qbr    %[temp11],  %[temp12]                  \n\t"
-    "preceu.ph.qbl    %[temp17],  %[temp12]                  \n\t"
-    "precrq.ph.w      %[temp12],  %[temp7],  %[temp1]        \n\t"
-    "precrq.ph.w      %[temp18],  %[temp6],  %[temp8]        \n\t"
-    "ins              %[temp1],   %[temp7],  16,       16    \n\t"
-    "ins              %[temp8],   %[temp6],  16,       16    \n\t"
-    "precrq.ph.w      %[temp7],   %[temp10], %[temp2]        \n\t"
-    "precrq.ph.w      %[temp6],   %[temp9],  %[temp4]        \n\t"
-    "ins              %[temp2],   %[temp10], 16,       16    \n\t"
-    "ins              %[temp4],   %[temp9],  16,       16    \n\t"
-    "addq.ph          %[temp13],  %[temp13], %[temp12]       \n\t"
-    "addq.ph          %[temp14],  %[temp14], %[temp18]       \n\t"
-    "addq.ph          %[temp3],   %[temp3],  %[temp1]        \n\t"
-    "addq.ph          %[temp15],  %[temp15], %[temp8]        \n\t"
-    "addq.ph          %[temp5],   %[temp5],  %[temp2]        \n\t"
-    "addq.ph          %[temp16],  %[temp16], %[temp4]        \n\t"
-    "addq.ph          %[temp11],  %[temp11], %[temp7]        \n\t"
-    "addq.ph          %[temp17],  %[temp17], %[temp6]        \n\t"
-    "shll_s.ph        %[temp13],  %[temp13], 7               \n\t"
-    "shll_s.ph        %[temp14],  %[temp14], 7               \n\t"
-    "shll_s.ph        %[temp3],   %[temp3],  7               \n\t"
-    "shll_s.ph        %[temp15],  %[temp15], 7               \n\t"
-    "shll_s.ph        %[temp5],   %[temp5],  7               \n\t"
-    "shll_s.ph        %[temp16],  %[temp16], 7               \n\t"
-    "shll_s.ph        %[temp11],  %[temp11], 7               \n\t"
-    "shll_s.ph        %[temp17],  %[temp17], 7               \n\t"
-    "precrqu_s.qb.ph  %[temp13],  %[temp14], %[temp13]       \n\t"
-    "precrqu_s.qb.ph  %[temp3],   %[temp15], %[temp3]        \n\t"
-    "precrqu_s.qb.ph  %[temp5],   %[temp16], %[temp5]        \n\t"
-    "precrqu_s.qb.ph  %[temp11],  %[temp17], %[temp11]       \n\t"
-    "usw              %[temp13],  0(%[dst])                  \n\t"
-    "usw              %[temp3],   32(%[dst])                 \n\t"
-    "usw              %[temp5],   64(%[dst])                 \n\t"
-    "usw              %[temp11],  96(%[dst])                 \n\t"
-    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
-      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
-      [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9),
-      [temp10]"=&r"(temp10), [temp11]"=&r"(temp11), [temp12]"=&r"(temp12),
-      [temp13]"=&r"(temp13), [temp14]"=&r"(temp14), [temp15]"=&r"(temp15),
-      [temp16]"=&r"(temp16), [temp17]"=&r"(temp17), [temp18]"=&r"(temp18),
+    SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
+                   temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
+    LOAD_DST(temp3, temp5, temp11, temp12)
+    CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
+                            temp11, temp17, temp3, temp5, temp11, temp12)
+    PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
+                          temp4, temp7, temp6, temp10, temp9)
+    STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
+                     temp17, temp12, temp18, temp1, temp8, temp2, temp4,
+                     temp7, temp6)
+
+    OUTPUT_EARLY_CLOBBER_REGS_18(),
       [c4]"+&r"(c4)
     : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
     : "memory"
   );
 }
 
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+  // 4x4 block: reads in[0..15], adds the result to dst with 8-bit saturation.
+  __asm__ volatile (
+    "ulw              %[temp1],   0(%[in])                 \n\t"  // in[0..1]
+    "ulw              %[temp2],   16(%[in])                \n\t"  // in[8..9]
+    LOAD_IN_X2(temp5, temp6, 24, 26)  // in[12], in[13]
+    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)  // in[0..1] +/- in[8..9]
+    LOAD_IN_X2(temp1, temp2, 8, 10)  // in[4], in[5]
+    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+                  temp13, temp11, temp14, temp12)
+    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+    "ulw              %[temp17],  4(%[in])                 \n\t"  // in[2..3]
+    "ulw              %[temp18],  20(%[in])                \n\t"  // in[10..11]
+    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)  // in[2..3] +/- in[10..11]
+    LOAD_IN_X2(temp17, temp18, 12, 14)  // in[6], in[7]
+    LOAD_IN_X2(temp9, temp10, 28, 30)  // in[14], in[15]
+    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+                  temp15, temp4, temp16, temp17)
+    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+    // horizontal
+    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+    "repl.ph          %[temp2],   0x4                      \n\t"  // {4, 4}
+    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
+    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
+    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+                  temp6, temp17, temp8, temp18)
+    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+                  temp18, temp12, temp17, temp16)
+    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+                   temp6)
+    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+                          temp16, temp11, temp10, temp15, temp14)
+    LOAD_DST(temp10, temp11, temp14, temp15)  // 4 bytes at dst + 0/32/64/96
+    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+                            temp11, temp10, temp11, temp14, temp15)
+    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4)
+
+    OUTPUT_EARLY_CLOBBER_REGS_18()
+    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
+    : "memory", "hi", "lo"  // mul leaves HI/LO unpredictable
+  );
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  // First block always; second (next 16 coeffs, dst + 4) only if requested.
+  TransformOne(in, dst);
+  if (!do_two) return;
+  TransformOne(in + 16, dst + 4);
+}
+
+#undef OUTPUT_EARLY_CLOBBER_REGS_18
+#undef OUTPUT_EARLY_CLOBBER_REGS_10
+#undef INSERT_HALF_X2
+#undef SRA_16
+#undef LOAD_IN_X2
+#undef ADD_SUB_HALVES
+#undef MUL_SHIFT_SUM
+#undef PACK_2_HALVES_TO_WORD
+#undef LOAD_DST
+#undef CONVERT_2_BYTES_TO_HALF
+#undef SHIFT_R_SUM_X2
+#undef STORE_SAT_SUM_X2
+#undef MUL
+
 #endif  // WEBP_USE_MIPS_DSP_R2
 
 //------------------------------------------------------------------------------
@@ -173,5 +342,6 @@ void VP8DspInitMIPSdspR2(void) {
 #if defined(WEBP_USE_MIPS_DSP_R2)
   VP8TransformDC = TransformDC;
   VP8TransformAC3 = TransformAC3;
+  VP8Transform = TransformTwo;
 #endif
 }