diff --git a/src/dsp/dec_mips32.c b/src/dsp/dec_mips32.c
index e85bbb15..4e9ef426 100644
--- a/src/dsp/dec_mips32.c
+++ b/src/dsp/dec_mips32.c
@@ -391,7 +391,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "sra      %[temp7],  %[temp7],  3                  \n\t"
     "sra      %[temp4],  %[temp4],  3                  \n\t"
     "addiu    %[temp6],  $zero,     255                \n\t"
-    "lbu      %[temp1],  0+0*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
     "addu     %[temp1],  %[temp1],  %[temp5]           \n\t"
     "sra      %[temp5],  %[temp1],  8                  \n\t"
     "sra      %[temp18], %[temp1],  31                 \n\t"
@@ -399,8 +399,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
     "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
   "1:                                                  \n\t"
-    "lbu      %[temp18], 1+0*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp1],  0+0*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
     "addu     %[temp18], %[temp18], %[temp11]          \n\t"
     "sra      %[temp11], %[temp18], 8                  \n\t"
     "sra      %[temp1],  %[temp18], 31                 \n\t"
@@ -408,8 +408,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp18], %[temp18], %[temp18]          \n\t"
     "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
   "2:                                                  \n\t"
-    "lbu      %[temp1],  2+0*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp18], 1+0*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
     "addu     %[temp1],  %[temp1],  %[temp8]           \n\t"
     "sra      %[temp8],  %[temp1],  8                  \n\t"
     "sra      %[temp18], %[temp1],  31                 \n\t"
@@ -417,8 +417,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
     "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
   "3:                                                  \n\t"
-    "lbu      %[temp18], 3+0*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp1],  2+0*"XSTR(BPS)"(%[dst])       \n\t"
+    "lbu      %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
     "addu     %[temp18], %[temp18], %[temp16]          \n\t"
     "sra      %[temp16], %[temp18], 8                  \n\t"
     "sra      %[temp1],  %[temp18], 31                 \n\t"
@@ -426,11 +426,11 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp18], %[temp18], %[temp18]          \n\t"
     "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
   "4:                                                  \n\t"
-    "sb       %[temp18], 3+0*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp5],  0+1*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp8],  1+1*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp11], 2+1*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp16], 3+1*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
     "addu     %[temp5],  %[temp5],  %[temp17]          \n\t"
     "addu     %[temp8],  %[temp8],  %[temp15]          \n\t"
     "addu     %[temp11], %[temp11], %[temp12]          \n\t"
@@ -459,14 +459,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp16], %[temp16], %[temp16]          \n\t"
     "movz     %[temp16], %[temp6],  %[temp15]          \n\t"
   "8:                                                  \n\t"
-    "sb       %[temp5],  0+1*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp8],  1+1*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp11], 2+1*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp16], 3+1*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp5],  0+2*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp8],  1+2*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp11], 2+2*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp16], 3+2*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
     "addu     %[temp5],  %[temp5],  %[temp9]           \n\t"
     "addu     %[temp8],  %[temp8],  %[temp3]           \n\t"
     "addu     %[temp11], %[temp11], %[temp0]           \n\t"
@@ -495,14 +495,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp16], %[temp16], %[temp16]          \n\t"
     "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
   "12:                                                 \n\t"
-    "sb       %[temp5],  0+2*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp8],  1+2*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp11], 2+2*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp16], 3+2*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp5],  0+3*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp8],  1+3*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp11], 2+3*"XSTR(BPS)"(%[dst])       \n\t"
-    "lbu      %[temp16], 3+3*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"
     "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
     "addu     %[temp8],  %[temp8],  %[temp7]           \n\t"
     "addu     %[temp11], %[temp11], %[temp4]           \n\t"
@@ -531,10 +531,10 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "xor      %[temp16], %[temp16], %[temp16]          \n\t"
     "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
   "16:                                                 \n\t"
-    "sb       %[temp5],  0+3*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp8],  1+3*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp11], 2+3*"XSTR(BPS)"(%[dst])       \n\t"
-    "sb       %[temp16], 3+3*"XSTR(BPS)"(%[dst])       \n\t"
+    "sb       %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"
 
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
diff --git a/src/dsp/dec_mips_dsp_r2.c b/src/dsp/dec_mips_dsp_r2.c
index 40e4d821..db5c6572 100644
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@@ -548,10 +548,10 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
 // TEMP3 = SRC[D + D1 * BPS]
 #define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3,                               \
                      A, A1, B, B1, C, C1, D, D1, SRC)                          \
-  "lbu      %[" #TEMP0 "],   " #A "+" #A1 "*"XSTR(BPS)"(%[" #SRC "]) \n\t"     \
-  "lbu      %[" #TEMP1 "],   " #B "+" #B1 "*"XSTR(BPS)"(%[" #SRC "]) \n\t"     \
-  "lbu      %[" #TEMP2 "],   " #C "+" #C1 "*"XSTR(BPS)"(%[" #SRC "]) \n\t"     \
-  "lbu      %[" #TEMP3 "],   " #D "+" #D1 "*"XSTR(BPS)"(%[" #SRC "]) \n\t"     \
+  "lbu      %[" #TEMP0 "],   " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP1 "],   " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP2 "],   " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP3 "],   " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
 
 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
   int i;
@@ -623,8 +623,8 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
 // DST[A * BPS]     = TEMP0
 // DST[B + C * BPS] = TEMP1
 #define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST)                              \
-  "usw    %[" #TEMP0 "],   " #A "*"XSTR(BPS)"(%[" #DST "])         \n\t"       \
-  "usw    %[" #TEMP1 "],   " #B "+" #C "*"XSTR(BPS)"(%[" #DST "])  \n\t"
+  "usw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #DST "])         \n\t"     \
+  "usw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #DST "])  \n\t"
 
 static void VE4(uint8_t* dst) {    // vertical
   const uint8_t* top = dst - BPS;
@@ -659,7 +659,7 @@ static void VE4(uint8_t* dst) {    // vertical
 static void DC4(uint8_t* dst) {   // DC
   int temp0, temp1, temp2, temp3, temp4;
   __asm__ volatile (
-    "ulw          %[temp0],   -1*"XSTR(BPS)"(%[dst])   \n\t"
+    "ulw          %[temp0],   -1*" XSTR(BPS) "(%[dst]) \n\t"
     LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
     "ins          %[temp1],   %[temp2],    8,     8    \n\t"
     "ins          %[temp1],   %[temp3],    16,    8    \n\t"
@@ -683,7 +683,7 @@ static void RD4(uint8_t* dst) {   // Down-right
   int temp5, temp6, temp7, temp8;
   __asm__ volatile (
     LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
-    "ulw            %[temp7],   -1-"XSTR(BPS)"(%[dst])         \n\t"
+    "ulw            %[temp7],   -1-" XSTR(BPS) "(%[dst])       \n\t"
     "ins            %[temp1],   %[temp0], 16, 16               \n\t"
     "preceu.ph.qbr  %[temp5],   %[temp7]                       \n\t"
     "ins            %[temp2],   %[temp1], 16, 16               \n\t"
@@ -702,7 +702,7 @@ static void RD4(uint8_t* dst) {   // Down-right
     "shll.ph        %[temp0],   %[temp0], 1                    \n\t"
     "shra_r.ph      %[temp1],   %[temp1], 2                    \n\t"
     "addq.ph        %[temp8],   %[temp0], %[temp8]             \n\t"
-    "lbu            %[temp5],   3-"XSTR(BPS)"(%[dst])          \n\t"
+    "lbu            %[temp5],   3-" XSTR(BPS) "(%[dst])        \n\t"
     "precrq.ph.w    %[temp7],   %[temp7], %[temp7]             \n\t"
     "shra_r.ph      %[temp8],   %[temp8], 2                    \n\t"
     "ins            %[temp7],   %[temp5], 0,  8                \n\t"
@@ -725,8 +725,8 @@ static void RD4(uint8_t* dst) {   // Down-right
 // TEMP0 = SRC[A * BPS]
 // TEMP1 = SRC[B + C * BPS]
 #define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC)                               \
-  "ulw    %[" #TEMP0 "],   " #A "*"XSTR(BPS)"(%[" #SRC "])         \n\t"       \
-  "ulw    %[" #TEMP1 "],   " #B "+" #C "*"XSTR(BPS)"(%[" #SRC "])  \n\t"
+  "ulw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #SRC "])         \n\t"     \
+  "ulw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "])  \n\t"
 
 static void LD4(uint8_t* dst) {   // Down-Left
   int temp0, temp1, temp2, temp3, temp4;
diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c
index b50e08b2..fd10143d 100644
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@@ -59,58 +59,58 @@ static const int kC2 = 35468;
 // A - offset in bytes to load from ref and store to dst buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
 #define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12)                       \
-  "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4             \n\t"            \
-  "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
-  "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
-  "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]        \n\t"            \
-  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]        \n\t"            \
-  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]        \n\t"            \
-  "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]        \n\t"            \
-  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\t"            \
-  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16            \n\t"            \
-  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16            \n\t"            \
-  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16            \n\t"            \
-  "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
-  "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]  \n\t"            \
-  "addu    %[" #TEMP0 "],    %[temp16],      %[temp19]       \n\t"            \
-  "addu    %[" #TEMP4 "],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %[" #TEMP8 "],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %[" #TEMP12 "],   %[temp16],      %[temp19]       \n\t"            \
-  "lw      %[temp20],      0(%[args])                        \n\t"            \
-  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    3             \n\t"            \
-  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    3             \n\t"            \
-  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    3             \n\t"            \
-  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   3             \n\t"            \
-  "lbu     %[temp16],      0+"XSTR(BPS)"*" #A "(%[temp20])   \n\t"            \
-  "lbu     %[temp17],      1+"XSTR(BPS)"*" #A "(%[temp20])   \n\t"            \
-  "lbu     %[temp18],      2+"XSTR(BPS)"*" #A "(%[temp20])   \n\t"            \
-  "lbu     %[temp19],      3+"XSTR(BPS)"*" #A "(%[temp20])   \n\t"            \
-  "addu    %[" #TEMP0 "],    %[temp16],      %[" #TEMP0 "]   \n\t"            \
-  "addu    %[" #TEMP4 "],    %[temp17],      %[" #TEMP4 "]   \n\t"            \
-  "addu    %[" #TEMP8 "],    %[temp18],      %[" #TEMP8 "]   \n\t"            \
-  "addu    %[" #TEMP12 "],   %[temp19],      %[" #TEMP12 "]  \n\t"            \
-  "slt     %[temp16],      %[" #TEMP0 "],    $zero           \n\t"            \
-  "slt     %[temp17],      %[" #TEMP4 "],    $zero           \n\t"            \
-  "slt     %[temp18],      %[" #TEMP8 "],    $zero           \n\t"            \
-  "slt     %[temp19],      %[" #TEMP12 "],   $zero           \n\t"            \
-  "movn    %[" #TEMP0 "],    $zero,          %[temp16]       \n\t"            \
-  "movn    %[" #TEMP4 "],    $zero,          %[temp17]       \n\t"            \
-  "movn    %[" #TEMP8 "],    $zero,          %[temp18]       \n\t"            \
-  "movn    %[" #TEMP12 "],   $zero,          %[temp19]       \n\t"            \
-  "addiu   %[temp20],      $zero,          255               \n\t"            \
-  "slt     %[temp16],      %[" #TEMP0 "],    %[temp20]       \n\t"            \
-  "slt     %[temp17],      %[" #TEMP4 "],    %[temp20]       \n\t"            \
-  "slt     %[temp18],      %[" #TEMP8 "],    %[temp20]       \n\t"            \
-  "slt     %[temp19],      %[" #TEMP12 "],   %[temp20]       \n\t"            \
-  "movz    %[" #TEMP0 "],    %[temp20],      %[temp16]       \n\t"            \
-  "movz    %[" #TEMP4 "],    %[temp20],      %[temp17]       \n\t"            \
-  "lw      %[temp16],      8(%[args])                        \n\t"            \
-  "movz    %[" #TEMP8 "],    %[temp20],      %[temp18]       \n\t"            \
-  "movz    %[" #TEMP12 "],   %[temp20],      %[temp19]       \n\t"            \
-  "sb      %[" #TEMP0 "],    0+"XSTR(BPS)"*" #A "(%[temp16]) \n\t"            \
-  "sb      %[" #TEMP4 "],    1+"XSTR(BPS)"*" #A "(%[temp16]) \n\t"            \
-  "sb      %[" #TEMP8 "],    2+"XSTR(BPS)"*" #A "(%[temp16]) \n\t"            \
-  "sb      %[" #TEMP12 "],   3+"XSTR(BPS)"*" #A "(%[temp16]) \n\t"
+  "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4               \n\t"          \
+  "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]          \n\t"          \
+  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]          \n\t"          \
+  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]          \n\t"          \
+  "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]          \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16              \n\t"          \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16              \n\t"          \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16              \n\t"          \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16              \n\t"          \
+  "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]    \n\t"          \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[temp19]         \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[temp18]         \n\t"          \
+  "subu    %[" #TEMP8 "],    %[temp17],      %[temp18]         \n\t"          \
+  "subu    %[" #TEMP12 "],   %[temp16],      %[temp19]         \n\t"          \
+  "lw      %[temp20],      0(%[args])                          \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    3               \n\t"          \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    3               \n\t"          \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    3               \n\t"          \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   3               \n\t"          \
+  "lbu     %[temp16],      0+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp17],      1+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp18],      2+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp19],      3+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[" #TEMP0 "]     \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[" #TEMP4 "]     \n\t"          \
+  "addu    %[" #TEMP8 "],    %[temp18],      %[" #TEMP8 "]     \n\t"          \
+  "addu    %[" #TEMP12 "],   %[temp19],      %[" #TEMP12 "]    \n\t"          \
+  "slt     %[temp16],      %[" #TEMP0 "],    $zero             \n\t"          \
+  "slt     %[temp17],      %[" #TEMP4 "],    $zero             \n\t"          \
+  "slt     %[temp18],      %[" #TEMP8 "],    $zero             \n\t"          \
+  "slt     %[temp19],      %[" #TEMP12 "],   $zero             \n\t"          \
+  "movn    %[" #TEMP0 "],    $zero,          %[temp16]         \n\t"          \
+  "movn    %[" #TEMP4 "],    $zero,          %[temp17]         \n\t"          \
+  "movn    %[" #TEMP8 "],    $zero,          %[temp18]         \n\t"          \
+  "movn    %[" #TEMP12 "],   $zero,          %[temp19]         \n\t"          \
+  "addiu   %[temp20],      $zero,          255                 \n\t"          \
+  "slt     %[temp16],      %[" #TEMP0 "],    %[temp20]         \n\t"          \
+  "slt     %[temp17],      %[" #TEMP4 "],    %[temp20]         \n\t"          \
+  "slt     %[temp18],      %[" #TEMP8 "],    %[temp20]         \n\t"          \
+  "slt     %[temp19],      %[" #TEMP12 "],   %[temp20]         \n\t"          \
+  "movz    %[" #TEMP0 "],    %[temp20],      %[temp16]         \n\t"          \
+  "movz    %[" #TEMP4 "],    %[temp20],      %[temp17]         \n\t"          \
+  "lw      %[temp16],      8(%[args])                          \n\t"          \
+  "movz    %[" #TEMP8 "],    %[temp20],      %[temp18]         \n\t"          \
+  "movz    %[" #TEMP12 "],   %[temp20],      %[temp19]         \n\t"          \
+  "sb      %[" #TEMP0 "],    0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP4 "],    1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP8 "],    2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
 
 // Does one or two inverse transforms.
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
@@ -253,39 +253,39 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 // A - offset in bytes to load from a and b buffers
 // E..H - offsets in bytes to store first results to tmp buffer
 // E1..H1 - offsets in bytes to store second results to tmp buffer
-#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1)                \
-  "lbu    %[temp0],  0+"XSTR(BPS)"*" #A "(%[a])  \n\t"                \
-  "lbu    %[temp1],  1+"XSTR(BPS)"*" #A "(%[a])  \n\t"                \
-  "lbu    %[temp2],  2+"XSTR(BPS)"*" #A "(%[a])  \n\t"                \
-  "lbu    %[temp3],  3+"XSTR(BPS)"*" #A "(%[a])  \n\t"                \
-  "lbu    %[temp4],  0+"XSTR(BPS)"*" #A "(%[b])  \n\t"                \
-  "lbu    %[temp5],  1+"XSTR(BPS)"*" #A "(%[b])  \n\t"                \
-  "lbu    %[temp6],  2+"XSTR(BPS)"*" #A "(%[b])  \n\t"                \
-  "lbu    %[temp7],  3+"XSTR(BPS)"*" #A "(%[b])  \n\t"                \
-  "addu   %[temp8],  %[temp0],    %[temp2]       \n\t"                \
-  "subu   %[temp0],  %[temp0],    %[temp2]       \n\t"                \
-  "addu   %[temp2],  %[temp1],    %[temp3]       \n\t"                \
-  "subu   %[temp1],  %[temp1],    %[temp3]       \n\t"                \
-  "addu   %[temp3],  %[temp4],    %[temp6]       \n\t"                \
-  "subu   %[temp4],  %[temp4],    %[temp6]       \n\t"                \
-  "addu   %[temp6],  %[temp5],    %[temp7]       \n\t"                \
-  "subu   %[temp5],  %[temp5],    %[temp7]       \n\t"                \
-  "addu   %[temp7],  %[temp8],    %[temp2]       \n\t"                \
-  "subu   %[temp2],  %[temp8],    %[temp2]       \n\t"                \
-  "addu   %[temp8],  %[temp0],    %[temp1]       \n\t"                \
-  "subu   %[temp0],  %[temp0],    %[temp1]       \n\t"                \
-  "addu   %[temp1],  %[temp3],    %[temp6]       \n\t"                \
-  "subu   %[temp3],  %[temp3],    %[temp6]       \n\t"                \
-  "addu   %[temp6],  %[temp4],    %[temp5]       \n\t"                \
-  "subu   %[temp4],  %[temp4],    %[temp5]       \n\t"                \
-  "sw     %[temp7],  " #E "(%[tmp])              \n\t"                \
-  "sw     %[temp2],  " #H "(%[tmp])              \n\t"                \
-  "sw     %[temp8],  " #F "(%[tmp])              \n\t"                \
-  "sw     %[temp0],  " #G "(%[tmp])              \n\t"                \
-  "sw     %[temp1],  " #E1 "(%[tmp])             \n\t"                \
-  "sw     %[temp3],  " #H1 "(%[tmp])             \n\t"                \
-  "sw     %[temp6],  " #F1 "(%[tmp])             \n\t"                \
-  "sw     %[temp4],  " #G1 "(%[tmp])             \n\t"
+#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1)                  \
+  "lbu    %[temp0],  0+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp1],  1+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp2],  2+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp3],  3+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp4],  0+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp5],  1+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp6],  2+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp7],  3+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp2]         \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp2]         \n\t"                \
+  "addu   %[temp2],  %[temp1],    %[temp3]         \n\t"                \
+  "subu   %[temp1],  %[temp1],    %[temp3]         \n\t"                \
+  "addu   %[temp3],  %[temp4],    %[temp6]         \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp6]         \n\t"                \
+  "addu   %[temp6],  %[temp5],    %[temp7]         \n\t"                \
+  "subu   %[temp5],  %[temp5],    %[temp7]         \n\t"                \
+  "addu   %[temp7],  %[temp8],    %[temp2]         \n\t"                \
+  "subu   %[temp2],  %[temp8],    %[temp2]         \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp1]         \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp1]         \n\t"                \
+  "addu   %[temp1],  %[temp3],    %[temp6]         \n\t"                \
+  "subu   %[temp3],  %[temp3],    %[temp6]         \n\t"                \
+  "addu   %[temp6],  %[temp4],    %[temp5]         \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp5]         \n\t"                \
+  "sw     %[temp7],  " #E "(%[tmp])                \n\t"                \
+  "sw     %[temp2],  " #H "(%[tmp])                \n\t"                \
+  "sw     %[temp8],  " #F "(%[tmp])                \n\t"                \
+  "sw     %[temp0],  " #G "(%[tmp])                \n\t"                \
+  "sw     %[temp1],  " #E1 "(%[tmp])               \n\t"                \
+  "sw     %[temp3],  " #H1 "(%[tmp])               \n\t"                \
+  "sw     %[temp6],  " #F1 "(%[tmp])               \n\t"                \
+  "sw     %[temp4],  " #G1 "(%[tmp])               \n\t"
 
 // macro for one vertical pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
@@ -412,39 +412,39 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A - offset in bytes to load from src and ref buffers
 // TEMP0..TEMP3 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                \
-  "lw     %[" #TEMP1 "],  0(%[args])                         \n\t"    \
-  "lw     %[" #TEMP2 "],  4(%[args])                         \n\t"    \
-  "lbu    %[temp16],    0+"XSTR(BPS)"*" #A "(%[" #TEMP1 "])  \n\t"    \
-  "lbu    %[temp17],    0+"XSTR(BPS)"*" #A "(%[" #TEMP2 "])  \n\t"    \
-  "lbu    %[temp18],    1+"XSTR(BPS)"*" #A "(%[" #TEMP1 "])  \n\t"    \
-  "lbu    %[temp19],    1+"XSTR(BPS)"*" #A "(%[" #TEMP2 "])  \n\t"    \
-  "subu   %[temp20],    %[temp16],    %[temp17]              \n\t"    \
-  "lbu    %[temp16],    2+"XSTR(BPS)"*" #A "(%[" #TEMP1 "])  \n\t"    \
-  "lbu    %[temp17],    2+"XSTR(BPS)"*" #A "(%[" #TEMP2 "])  \n\t"    \
-  "subu   %[" #TEMP0 "],  %[temp18],    %[temp19]            \n\t"    \
-  "lbu    %[temp18],    3+"XSTR(BPS)"*" #A "(%[" #TEMP1 "])  \n\t"    \
-  "lbu    %[temp19],    3+"XSTR(BPS)"*" #A "(%[" #TEMP2 "])  \n\t"    \
-  "subu   %[" #TEMP1 "],  %[temp16],    %[temp17]            \n\t"    \
-  "subu   %[" #TEMP2 "],  %[temp18],    %[temp19]            \n\t"    \
-  "addu   %[" #TEMP3 "],  %[temp20],    %[" #TEMP2 "]        \n\t"    \
-  "subu   %[" #TEMP2 "],  %[temp20],    %[" #TEMP2 "]        \n\t"    \
-  "addu   %[temp20],    %[" #TEMP0 "],  %[" #TEMP1 "]        \n\t"    \
-  "subu   %[" #TEMP0 "],  %[" #TEMP0 "],  %[" #TEMP1 "]      \n\t"    \
-  "mul    %[temp16],    %[" #TEMP2 "],  %[c5352]             \n\t"    \
-  "mul    %[temp17],    %[" #TEMP2 "],  %[c2217]             \n\t"    \
-  "mul    %[temp18],    %[" #TEMP0 "],  %[c5352]             \n\t"    \
-  "mul    %[temp19],    %[" #TEMP0 "],  %[c2217]             \n\t"    \
-  "addu   %[" #TEMP1 "],  %[" #TEMP3 "],  %[temp20]          \n\t"    \
-  "subu   %[temp20],    %[" #TEMP3 "],  %[temp20]            \n\t"    \
-  "sll    %[" #TEMP0 "],  %[" #TEMP1 "],  3                  \n\t"    \
-  "sll    %[" #TEMP2 "],  %[temp20],    3                    \n\t"    \
-  "addiu  %[temp16],    %[temp16],    1812                   \n\t"    \
-  "addiu  %[temp17],    %[temp17],    937                    \n\t"    \
-  "addu   %[temp16],    %[temp16],    %[temp19]              \n\t"    \
-  "subu   %[temp17],    %[temp17],    %[temp18]              \n\t"    \
-  "sra    %[" #TEMP1 "],  %[temp16],    9                    \n\t"    \
-  "sra    %[" #TEMP3 "],  %[temp17],    9                    \n\t"
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                  \
+  "lw     %[" #TEMP1 "],  0(%[args])                           \n\t"    \
+  "lw     %[" #TEMP2 "],  4(%[args])                           \n\t"    \
+  "lbu    %[temp16],    0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp17],    0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "lbu    %[temp18],    1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp19],    1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[temp20],    %[temp16],    %[temp17]                \n\t"    \
+  "lbu    %[temp16],    2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp17],    2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[" #TEMP0 "],  %[temp18],    %[temp19]              \n\t"    \
+  "lbu    %[temp18],    3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp19],    3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[" #TEMP1 "],  %[temp16],    %[temp17]              \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp18],    %[temp19]              \n\t"    \
+  "addu   %[" #TEMP3 "],  %[temp20],    %[" #TEMP2 "]          \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp20],    %[" #TEMP2 "]          \n\t"    \
+  "addu   %[temp20],    %[" #TEMP0 "],  %[" #TEMP1 "]          \n\t"    \
+  "subu   %[" #TEMP0 "],  %[" #TEMP0 "],  %[" #TEMP1 "]        \n\t"    \
+  "mul    %[temp16],    %[" #TEMP2 "],  %[c5352]               \n\t"    \
+  "mul    %[temp17],    %[" #TEMP2 "],  %[c2217]               \n\t"    \
+  "mul    %[temp18],    %[" #TEMP0 "],  %[c5352]               \n\t"    \
+  "mul    %[temp19],    %[" #TEMP0 "],  %[c2217]               \n\t"    \
+  "addu   %[" #TEMP1 "],  %[" #TEMP3 "],  %[temp20]            \n\t"    \
+  "subu   %[temp20],    %[" #TEMP3 "],  %[temp20]              \n\t"    \
+  "sll    %[" #TEMP0 "],  %[" #TEMP1 "],  3                    \n\t"    \
+  "sll    %[" #TEMP2 "],  %[temp20],    3                      \n\t"    \
+  "addiu  %[temp16],    %[temp16],    1812                     \n\t"    \
+  "addiu  %[temp17],    %[temp17],    937                      \n\t"    \
+  "addu   %[temp16],    %[temp16],    %[temp19]                \n\t"    \
+  "subu   %[temp17],    %[temp17],    %[temp18]                \n\t"    \
+  "sra    %[" #TEMP1 "],  %[temp16],    9                      \n\t"    \
+  "sra    %[" #TEMP3 "],  %[temp17],    9                      \n\t"
 
 // macro for one vertical pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
diff --git a/src/dsp/enc_mips_dsp_r2.c b/src/dsp/enc_mips_dsp_r2.c
index 44f6fd25..7c814fa0 100644
--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
@@ -79,8 +79,8 @@ static const int kC2 = 35468;
 #define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                         \
   "lw              %[" #TEMP0 "],   0(%[args])                          \n\t"  \
   "lw              %[" #TEMP1 "],   4(%[args])                          \n\t"  \
-  "lw              %[" #TEMP2 "],   "XSTR(BPS)"*" #A "(%[" #TEMP0 "])   \n\t"  \
-  "lw              %[" #TEMP3 "],   "XSTR(BPS)"*" #A "(%[" #TEMP1 "])   \n\t"  \
+  "lw              %[" #TEMP2 "],   " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t"  \
+  "lw              %[" #TEMP3 "],   " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t"  \
   "preceu.ph.qbl   %[" #TEMP0 "],   %[" #TEMP2 "]                       \n\t"  \
   "preceu.ph.qbl   %[" #TEMP1 "],   %[" #TEMP3 "]                       \n\t"  \
   "preceu.ph.qbr   %[" #TEMP2 "],   %[" #TEMP2 "]                       \n\t"  \
@@ -328,13 +328,13 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 //------------------------------------------------------------------------------
 // Intra predictions
 
-#define FILL_PART(J, SIZE)                                          \
-    "usw        %[value],  0+" #J "*"XSTR(BPS)"(%[dst])  \n\t"      \
-    "usw        %[value],  4+" #J "*"XSTR(BPS)"(%[dst])  \n\t"      \
-  ".if " #SIZE " == 16                                   \n\t"      \
-    "usw        %[value],  8+" #J "*"XSTR(BPS)"(%[dst])  \n\t"      \
-    "usw        %[value], 12+" #J "*"XSTR(BPS)"(%[dst])  \n\t"      \
-  ".endif                                                \n\t"
+#define FILL_PART(J, SIZE)                                            \
+    "usw        %[value],  0+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
+    "usw        %[value],  4+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
+  ".if " #SIZE " == 16                                     \n\t"      \
+    "usw        %[value],  8+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
+    "usw        %[value], 12+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
+  ".endif                                                  \n\t"
 
 #define FILL_8_OR_16(DST, VALUE, SIZE) do {                         \
   int value = (VALUE);                                              \
@@ -597,10 +597,10 @@ static void DC4(uint8_t* dst, const uint8_t* top) {
     "addiu        %[temp0],   %[temp0],    4          \n\t"
     "srl          %[temp0],   %[temp0],    3          \n\t"
     "replv.qb     %[temp0],   %[temp0]                \n\t"
-    "usw          %[temp0],   0*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw          %[temp0],   1*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw          %[temp0],   2*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw          %[temp0],   3*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw          %[temp0],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw          %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw          %[temp0],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw          %[temp0],   3*" XSTR(BPS) "(%[dst]) \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
     : [top]"r"(top), [dst]"r"(dst)
     : "memory"
@@ -650,10 +650,10 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
     "shll_s.ph        %[temp5],  %[temp5],   7                 \n\t"
     "precrqu_s.qb.ph  %[temp2],  %[temp3],   %[temp2]          \n\t"
     "precrqu_s.qb.ph  %[temp3],  %[temp4],   %[temp5]          \n\t"
-    "usw              %[temp1],  0*"XSTR(BPS)"(%[dst])         \n\t"
-    "usw              %[temp0],  1*"XSTR(BPS)"(%[dst])         \n\t"
-    "usw              %[temp3],  2*"XSTR(BPS)"(%[dst])         \n\t"
-    "usw              %[temp2],  3*"XSTR(BPS)"(%[dst])         \n\t"
+    "usw              %[temp1],  0*" XSTR(BPS) "(%[dst])       \n\t"
+    "usw              %[temp0],  1*" XSTR(BPS) "(%[dst])       \n\t"
+    "usw              %[temp3],  2*" XSTR(BPS) "(%[dst])       \n\t"
+    "usw              %[temp2],  3*" XSTR(BPS) "(%[dst])       \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [a10]"=&r"(a10), [a32]"=&r"(a32)
@@ -681,10 +681,10 @@ static void VE4(uint8_t* dst, const uint8_t* top) {
     "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
     "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
     "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
-    "usw             %[temp4],   0*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw             %[temp4],   1*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw             %[temp4],   2*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw             %[temp4],   3*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw             %[temp4],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp4],   1*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp4],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [temp6]"=&r"(temp6)
@@ -717,10 +717,10 @@ static void HE4(uint8_t* dst, const uint8_t* top) {
     "srl             %[temp2],   %[temp2],    16         \n\t"
     "replv.qb        %[temp3],   %[temp3]                \n\t"
     "replv.qb        %[temp2],   %[temp2]                \n\t"
-    "usw             %[temp3],   0*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw             %[temp0],   1*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw             %[temp2],   2*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw             %[temp1],   3*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp2],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp1],   3*" XSTR(BPS) "(%[dst]) \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [temp6]"=&r"(temp6)
@@ -763,12 +763,12 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
     "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]    \n\t"
     "shra_r.w        %[temp0],    %[temp0],    2           \n\t"
     "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]   \n\t"
-    "usw             %[temp9],    3*"XSTR(BPS)"(%[dst])    \n\t"
-    "usw             %[temp10],   1*"XSTR(BPS)"(%[dst])    \n\t"
+    "usw             %[temp9],    3*" XSTR(BPS) "(%[dst])  \n\t"
+    "usw             %[temp10],   1*" XSTR(BPS) "(%[dst])  \n\t"
     "prepend         %[temp9],    %[temp11],   8           \n\t"
     "prepend         %[temp10],   %[temp0],    8           \n\t"
-    "usw             %[temp9],    2*"XSTR(BPS)"(%[dst])    \n\t"
-    "usw             %[temp10],   0*"XSTR(BPS)"(%[dst])    \n\t"
+    "usw             %[temp9],    2*" XSTR(BPS) "(%[dst])  \n\t"
+    "usw             %[temp10],   0*" XSTR(BPS) "(%[dst])  \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
@@ -812,13 +812,13 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
     "append           %[temp3],   %[temp1],    16         \n\t"
     "precr.qb.ph      %[temp8],   %[temp8],    %[temp4]   \n\t"
     "precr.qb.ph      %[temp3],   %[temp2],    %[temp3]   \n\t"
-    "usw              %[temp8],   0*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw              %[temp3],   1*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
     "append           %[temp3],   %[temp6],    8          \n\t"
     "srl              %[temp6],   %[temp6],    16         \n\t"
     "append           %[temp8],   %[temp6],    8          \n\t"
-    "usw              %[temp3],   3*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw              %[temp8],   2*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
@@ -860,12 +860,12 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
     "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]  \n\t"
     "addu            %[temp1],    %[temp1],    %[temp5]   \n\t"
     "shra_r.w        %[temp1],    %[temp1],    2          \n\t"
-    "usw             %[temp9],    0*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw             %[temp10],   2*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw             %[temp9],    0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp10],   2*" XSTR(BPS) "(%[dst]) \n\t"
     "prepend         %[temp9],    %[temp11],   8          \n\t"
     "prepend         %[temp10],   %[temp1],    8          \n\t"
-    "usw             %[temp9],    1*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw             %[temp10],   3*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw             %[temp9],    1*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp10],   3*" XSTR(BPS) "(%[dst]) \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
@@ -908,13 +908,13 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
     "append           %[temp2],   %[temp0],    16         \n\t"
     "precr.qb.ph      %[temp8],   %[temp8],    %[temp5]   \n\t"
     "precr.qb.ph      %[temp3],   %[temp3],    %[temp2]   \n\t"
-    "usw              %[temp8],   0*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
     "prepend          %[temp8],   %[temp6],    8          \n\t"
-    "usw              %[temp3],   1*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
     "srl              %[temp6],   %[temp6],    16         \n\t"
     "prepend          %[temp3],   %[temp6],    8          \n\t"
-    "usw              %[temp8],   2*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw              %[temp3],   3*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
@@ -955,14 +955,14 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
     "precrq.ph.w      %[temp3],   %[temp0],    %[temp4]   \n\t"
     "precr.qb.ph      %[temp7],   %[temp6],    %[temp1]   \n\t"
     "precr.qb.ph      %[temp6],   %[temp1],    %[temp3]   \n\t"
-    "usw              %[temp7],   0*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw              %[temp6],   1*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw              %[temp7],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw              %[temp6],   1*" XSTR(BPS) "(%[dst]) \n\t"
     "append           %[temp2],   %[temp5],    16         \n\t"
     "append           %[temp0],   %[temp4],    16         \n\t"
     "precr.qb.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
     "precr.qb.ph      %[temp4],   %[temp2],    %[temp0]   \n\t"
-    "usw              %[temp5],   2*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw              %[temp4],   3*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw              %[temp5],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw              %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
@@ -994,12 +994,12 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
     "precrq.ph.w     %[temp2],   %[temp6],    %[temp4]   \n\t"
     "append          %[temp0],   %[temp5],    16         \n\t"
     "precr.qb.ph     %[temp3],   %[temp3],    %[temp2]   \n\t"
-    "usw             %[temp3],   0*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
     "precr.qb.ph     %[temp1],   %[temp7],    %[temp0]   \n\t"
-    "usw             %[temp7],   3*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw             %[temp7],   3*" XSTR(BPS) "(%[dst]) \n\t"
     "packrl.ph       %[temp2],   %[temp1],    %[temp3]   \n\t"
-    "usw             %[temp1],   2*"XSTR(BPS)"(%[dst])   \n\t"
-    "usw             %[temp2],   1*"XSTR(BPS)"(%[dst])   \n\t"
+    "usw             %[temp1],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp2],   1*" XSTR(BPS) "(%[dst]) \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
diff --git a/src/dsp/mips_macro.h b/src/dsp/mips_macro.h
index e09d2c4a..44aba9b7 100644
--- a/src/dsp/mips_macro.h
+++ b/src/dsp/mips_macro.h
@@ -40,10 +40,10 @@
 // I1..I9 - offsets in bytes
 #define LOAD_WITH_OFFSET_X4(O0, O1, O2, O3,                                    \
                             I0, I1, I2, I3, I4, I5, I6, I7, I8, I9)            \
-  "ulw    %[" #O0 "],    " #I1 "+"XSTR(I9)"*" #I5 "(%[" #I0 "])       \n\t"    \
-  "ulw    %[" #O1 "],    " #I2 "+"XSTR(I9)"*" #I6 "(%[" #I0 "])       \n\t"    \
-  "ulw    %[" #O2 "],    " #I3 "+"XSTR(I9)"*" #I7 "(%[" #I0 "])       \n\t"    \
-  "ulw    %[" #O3 "],    " #I4 "+"XSTR(I9)"*" #I8 "(%[" #I0 "])       \n\t"
+  "ulw    %[" #O0 "],    " #I1 "+" XSTR(I9) "*" #I5 "(%[" #I0 "])       \n\t"  \
+  "ulw    %[" #O1 "],    " #I2 "+" XSTR(I9) "*" #I6 "(%[" #I0 "])       \n\t"  \
+  "ulw    %[" #O2 "],    " #I3 "+" XSTR(I9) "*" #I7 "(%[" #I0 "])       \n\t"  \
+  "ulw    %[" #O3 "],    " #I4 "+" XSTR(I9) "*" #I8 "(%[" #I0 "])       \n\t"
 
 // O - output
 // IO - input/output
@@ -180,10 +180,10 @@
   "precrqu_s.qb.ph  %[" #IO2 "],  %[" #IO3 "],  %[" #IO2 "]         \n\t"      \
   "precrqu_s.qb.ph  %[" #IO4 "],  %[" #IO5 "],  %[" #IO4 "]         \n\t"      \
   "precrqu_s.qb.ph  %[" #IO6 "],  %[" #IO7 "],  %[" #IO6 "]         \n\t"      \
-  "usw              %[" #IO0 "],  "XSTR(I13)"*" #I9 "(%[" #I8 "])   \n\t"      \
-  "usw              %[" #IO2 "],  "XSTR(I13)"*" #I10 "(%[" #I8 "])  \n\t"      \
-  "usw              %[" #IO4 "],  "XSTR(I13)"*" #I11 "(%[" #I8 "])  \n\t"      \
-  "usw              %[" #IO6 "],  "XSTR(I13)"*" #I12 "(%[" #I8 "])  \n\t"
+  "usw              %[" #IO0 "],  " XSTR(I13) "*" #I9 "(%[" #I8 "])   \n\t"    \
+  "usw              %[" #IO2 "],  " XSTR(I13) "*" #I10 "(%[" #I8 "])  \n\t"    \
+  "usw              %[" #IO4 "],  " XSTR(I13) "*" #I11 "(%[" #I8 "])  \n\t"    \
+  "usw              %[" #IO6 "],  " XSTR(I13) "*" #I12 "(%[" #I8 "])  \n\t"
 
 #define OUTPUT_EARLY_CLOBBER_REGS_10()                                         \
   : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),             \