From 8a218b4a96ed627fad01fe18e2496cc0e9876914 Mon Sep 17 00:00:00 2001
From: Djordje Pesut <djordje.pesut@imgtec.com>
Date: Fri, 20 Feb 2015 12:34:09 +0100
Subject: [PATCH] MIPS: [mips32|dspr2]: GetResidualCost rebased

Change-Id: Ie15524c773f7a8c79e002097881a508187ca7cc6
---
 src/dsp/cost_mips32.c      | 133 +++++++++++++++----------------------
 src/dsp/cost_mips_dsp_r2.c |  55 +++++++--------
 2 files changed, 77 insertions(+), 111 deletions(-)

diff --git a/src/dsp/cost_mips32.c b/src/dsp/cost_mips32.c
index 11e29b8d..8e71115e 100644
--- a/src/dsp/cost_mips32.c
+++ b/src/dsp/cost_mips32.c
@@ -16,97 +16,68 @@
 #include "../enc/cost.h"
 
 static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+  int temp0, temp1;
+  int v_reg, ctx_reg;
   int n = res->first;
   // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
   int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
-  int cost;
-  const int const_2 = 2;
-  const int const_255 = 255;
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+  const int16_t* res_coeffs = res->coeffs;
+  const int res_last = res->last;
   const int const_max_level = MAX_VARIABLE_LEVEL;
-  int res_cost;
-  int res_prob;
-  int res_coeffs;
-  int res_last;
-  int v_reg;
-  int b_reg;
-  int ctx_reg;
-  int cost_add, temp_1, temp_2, temp_3;
+  const int const_2 = 2;
+  const uint16_t** p_costs = &costs[n][0];
+  const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
 
   if (res->last < 0) {
     return VP8BitCost(0, p0);
   }
 
-  cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
-
-  res_cost = (int)res->cost;
-  res_prob = (int)res->prob;
-  res_coeffs = (int)res->coeffs;
-  res_last = (int)res->last;
-
-  __asm__ volatile(
-    ".set   push                                                           \n\t"
-    ".set   noreorder                                                      \n\t"
-
-    "sll    %[temp_1],     %[n],              1                            \n\t"
-    "addu   %[res_coeffs], %[res_coeffs],     %[temp_1]                    \n\t"
-    "slt    %[temp_2],     %[n],              %[res_last]                  \n\t"
-    "bnez   %[temp_2],     1f                                              \n\t"
-    " li    %[cost_add],   0                                               \n\t"
-    "b      2f                                                             \n\t"
-    " nop                                                                  \n\t"
+  __asm__ volatile (
+    ".set      push                                                        \n\t"
+    ".set      noreorder                                                   \n\t"
+    "subu      %[temp1],        %[res_last],        %[n]                   \n\t"
+    "sll       %[temp0],        %[n],               1                      \n\t"
+    "blez      %[temp1],        2f                                         \n\t"
+    " addu     %[res_coeffs],   %[res_coeffs],      %[temp0]               \n\t"
   "1:                                                                      \n\t"
-    "lh     %[v_reg],      0(%[res_coeffs])                                \n\t"
-    "addu   %[b_reg],      %[n],              %[VP8EncBands]               \n\t"
-    "move   %[temp_1],     %[const_max_level]                              \n\t"
-    "addu   %[cost],       %[cost],           %[cost_add]                  \n\t"
-    "negu   %[temp_2],     %[v_reg]                                        \n\t"
-    "slti   %[temp_3],     %[v_reg],          0                            \n\t"
-    "movn   %[v_reg],      %[temp_2],         %[temp_3]                    \n\t"
-    "lbu    %[b_reg],      1(%[b_reg])                                     \n\t"
-    "li     %[cost_add],   0                                               \n\t"
-
-    "sltiu  %[temp_3],     %[v_reg],          2                            \n\t"
-    "move   %[ctx_reg],    %[v_reg]                                        \n\t"
-    "movz   %[ctx_reg],    %[const_2],        %[temp_3]                    \n\t"
-    //  cost += VP8LevelCost(t, v);
-    "slt    %[temp_3],     %[v_reg],          %[const_max_level]           \n\t"
-    "movn   %[temp_1],     %[v_reg],          %[temp_3]                    \n\t"
-    "sll    %[temp_2],     %[v_reg],          1                            \n\t"
-    "addu   %[temp_2],     %[temp_2],         %[VP8LevelFixedCosts]        \n\t"
-    "lhu    %[temp_2],     0(%[temp_2])                                    \n\t"
-    "sll    %[temp_1],     %[temp_1],         1                            \n\t"
-    "addu   %[temp_1],     %[temp_1],         %[t]                         \n\t"
-    "lhu    %[temp_3],     0(%[temp_1])                                    \n\t"
-    "addu   %[cost],       %[cost],           %[temp_2]                    \n\t"
-
-    //  t = res->cost[b][ctx];
-    "sll    %[temp_1],     %[ctx_reg],        7                            \n\t"
-    "sll    %[temp_2],     %[ctx_reg],        3                            \n\t"
-    "addu   %[cost],       %[cost],           %[temp_3]                    \n\t"
-    "addu   %[temp_1],     %[temp_1],         %[temp_2]                    \n\t"
-    "sll    %[temp_2],     %[b_reg],          3                            \n\t"
-    "sll    %[temp_3],     %[b_reg],          5                            \n\t"
-    "sub    %[temp_2],     %[temp_3],         %[temp_2]                    \n\t"
-    "sll    %[temp_3],     %[temp_2],         4                            \n\t"
-    "addu   %[temp_1],     %[temp_1],         %[temp_3]                    \n\t"
-    "addu   %[temp_2],     %[temp_2],         %[res_cost]                  \n\t"
-    "addiu  %[n],          %[n],              1                            \n\t"
-    "addu   %[t],          %[temp_1],         %[temp_2]                    \n\t"
-    "slt    %[temp_1],     %[n],              %[res_last]                  \n\t"
-    "bnez   %[temp_1],     1b                                              \n\t"
-    " addiu %[res_coeffs], %[res_coeffs],     2                            \n\t"
-   "2:                                                                     \n\t"
-
-    ".set   pop                                                            \n\t"
-    : [cost]"+r"(cost), [t]"+r"(t), [n]"+r"(n), [v_reg]"=&r"(v_reg),
-      [ctx_reg]"=&r"(ctx_reg), [b_reg]"=&r"(b_reg), [cost_add]"=&r"(cost_add),
-      [temp_1]"=&r"(temp_1), [temp_2]"=&r"(temp_2), [temp_3]"=&r"(temp_3)
-    : [const_2]"r"(const_2), [const_255]"r"(const_255), [res_last]"r"(res_last),
-      [VP8EntropyCost]"r"(VP8EntropyCost), [VP8EncBands]"r"(VP8EncBands),
-      [const_max_level]"r"(const_max_level), [res_prob]"r"(res_prob),
-      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_coeffs]"r"(res_coeffs),
-      [res_cost]"r"(res_cost)
+    "lh        %[v_reg],        0(%[res_coeffs])                           \n\t"
+    "addiu     %[n],            %[n],               1                      \n\t"
+    "negu      %[temp0],        %[v_reg]                                   \n\t"
+    "slti      %[temp1],        %[v_reg],           0                      \n\t"
+    "movn      %[v_reg],        %[temp0],           %[temp1]               \n\t"
+    "sltiu     %[temp0],        %[v_reg],           2                      \n\t"
+    "move      %[ctx_reg],      %[v_reg]                                   \n\t"
+    "movz      %[ctx_reg],      %[const_2],         %[temp0]               \n\t"
+    "sll       %[temp1],        %[v_reg],           1                      \n\t"
+    "addu      %[temp1],        %[temp1],           %[VP8LevelFixedCosts]  \n\t"
+    "lhu       %[temp1],        0(%[temp1])                                \n\t"
+    "slt       %[temp0],        %[v_reg],           %[const_max_level]     \n\t"
+    "movz      %[v_reg],        %[const_max_level], %[temp0]               \n\t"
+    "addu      %[cost],         %[cost],            %[temp1]               \n\t"
+    "sll       %[v_reg],        %[v_reg],           1                      \n\t"
+    "sll       %[ctx_reg],      %[ctx_reg],         2                      \n\t"
+    "addu      %[v_reg],        %[v_reg],           %[t]                   \n\t"
+    "lhu       %[temp0],        0(%[v_reg])                                \n\t"
+    "addu      %[p_costs],      %[p_costs],         %[inc_p_costs]         \n\t"
+    "addu      %[t],            %[p_costs],         %[ctx_reg]             \n\t"
+    "addu      %[cost],         %[cost],            %[temp0]               \n\t"
+    "addiu     %[res_coeffs],   %[res_coeffs],      2                      \n\t"
+    "bne       %[n],            %[res_last],        1b                     \n\t"
+    " lw       %[t],            0(%[t])                                    \n\t"
+  "2:                                                                      \n\t"
+    ".set      pop                                                         \n\t"
+    : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+      [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [res_coeffs]"+&r"(res_coeffs)
+    : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+      [inc_p_costs]"r"(inc_p_costs)
     : "memory"
   );
 
diff --git a/src/dsp/cost_mips_dsp_r2.c b/src/dsp/cost_mips_dsp_r2.c
index 89cd4cc4..f66ee50b 100644
--- a/src/dsp/cost_mips_dsp_r2.c
+++ b/src/dsp/cost_mips_dsp_r2.c
@@ -16,68 +16,63 @@
 #include "../enc/cost.h"
 
 static int GetResidualCost(int ctx0, const VP8Residual* const res) {
-  int temp0, temp1, temp2;
-  int v_reg, b_reg, ctx_reg;
+  int temp0, temp1;
+  int v_reg, ctx_reg;
   int n = res->first;
   // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
   int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
   // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
   // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
   // be missing during the loop.
   int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
-  int res_cost = (int)res->cost;
-  int res_coeffs = (int)res->coeffs;
-  int res_last = (int)res->last;
+  const int16_t* res_coeffs = res->coeffs;
+  const int res_last = res->last;
   const int const_max_level = MAX_VARIABLE_LEVEL;
   const int const_2 = 2;
-  const int const_408 = 408;
-  int mult_136_408 = 136;
+  const uint16_t** p_costs = &costs[n][0];
+  const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
 
   if (res->last < 0) {
     return VP8BitCost(0, p0);
   }
 
-  __asm__ volatile(
+  __asm__ volatile (
     ".set      push                                                     \n\t"
     ".set      noreorder                                                \n\t"
     "subu      %[temp1],        %[res_last],        %[n]                \n\t"
     "blez      %[temp1],        2f                                      \n\t"
-    " ins      %[mult_136_408], %[const_408],       16,         16      \n\t"
+    " nop                                                               \n\t"
   "1:                                                                   \n\t"
     "sll       %[temp0],        %[n],               1                   \n\t"
     "lhx       %[v_reg],        %[temp0](%[res_coeffs])                 \n\t"
     "addiu     %[n],            %[n],               1                   \n\t"
     "absq_s.w  %[v_reg],        %[v_reg]                                \n\t"
-    "lbux      %[b_reg],        %[n](%[VP8EncBands])                    \n\t"
-    "sltiu     %[temp2],        %[v_reg],           2                   \n\t"
+    "sltiu     %[temp0],        %[v_reg],           2                   \n\t"
     "move      %[ctx_reg],      %[v_reg]                                \n\t"
-    "movz      %[ctx_reg],      %[const_2],         %[temp2]            \n\t"
+    "movz      %[ctx_reg],      %[const_2],         %[temp0]            \n\t"
     "sll       %[temp1],        %[v_reg],           1                   \n\t"
     "lhx       %[temp1],        %[temp1](%[VP8LevelFixedCosts])         \n\t"
-    "slt       %[temp2],        %[v_reg],           %[const_max_level]  \n\t"
-    "ins       %[ctx_reg],      %[b_reg],           16,         16      \n\t"
-    "movz      %[v_reg],        %[const_max_level], %[temp2]            \n\t"
-    "mul.ph    %[temp0],        %[ctx_reg],         %[mult_136_408]     \n\t"
+    "slt       %[temp0],        %[v_reg],           %[const_max_level]  \n\t"
+    "movz      %[v_reg],        %[const_max_level], %[temp0]            \n\t"
     "addu      %[cost],         %[cost],            %[temp1]            \n\t"
     "sll       %[v_reg],        %[v_reg],           1                   \n\t"
-    "lhx       %[temp2],        %[v_reg](%[t])                          \n\t"
-    "ext       %[temp1],        %[temp0],           0,          16      \n\t"
-    "ext       %[temp0],        %[temp0],           16,         16      \n\t"
-    "addu      %[cost],         %[cost],            %[temp2]            \n\t"
-    "addu      %[temp1],        %[temp1],           %[res_cost]         \n\t"
+    "sll       %[ctx_reg],      %[ctx_reg],         2                   \n\t"
+    "lhx       %[temp0],        %[v_reg](%[t])                          \n\t"
+    "addu      %[p_costs],      %[p_costs],         %[inc_p_costs]      \n\t"
+    "addu      %[t],            %[p_costs],         %[ctx_reg]          \n\t"
+    "addu      %[cost],         %[cost],            %[temp0]            \n\t"
     "bne       %[n],            %[res_last],        1b                  \n\t"
-    " addu     %[t],            %[temp0],           %[temp1]            \n\t"
+    " lw       %[t],            0(%[t])                                 \n\t"
   "2:                                                                   \n\t"
     ".set      pop                                                      \n\t"
     : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
-      [ctx_reg]"=&r"(ctx_reg), [b_reg]"=&r"(b_reg), [temp0]"=&r"(temp0),
-      [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
-      [mult_136_408]"+&r"(mult_136_408)
-    : [const_2]"r"(const_2), [res_last]"r"(res_last),
-      [VP8EncBands]"r"(VP8EncBands), [const_max_level]"r"(const_max_level),
-      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_cost]"r"(res_cost),
-      [const_408]"r"(const_408), [res_coeffs]"r"(res_coeffs)
+      [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1)
+    : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+      [res_coeffs]"r"(res_coeffs), [inc_p_costs]"r"(inc_p_costs)
     : "memory"
   );