From 2298d5f301d7352b837e44db95460c9bf5ba20ce Mon Sep 17 00:00:00 2001
From: Jovan Zelincevic <jovan.zelincevic@imgtec.com>
Date: Wed, 19 Feb 2014 16:27:41 +0100
Subject: [PATCH] MIPS: MIPS32r1: Added optimization for QuantizeBlock

Change-Id: I6047ab107e4d474e35b5af1dac391d5b3d8c049b
---
 src/dsp/enc_mips32.c | 98 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 97 insertions(+), 1 deletion(-)

diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c
index 3fe9f3cc..350d1381 100755
--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@@ -9,12 +9,15 @@
 //
 // MIPS version of speed-critical encoding functions.
 //
-// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
+//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
 #include "./dsp.h"
 
 #if defined(WEBP_USE_MIPS32)
 
+#include "../enc/vp8enci.h"
+
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 
@@ -150,6 +153,98 @@ static void ITransformMIPS32(const uint8_t* ref, const int16_t* in,
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS
 
+// macro for one pass through for loop in QuantizeBlock
+// QUANTDIV macro inlined
+// J - offset in bytes (kZigzag[n] * 2)
+// K - offset in bytes (kZigzag[n] * 4)
+// N - offset in bytes (n * 2)
+#define QUANTIZE_ONE(J, K, N)                                               \
+  "lh           %[temp0],       "#J"(%[ppin])                       \n\t"   \
+  "lhu          %[temp1],       "#J"(%[ppsharpen])                  \n\t"   \
+  "lw           %[temp2],       "#K"(%[ppzthresh])                  \n\t"   \
+  "sra          %[sign],        %[temp0],           15              \n\t"   \
+  "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
+  "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
+  "addu         %[coeff],       %[coeff],           %[temp1]        \n\t"   \
+  "slt          %[temp4],       %[temp2],           %[coeff]        \n\t"   \
+  "addiu        %[temp5],       $zero,              0               \n\t"   \
+  "addiu        %[level],       $zero,              0               \n\t"   \
+  "beqz         %[temp4],       2f                                  \n\t"   \
+  "lhu          %[temp1],       "#J"(%[ppiq])                       \n\t"   \
+  "lw           %[temp2],       "#K"(%[ppbias])                     \n\t"   \
+  "lhu          %[temp3],       "#J"(%[ppq])                        \n\t"   \
+  "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
+  "addu         %[level],       %[level],           %[temp2]        \n\t"   \
+  "sra          %[level],       %[level],           17              \n\t"   \
+  "slt          %[temp4],       %[max_level],       %[level]        \n\t"   \
+  "movn         %[level],       %[max_level],       %[temp4]        \n\t"   \
+  "xor          %[level],       %[level],           %[sign]         \n\t"   \
+  "subu         %[level],       %[level],           %[sign]         \n\t"   \
+  "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
+"2:                                                                 \n\t"   \
+  "sh           %[temp5],       "#J"(%[ppin])                       \n\t"   \
+  "sh           %[level],       "#N"(%[pout])                       \n\t"
+
+static int QuantizeBlockMIPS32(int16_t in[16], int16_t out[16],
+                               int n, const VP8Matrix* const mtx) {
+  int last;
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  int sign, coeff, level, i;
+  int max_level = MAX_LEVEL;
+
+  int16_t* ppin             = &in[0];
+  int16_t* pout             = &out[0];
+  const uint16_t* ppsharpen = &mtx->sharpen_[0];
+  const uint32_t* ppzthresh = &mtx->zthresh_[0];
+  const uint16_t* ppq       = &mtx->q_[0];
+  const uint16_t* ppiq      = &mtx->iq_[0];
+  const uint32_t* ppbias    = &mtx->bias_[0];
+
+  __asm__ volatile(
+    "bnez         %[n],           1f                               \n\t"
+    QUANTIZE_ONE( 0,  0,  0)
+  "1:                                                              \n\t"
+    QUANTIZE_ONE( 2,  4,  2)
+    QUANTIZE_ONE( 8, 16,  4)
+    QUANTIZE_ONE(16, 32,  6)
+    QUANTIZE_ONE(10, 20,  8)
+    QUANTIZE_ONE( 4,  8, 10)
+    QUANTIZE_ONE( 6, 12, 12)
+    QUANTIZE_ONE(12, 24, 14)
+    QUANTIZE_ONE(18, 36, 16)
+    QUANTIZE_ONE(24, 48, 18)
+    QUANTIZE_ONE(26, 52, 20)
+    QUANTIZE_ONE(20, 40, 22)
+    QUANTIZE_ONE(14, 28, 24)
+    QUANTIZE_ONE(22, 44, 26)
+    QUANTIZE_ONE(28, 56, 28)
+    QUANTIZE_ONE(30, 60, 30)
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [sign]"=&r"(sign), [coeff]"=&r"(coeff),
+      [level]"=&r"(level)
+    : [n]"r"(n), [pout]"r"(pout), [ppin]"r"(ppin),
+      [ppiq]"r"(ppiq), [max_level]"r"(max_level),
+      [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
+      [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
+    : "memory", "hi", "lo"
+  );
+
+  // moved out from macro to increase possibility for earlier breaking
+  last = -1;
+  for (i = 15; i >= n; i--) {
+    if (out[i]) {
+      last = i;
+      break;
+    }
+  }
+  return (last >= 0);
+}
+
+#undef QUANTIZE_ONE
+
 #endif  // WEBP_USE_MIPS32
 
 //------------------------------------------------------------------------------
@@ -160,5 +255,6 @@ extern void VP8EncDspInitMIPS32(void);
 void VP8EncDspInitMIPS32(void) {
 #if defined(WEBP_USE_MIPS32)
   VP8ITransform = ITransformMIPS32;
+  VP8EncQuantizeBlock = QuantizeBlockMIPS32;
 #endif  // WEBP_USE_MIPS32
 }