MIPS: dspr2: added optimization for function QuantizeBlock

Change-Id: Id217116890b7408d23464216608ce67ae545688a
2025-12-23 21:46:26 +01:00 · 2015-01-16 11:18:27 +01:00
parent 4fbe9cf202
commit 5487529368
1 changed files with 149 additions and 0 deletions
--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
@@ -1174,6 +1174,153 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 #undef ABS_X8
 #undef ADD_SUB_HALVES_X4
 //------------------------------------------------------------------------------
 // Quantization
 //
 // Assembly for two iterations of the scalar QuantizeBlock loop (QUANTDIV
 // inlined): one pair of adjacent 16-bit coefficients is read with a single
 // unaligned word load and both values are quantized.
 // All arguments are byte offsets that are pasted in as immediates:
 // J  - offset of the first value of the pair in the 16-bit arrays
 //      (in, sharpen_, q_, iq_), i.e. kZigzag[n] * 2; second value is at J + 2
 // K  - offset of the first value in the 32-bit arrays (zthresh_, bias_),
 //      i.e. kZigzag[n] * 4; second value is at K + 4
 // N  - offset in 'out' for the level of the first value (n * 2)
 // N1 - offset in 'out' for the level of the second value. This is NOT always
 //      N + 2: the call sites pass e.g. N = 4, N1 = 8.
 // The enclosing asm statement must provide the operands temp0..temp6, sign,
 // coeff, level, ret, ppin, pout, ppsharpen, ppzthresh, ppq, ppiq, ppbias,
 // max_level and max_level1 (max_level replicated in both halfwords).
 // Four outcomes, selected by comparing (|in| + sharpen) with zthresh per value:
 //   0: neither value is above its threshold -> both outputs and inputs zeroed
 //   1: both values are above their threshold -> packed (.ph) computation
 //   2: only the second (upper halfword) value is above its threshold
 //   fall-through: only the first (lower halfword) value is above its threshold
 // In every non-zero path: level = min((coeff * iq + bias) >> 17, max_level),
 // the input sign is re-applied, the level is stored to 'out', OR-ed into 'ret',
 // and level * q is written back to 'in'.
 // NOTE(review): zthresh_ is read with 'lhu' at K and K + 4, i.e. only 16 bits
 // of each 32-bit entry; this presumably relies on a little-endian layout and
 // on thresholds fitting in 16 bits - confirm.
 #define QUANTIZE_ONE(J, K, N, N1)                                         \
  /* load the pair, pack both thresholds, coeff = |in| + sharpen */       \
  "ulw         %[temp1],     "#J"(%[ppin])                   \n\t"        \
  "ulw         %[temp2],     "#J"(%[ppsharpen])              \n\t"        \
  "lhu         %[temp3],     "#K"(%[ppzthresh])              \n\t"        \
  "lhu         %[temp6],     "#K"+4(%[ppzthresh])            \n\t"        \
  "absq_s.ph   %[temp4],     %[temp1]                        \n\t"        \
  "ins         %[temp3],     %[temp6],         16,       16  \n\t"        \
  "addu.ph     %[coeff],     %[temp4],         %[temp2]      \n\t"        \
  "shra.ph     %[sign],      %[temp1],         15            \n\t"        \
  "li          %[level],     0x10001                         \n\t"        \
  "cmp.lt.ph   %[temp3],     %[coeff]                        \n\t"        \
  "lhu         %[temp1],     "#J"(%[ppiq])                   \n\t"        \
  /* temp5 gets 1 in each halfword whose coeff is above zthresh */        \
  "pick.ph     %[temp5],     %[level],         $0            \n\t"        \
  "lw          %[temp2],     "#K"(%[ppbias])                 \n\t"        \
  "beqz        %[temp5],     0f                              \n\t"        \
  "lhu         %[temp3],     "#J"(%[ppq])                    \n\t"        \
  "beq         %[temp5],     %[level],         1f            \n\t"        \
  "andi        %[temp5],     %[temp5],         0x1           \n\t"        \
  "andi        %[temp4],     %[coeff],         0xffff        \n\t"        \
  "beqz        %[temp5],     2f                              \n\t"        \
  /* only the first value is quantized; the second one is zeroed */       \
  "mul         %[level],     %[temp4],         %[temp1]      \n\t"        \
  "sh          $0,           "#J"+2(%[ppin])                 \n\t"        \
  "sh          $0,           "#N1"(%[pout])                  \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "sra         %[level],     %[level],         17            \n\t"        \
  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
  "andi        %[temp6],     %[sign],          0xffff        \n\t"        \
  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[level],     "#N"(%[pout])                   \n\t"        \
  "sh          %[temp5],     "#J"(%[ppin])                   \n\t"        \
  "j           3f                                            \n\t"        \
  /* only the second value is quantized; the first one is zeroed */       \
 "2:                                                          \n\t"        \
  "lhu         %[temp1],     "#J"+2(%[ppiq])                 \n\t"        \
  "srl         %[temp5],     %[coeff],         16            \n\t"        \
  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
  "lw          %[temp2],     "#K"+4(%[ppbias])               \n\t"        \
  "lhu         %[temp3],     "#J"+2(%[ppq])                  \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "sra         %[level],     %[level],         17            \n\t"        \
  "srl         %[temp6],     %[sign],          16            \n\t"        \
  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
  "sh          $0,           "#J"(%[ppin])                   \n\t"        \
  "sh          $0,           "#N"(%[pout])                   \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[temp5],     "#J"+2(%[ppin])                 \n\t"        \
  "sh          %[level],     "#N1"(%[pout])                  \n\t"        \
  "j           3f                                            \n\t"        \
  /* both values are quantized, packed as two halfwords */                \
 "1:                                                          \n\t"        \
  "lhu         %[temp1],     "#J"(%[ppiq])                   \n\t"        \
  "lw          %[temp2],     "#K"(%[ppbias])                 \n\t"        \
  "ulw         %[temp3],     "#J"(%[ppq])                    \n\t"        \
  "andi        %[temp5],     %[coeff],         0xffff        \n\t"        \
  "srl         %[temp0],     %[coeff],         16            \n\t"        \
  "lhu         %[temp6],     "#J"+2(%[ppiq])                 \n\t"        \
  "lw          %[coeff],     "#K"+4(%[ppbias])               \n\t"        \
  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
  "mul         %[temp4],     %[temp0],         %[temp6]      \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "addu        %[temp4],     %[temp4],         %[coeff]      \n\t"        \
  /* keep the upper 16 bits of each sum, then >> 1 (17 in total) */       \
  "precrq.ph.w %[level],     %[temp4],         %[level]      \n\t"        \
  "shra.ph     %[level],     %[level],         1             \n\t"        \
  /* per-halfword clamp: the replacement must hold max_level in */        \
  /* BOTH halfwords, hence max_level1 and not max_level */                \
  "cmp.lt.ph   %[max_level1],%[level]                        \n\t"        \
  "pick.ph     %[level],     %[max_level1],    %[level]      \n\t"        \
  "xor         %[level],     %[level],         %[sign]       \n\t"        \
  "subu.ph     %[level],     %[level],         %[sign]       \n\t"        \
  "mul.ph      %[temp3],     %[level],         %[temp3]      \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[level],     "#N"(%[pout])                   \n\t"        \
  "srl         %[level],     %[level],         16            \n\t"        \
  "sh          %[level],     "#N1"(%[pout])                  \n\t"        \
  "usw         %[temp3],     "#J"(%[ppin])                   \n\t"        \
  "j           3f                                            \n\t"        \
  /* neither value is above its threshold: zero everything */             \
 "0:                                                          \n\t"        \
  "sh          $0,           "#N"(%[pout])                   \n\t"        \
  "sh          $0,           "#N1"(%[pout])                  \n\t"        \
  "usw         $0,           "#J"(%[ppin])                   \n\t"        \
 "3:                                                          \n\t"
 // Quantizes the 16 coefficients of 'in' using the tables of 'mtx'.
 // The quantized levels are stored to 'out' at the offsets given by the N/N1
 // arguments of each QUANTIZE_ONE expansion, and 'in' is overwritten with
 // level * q (zero for coefficients that are dropped).
 // Returns 1 if at least one stored level is non-zero, 0 otherwise.
 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                          const VP8Matrix* const mtx) {
  // Scratch values used only inside the assembly statement.
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  int sign, coeff, level;
  // OR of every level stored so far.
  int nz_bits = 0;
  const int level_cap = MAX_LEVEL;
  // The cap replicated in both halfwords, for the packed comparison.
  const int level_cap_x2 = (level_cap << 16) | level_cap;
  int16_t* const src        = in;
  int16_t* const dst        = out;
  const uint16_t* const sharpen = &mtx->sharpen_[0];
  const uint32_t* const zthresh = &mtx->zthresh_[0];
  const uint16_t* const q       = &mtx->q_[0];
  const uint16_t* const iq      = &mtx->iq_[0];
  const uint32_t* const bias    = &mtx->bias_[0];
  // Eight expansions, each handling the input pair at byte offset J.
  __asm__ volatile (
    QUANTIZE_ONE( 0,  0,  0,  2)
    QUANTIZE_ONE( 4,  8, 10, 12)
    QUANTIZE_ONE( 8, 16,  4,  8)
    QUANTIZE_ONE(12, 24, 14, 24)
    QUANTIZE_ONE(16, 32,  6, 16)
    QUANTIZE_ONE(20, 40, 22, 26)
    QUANTIZE_ONE(24, 48, 18, 20)
    QUANTIZE_ONE(28, 56, 28, 30)
    : [ret]"+&r"(nz_bits),
      [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6),
      [sign]"=&r"(sign), [coeff]"=&r"(coeff), [level]"=&r"(level)
    : [ppin]"r"(src), [pout]"r"(dst),
      [ppsharpen]"r"(sharpen), [ppzthresh]"r"(zthresh),
      [ppq]"r"(q), [ppiq]"r"(iq), [ppbias]"r"(bias),
      [max_level]"r"(level_cap), [max_level1]"r"(level_cap_x2)
    : "memory", "hi", "lo"
  );
  return (nz_bits != 0);
 }
 // Quantizes two consecutive 16-coefficient blocks with the same matrix.
 // Bit 0 of the result is the QuantizeBlock() return value for the first
 // block, bit 1 the return value for the second block.
 static int Quantize2Blocks(int16_t in[32], int16_t out[32],
                            const VP8Matrix* const mtx) {
  const int nz_first = QuantizeBlock(in, out, mtx);
  const int nz_second = QuantizeBlock(in + 16, out + 16, mtx);
  return nz_first | (nz_second << 1);
 }
 #undef QUANTIZE_ONE
 #endif  // WEBP_USE_MIPS_DSP_R2
 //------------------------------------------------------------------------------
@@ -1196,5 +1343,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
  VP8SSE16x8 = SSE16x8;
  VP8SSE4x4 = SSE4x4;
 #endif
  VP8EncQuantizeBlock = QuantizeBlock;
  VP8EncQuantize2Blocks = Quantize2Blocks;
 #endif  // WEBP_USE_MIPS_DSP_R2
 }