diff --git a/src/dsp/enc.c b/src/dsp/enc.c index d830888f..6399ed88 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -56,9 +56,9 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1], histo->last_non_zero = last_non_zero; } -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; for (j = start_block; j < end_block; ++j) { @@ -140,15 +140,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, } } -static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { ITransformOne(ref, in, dst); if (do_two) { ITransformOne(ref + 4, in + 16, dst + 4); } } -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) { int i; int tmp[16]; for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { @@ -177,12 +177,13 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { } } -static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform2_C(const uint8_t* src, const uint8_t* ref, + int16_t* out) { VP8FTransform(src, ref, out); VP8FTransform(src + 4, ref + 4, out + 16); } -static void FTransformWHT(const int16_t* in, int16_t* out) { +static void FTransformWHT_C(const int16_t* in, int16_t* out) { // input is 12b signed int32_t tmp[16]; int i; @@ -303,8 +304,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { // U block DCMode(C8DC8 + dst, left, top, 8, 8, 4); VerticalPred(C8VE8 + dst, top, 8); @@ -323,8 +324,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) -static void Intra16Preds(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_C(uint8_t* dst, + const uint8_t* left, const uint8_t* top) { DCMode(I16DC16 + dst, left, top, 16, 16, 5); VerticalPred(I16VE16 + dst, top, 16); HorizontalPred(I16HE16 + dst, left, 16); @@ -507,7 +508,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) { // Left samples are top[-5 .. -2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -538,20 +539,20 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b, return count; } -static int SSE16x16(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_C(const uint8_t* a, const uint8_t* b) { return GetSSE(a, b, 16, 16); } -static int SSE16x8(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_C(const uint8_t* a, const uint8_t* b) { return GetSSE(a, b, 16, 8); } -static int SSE8x8(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_C(const uint8_t* a, const uint8_t* b) { return GetSSE(a, b, 8, 8); } -static int SSE4x4(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_C(const uint8_t* a, const uint8_t* b) { return GetSSE(a, b, 4, 4); } -static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { +static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) { int k, x, y; for (k = 0; k < 4; ++k) { uint32_t avg = 0; @@ -608,20 +609,20 @@ static int TTransform(const uint8_t* in, const uint16_t* w) { return sum; } -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { const int sum1 = TTransform(a, w); const int sum2 = TTransform(b, w); return abs(sum2 - sum1) >> 5; } -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_C(a + x + y, b + x + y, w); } } return D; @@ -636,8 +637,8 @@ static const uint8_t kZigzag[16] = { }; // Simple quantization -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { +static int QuantizeBlock_C(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { int last = -1; int n; for (n = 0; n < 16; ++n) { @@ -662,8 +663,8 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return (last >= 0); } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_C(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -682,11 +683,11 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) { } } -static void Copy4x4(const uint8_t* src, uint8_t* dst) { +static void Copy4x4_C(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4, 4); } -static void Copy16x8(const uint8_t* src, uint8_t* dst) { +static void Copy16x8_C(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16, 8); } @@ -734,26 +735,26 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) { InitTables(); // default C implementations - VP8CollectHistogram = CollectHistogram; - VP8ITransform = ITransform; - VP8FTransform = FTransform; - VP8FTransform2 = FTransform2; - VP8FTransformWHT = FTransformWHT; - VP8EncPredLuma4 = Intra4Preds; - VP8EncPredLuma16 = Intra16Preds; - VP8EncPredChroma8 = IntraChromaPreds; - VP8SSE16x16 = SSE16x16; - VP8SSE8x8 = SSE8x8; - VP8SSE16x8 = SSE16x8; - VP8SSE4x4 = SSE4x4; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8Mean16x4 = Mean16x4; - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8EncQuantizeBlockWHT = QuantizeBlock; - VP8Copy4x4 = Copy4x4; - VP8Copy16x8 = Copy16x8; + VP8CollectHistogram = CollectHistogram_C; + VP8ITransform = ITransform_C; + VP8FTransform = FTransform_C; + VP8FTransform2 = FTransform2_C; + VP8FTransformWHT = FTransformWHT_C; + VP8EncPredLuma4 = Intra4Preds_C; + VP8EncPredLuma16 = Intra16Preds_C; + VP8EncPredChroma8 = IntraChromaPreds_C; + VP8SSE16x16 = SSE16x16_C; + VP8SSE8x8 = SSE8x8_C; + VP8SSE16x8 = SSE16x8_C; + VP8SSE4x4 = SSE4x4_C; + VP8TDisto4x4 = Disto4x4_C; + VP8TDisto16x16 = Disto16x16_C; + VP8Mean16x4 = Mean16x4_C; + VP8EncQuantizeBlock = QuantizeBlock_C; + VP8EncQuantize2Blocks = Quantize2Blocks_C; + VP8EncQuantizeBlockWHT = QuantizeBlock_C; + VP8Copy4x4 = Copy4x4_C; + VP8Copy16x8 = Copy16x8_C; // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { diff --git a/src/dsp/enc_mips32.c b/src/dsp/enc_mips32.c index 752b14da..ad7d79bb 100644 --- a/src/dsp/enc_mips32.c +++ b/src/dsp/enc_mips32.c @@ -113,8 +113,9 @@ static const int kC2 = 35468; "sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" // Does one or two inverse transforms. -static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref, + const int16_t* in, + uint8_t* dst) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6; int temp7, temp8, temp9, temp10, temp11, temp12, temp13; int temp14, temp15, temp16, temp17, temp18, temp19, temp20; @@ -144,11 +145,11 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, ); } -static void ITransform(const uint8_t* ref, const int16_t* in, - uint8_t* dst, int do_two) { - ITransformOne(ref, in, dst); +static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in, + uint8_t* dst, int do_two) { + ITransformOne_MIPS32(ref, in, dst); if (do_two) { - ITransformOne(ref + 4, in + 16, dst + 4); + ITransformOne_MIPS32(ref + 4, in + 16, dst + 4); } } @@ -187,8 +188,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in, "sh %[temp5], " #J "(%[ppin]) \n\t" \ "sh %[level], " #N "(%[pout]) \n\t" -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { +static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { int temp0, temp1, temp2, temp3, temp4, temp5; int sign, coeff, level, i; int max_level = MAX_LEVEL; @@ -238,11 +239,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return 0; } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; - nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; - nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; + nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0; + nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1; return nz; } @@ -361,8 +362,8 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], "msub %[temp6], %[temp0] \n\t" \ "msub %[temp7], %[temp1] \n\t" -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int tmp[32]; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; @@ -396,13 +397,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b, #undef VERTICAL_PASS #undef HORIZONTAL_PASS -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_MIPS32(a + x + y, b + x + y, w); } } return D; @@ -478,7 +479,8 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t" -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref, + int16_t* out) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16; int temp17, temp18, temp19, temp20; @@ -539,7 +541,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { GET_SSE_INNER(C, C + 1, C + 2, C + 3) \ GET_SSE_INNER(D, D + 1, D + 2, D + 3) -static int SSE16x16(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -573,7 +575,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE16x8(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -599,7 +601,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE8x8(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -621,7 +623,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE4x4(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -651,17 +653,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { extern void VP8EncDspInitMIPS32(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) { - VP8ITransform = ITransform; - VP8FTransform = FTransform; - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; + VP8ITransform = ITransform_MIPS32; + VP8FTransform = FTransform_MIPS32; + + VP8EncQuantizeBlock = QuantizeBlock_MIPS32; + VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32; + + VP8TDisto4x4 = Disto4x4_MIPS32; + VP8TDisto16x16 = Disto16x16_MIPS32; + #if !defined(WORK_AROUND_GCC) - VP8SSE16x16 = SSE16x16; - VP8SSE8x8 = SSE8x8; - VP8SSE16x8 = SSE16x8; - VP8SSE4x4 = SSE4x4; + VP8SSE16x16 = SSE16x16_MIPS32; + VP8SSE8x8 = SSE8x8_MIPS32; + VP8SSE16x8 = SSE16x8_MIPS32; + VP8SSE4x4 = SSE4x4_MIPS32; #endif } diff --git a/src/dsp/enc_mips_dsp_r2.c b/src/dsp/enc_mips_dsp_r2.c index 6c8c1c6a..3ee2ecee 100644 --- a/src/dsp/enc_mips_dsp_r2.c +++ b/src/dsp/enc_mips_dsp_r2.c @@ -141,7 +141,8 @@ static const int kC2 = 35468; "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t" -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref, + int16_t* out) { const int c2217 = 2217; const int c5352 = 5352; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; @@ -238,16 +239,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, ); } -static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in, + uint8_t* dst, int do_two) { ITransformOne(ref, in, dst); if (do_two) { ITransformOne(ref + 4, in + 16, dst + 4); } } -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17; @@ -313,13 +314,14 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b, return abs(temp3 - temp17) >> 5; } -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MIPSdspR2(const uint8_t* const a, + const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w); } } return D; @@ -1011,8 +1013,8 @@ static void HU4(uint8_t* dst, const uint8_t* top) { //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { // U block DCMode8(C8DC8 + dst, left, top); VerticalPred8(C8VE8 + dst, top); @@ -1031,8 +1033,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) -static void Intra16Preds(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_MIPSdspR2(uint8_t* dst, + const uint8_t* left, const uint8_t* top) { DCMode16(I16DC16 + dst, left, top); VerticalPred16(I16VE16 + dst, top); HorizontalPred16(I16HE16 + dst, left); @@ -1041,7 +1043,7 @@ static void Intra16Preds(uint8_t* dst, // Left samples are top[-5 .. -2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -1077,7 +1079,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) { GET_SSE_INNER(C) \ GET_SSE_INNER(D) -static int SSE16x16(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1107,7 +1109,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE16x8(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1129,7 +1131,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE8x8(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1147,7 +1149,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE4x4(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1270,8 +1272,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { "usw $0, " #J "(%[ppin]) \n\t" \ "3: \n\t" -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { +static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { int temp0, temp1, temp2, temp3, temp4, temp5,temp6; int sign, coeff, level; int max_level = MAX_LEVEL; @@ -1311,11 +1313,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return (ret != 0); } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; - nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; - nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; + nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0; + nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1; return nz; } @@ -1358,7 +1360,7 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], "usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \ "usw %[" #TEMP6 "], " #D "(%[out]) \n\t" -static void FTransformWHT(const int16_t* in, int16_t* out) { +static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; @@ -1450,9 +1452,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { "addiu %[temp8], %[temp8], 1 \n\t" \ "sw %[temp8], 0(%[temp3]) \n\t" -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH; @@ -1484,23 +1486,28 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, extern void VP8EncDspInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) { - VP8FTransform = FTransform; - VP8ITransform = ITransform; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8EncPredLuma16 = Intra16Preds; - VP8EncPredChroma8 = IntraChromaPreds; - VP8EncPredLuma4 = Intra4Preds; + VP8FTransform = FTransform_MIPSdspR2; + VP8FTransformWHT = FTransformWHT_MIPSdspR2; + VP8ITransform = ITransform_MIPSdspR2; + + VP8TDisto4x4 = Disto4x4_MIPSdspR2; + VP8TDisto16x16 = Disto16x16_MIPSdspR2; + + VP8EncPredLuma16 = Intra16Preds_MIPSdspR2; + VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2; + VP8EncPredLuma4 = Intra4Preds_MIPSdspR2; + #if !defined(WORK_AROUND_GCC) - VP8SSE16x16 = SSE16x16; - VP8SSE8x8 = SSE8x8; - VP8SSE16x8 = SSE16x8; - VP8SSE4x4 = SSE4x4; + VP8SSE16x16 = SSE16x16_MIPSdspR2; + VP8SSE8x8 = SSE8x8_MIPSdspR2; + VP8SSE16x8 = SSE16x8_MIPSdspR2; + VP8SSE4x4 = SSE4x4_MIPSdspR2; #endif - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8FTransformWHT = FTransformWHT; - VP8CollectHistogram = CollectHistogram; + + VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2; + VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2; + + VP8CollectHistogram = CollectHistogram_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/dsp/enc_msa.c b/src/dsp/enc_msa.c index d1c8bdf2..eb805042 100644 --- a/src/dsp/enc_msa.c +++ b/src/dsp/enc_msa.c @@ -69,15 +69,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); } -static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { ITransformOne(ref, in, dst); if (do_two) { ITransformOne(ref + 4, in + 16, dst + 4); } } -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_MSA(const uint8_t* src, const uint8_t* ref, + int16_t* out) { uint64_t out0, out1, out2, out3; uint32_t in0, in1, in2, in3; v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; @@ -130,7 +131,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { SD4(out0, out1, out2, out3, out, 8); } -static void FTransformWHT(const int16_t* in, int16_t* out) { +static void FTransformWHT_MSA(const int16_t* in, int16_t* out) { v8i16 in0 = { 0 }; v8i16 in1 = { 0 }; v8i16 tmp0, tmp1, tmp2, tmp3; @@ -167,7 +168,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { ST_SH2(out0, out1, out, 8); } -static int TTransform(const uint8_t* in, const uint16_t* w) { +static int TTransform_MSA(const uint8_t* in, const uint16_t* w) { int sum; uint32_t in0_m, in1_m, in2_m, in3_m; v16i8 src0 = { 0 }; @@ -199,20 +200,20 @@ static int TTransform(const uint8_t* in, const uint16_t* w) { return sum; } -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { - const int sum1 = TTransform(a, w); - const int sum2 = TTransform(b, w); +static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int sum1 = TTransform_MSA(a, w); + const int sum2 = TTransform_MSA(b, w); return abs(sum2 - sum1) >> 5; } -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_MSA(a + x + y, b + x + y, w); } } return D; @@ -221,9 +222,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, //------------------------------------------------------------------------------ // Histogram -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; for (j = start_block; j < end_block; ++j) { @@ -430,7 +431,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { #undef AVG3 #undef AVG2 -static void Intra4Preds(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -547,8 +548,8 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left, STORE16x16(out, dst); } -static void Intra16Preds(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_MSA(uint8_t* dst, + const uint8_t* left, const uint8_t* top) { DCMode16x16(I16DC16 + dst, left, top); VerticalPred16x16(I16VE16 + dst, top); HorizontalPred16x16(I16HE16 + dst, left); @@ -669,8 +670,8 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left, STORE8x8(out, dst); } -static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { // U block DCMode8x8(C8DC8 + dst, left, top); VerticalPred8x8(C8VE8 + dst, top); @@ -711,7 +712,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \ } while (0) -static int SSE16x16(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -738,7 +739,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE16x8(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -757,7 +758,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE8x8(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -777,7 +778,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE4x4(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) { uint32_t sum = 0; uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1; @@ -799,8 +800,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { //------------------------------------------------------------------------------ // Quantization -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { +static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { int sum; v8i16 in0, in1, sh0, sh1, out0, out1; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1; @@ -852,8 +853,8 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return (sum > 0); } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -866,26 +867,26 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], extern void VP8EncDspInitMSA(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) { - VP8ITransform = ITransform; - VP8FTransform = FTransform; - VP8FTransformWHT = FTransformWHT; + VP8ITransform = ITransform_MSA; + VP8FTransform = FTransform_MSA; + VP8FTransformWHT = FTransformWHT_MSA; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8CollectHistogram = CollectHistogram; + VP8TDisto4x4 = Disto4x4_MSA; + VP8TDisto16x16 = Disto16x16_MSA; + VP8CollectHistogram = CollectHistogram_MSA; - VP8EncPredLuma4 = Intra4Preds; - VP8EncPredLuma16 = Intra16Preds; - VP8EncPredChroma8 = IntraChromaPreds; + VP8EncPredLuma4 = Intra4Preds_MSA; + VP8EncPredLuma16 = Intra16Preds_MSA; + VP8EncPredChroma8 = IntraChromaPreds_MSA; - VP8SSE16x16 = SSE16x16; - VP8SSE16x8 = SSE16x8; - VP8SSE8x8 = SSE8x8; - VP8SSE4x4 = SSE4x4; + VP8SSE16x16 = SSE16x16_MSA; + VP8SSE16x8 = SSE16x8_MSA; + VP8SSE8x8 = SSE8x8_MSA; + VP8SSE4x4 = SSE4x4_MSA; - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8EncQuantizeBlockWHT = QuantizeBlock; + VP8EncQuantizeBlock = QuantizeBlock_MSA; + VP8EncQuantize2Blocks = Quantize2Blocks_MSA; + VP8EncQuantizeBlockWHT = QuantizeBlock_MSA; } #else // !WEBP_USE_MSA diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 6a078d63..cfc31f47 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -116,8 +116,8 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { Transpose8x2(E0, E1, rows); } -static void ITransformOne(const uint8_t* ref, - const int16_t* in, uint8_t* dst) { +static void ITransformOne_NEON(const uint8_t* ref, + const int16_t* in, uint8_t* dst) { int16x8x2_t rows; INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); TransformPass(&rows); @@ -127,8 +127,8 @@ static void ITransformOne(const uint8_t* ref, #else -static void ITransformOne(const uint8_t* ref, - const int16_t* in, uint8_t* dst) { +static void ITransformOne_NEON(const uint8_t* ref, + const int16_t* in, uint8_t* dst) { const int kBPS = BPS; const int16_t kC1C2[] = { kC1, kC2, 0, 0 }; @@ -243,11 +243,11 @@ static void ITransformOne(const uint8_t* ref, #endif // WEBP_USE_INTRINSICS -static void ITransform(const uint8_t* ref, - const int16_t* in, uint8_t* dst, int do_two) { - ITransformOne(ref, in, dst); +static void ITransform_NEON(const uint8_t* ref, + const int16_t* in, uint8_t* dst, int do_two) { + ITransformOne_NEON(ref, in, dst); if (do_two) { - ITransformOne(ref + 4, in + 16, dst + 4); + ITransformOne_NEON(ref + 4, in + 16, dst + 4); } } @@ -288,8 +288,8 @@ static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a, return vreinterpretq_s16_u16(vsubl_u8(a, b)); } -static void FTransform(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_NEON(const uint8_t* src, const uint8_t* ref, + int16_t* out) { int16x8_t d0d1, d3d2; // working 4x4 int16 variables { const uint8x16_t S0 = Load4x4(src); @@ -358,8 +358,8 @@ static const int32_t kCoeff32[] = { 51000, 51000, 51000, 51000 }; -static void FTransform(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_NEON(const uint8_t* src, const uint8_t* ref, + int16_t* out) { const int kBPS = BPS; const uint8_t* src_ptr = src; const uint8_t* ref_ptr = ref; @@ -478,7 +478,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, src += stride; \ } while (0) -static void FTransformWHT(const int16_t* src, int16_t* out) { +static void FTransformWHT_NEON(const int16_t* src, int16_t* out) { const int stride = 16; const int16x4_t zero = vdup_n_s16(0); int32x4x4_t tmp0; @@ -652,8 +652,8 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in, // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { uint32x2_t d_in_ab_0123 = vdup_n_u32(0); uint32x2_t d_in_ab_4567 = vdup_n_u32(0); uint32x2_t d_in_ab_89ab = vdup_n_u32(0); @@ -694,13 +694,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b, } #undef LOAD_LANE_32b -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_NEON(a + x + y, b + x + y, w); } } return D; @@ -708,15 +708,15 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, //------------------------------------------------------------------------------ -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; for (j = start_block; j < end_block; ++j) { int16_t out[16]; - FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); + FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out); { int k; const int16x8_t a0 = vld1q_s16(out + 0); @@ -813,8 +813,8 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) { // Compilation with gcc-4.6.x is problematic for now. #if !defined(WORK_AROUND_GCC) -static int16x8_t Quantize(int16_t* const in, - const VP8Matrix* const mtx, int offset) { +static int16x8_t Quantize_NEON(int16_t* const in, + const VP8Matrix* const mtx, int offset) { const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); @@ -847,10 +847,10 @@ static const uint8_t kShuffles[4][8] = { { 14, 15, 22, 23, 28, 29, 30, 31 } }; -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { - const int16x8_t out0 = Quantize(in, mtx, 0); - const int16x8_t out1 = Quantize(in, mtx, 8); +static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + const int16x8_t out0 = Quantize_NEON(in, mtx, 0); + const int16x8_t out1 = Quantize_NEON(in, mtx, 8); uint8x8x4_t shuffles; // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // non-standard versions there. @@ -889,11 +889,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return 0; } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; - nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; - nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; + nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0; + nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1; return nz; } @@ -905,14 +905,14 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], extern void VP8EncDspInitNEON(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { - VP8ITransform = ITransform; - VP8FTransform = FTransform; + VP8ITransform = ITransform_NEON; + VP8FTransform = FTransform_NEON; - VP8FTransformWHT = FTransformWHT; + VP8FTransformWHT = FTransformWHT_NEON; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8CollectHistogram = CollectHistogram; + VP8TDisto4x4 = Disto4x4_NEON; + VP8TDisto16x16 = Disto16x16_NEON; + VP8CollectHistogram = CollectHistogram_NEON; VP8SSE16x16 = SSE16x16_NEON; VP8SSE16x8 = SSE16x8_NEON; @@ -920,8 +920,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { VP8SSE4x4 = SSE4x4_NEON; #if !defined(WORK_AROUND_GCC) - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; + VP8EncQuantizeBlock = QuantizeBlock_NEON; + VP8EncQuantize2Blocks = Quantize2Blocks_NEON; #endif } diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index e8cd366c..1d58db18 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -26,8 +26,8 @@ // Transforms (Paragraph 14.4) // Does one or two inverse transforms. -static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -193,10 +193,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, } } -static void FTransformPass1(const __m128i* const in01, - const __m128i* const in23, - __m128i* const out01, - __m128i* const out32) { +static void FTransformPass1_SSE2(const __m128i* const in01, + const __m128i* const in23, + __m128i* const out01, + __m128i* const out32) { const __m128i k937 = _mm_set1_epi32(937); const __m128i k1812 = _mm_set1_epi32(1812); @@ -239,8 +239,9 @@ static void FTransformPass1(const __m128i* const in01, *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. } -static void FTransformPass2(const __m128i* const v01, const __m128i* const v32, - int16_t* out) { +static void FTransformPass2_SSE2(const __m128i* const v01, + const __m128i* const v32, + int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, @@ -291,7 +292,8 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32, _mm_storeu_si128((__m128i*)&out[8], d2_f3); } -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref, + int16_t* out) { const __m128i zero = _mm_setzero_si128(); // Load src. const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); @@ -328,13 +330,14 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { __m128i v01, v32; // First pass - FTransformPass1(&row01, &row23, &v01, &v32); + FTransformPass1_SSE2(&row01, &row23, &v01, &v32); // Second pass - FTransformPass2(&v01, &v32, out); + FTransformPass2_SSE2(&v01, &v32, out); } -static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref, + int16_t* out) { const __m128i zero = _mm_setzero_si128(); // Load src and convert to 16b. @@ -374,15 +377,15 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { __m128i v01h, v32h; // First pass - FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l); - FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h); + FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l); + FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h); // Second pass - FTransformPass2(&v01l, &v32l, out + 0); - FTransformPass2(&v01h, &v32h, out + 16); + FTransformPass2_SSE2(&v01l, &v32l, out + 0); + FTransformPass2_SSE2(&v01h, &v32h, out + 16); } -static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { +static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) { const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1); const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]); @@ -398,14 +401,14 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { *out = _mm_madd_epi16(D, kMult); } -static void FTransformWHT(const int16_t* in, int16_t* out) { +static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) { // Input is 12b signed. __m128i row0, row1, row2, row3; // Rows are 14b signed. - FTransformWHTRow(in + 0 * 64, &row0); - FTransformWHTRow(in + 1 * 64, &row1); - FTransformWHTRow(in + 2 * 64, &row2); - FTransformWHTRow(in + 3 * 64, &row3); + FTransformWHTRow_SSE2(in + 0 * 64, &row0); + FTransformWHTRow_SSE2(in + 1 * 64, &row1); + FTransformWHTRow_SSE2(in + 2 * 64, &row2); + FTransformWHTRow_SSE2(in + 3 * 64, &row3); { // The a* are 15b signed. @@ -431,9 +434,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { // Compute susceptibility based on DCT-coeff histograms: // the higher, the "easier" the macroblock is to compress. -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { const __m128i zero = _mm_setzero_si128(); const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); int j; @@ -442,7 +445,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, int16_t out[16]; int k; - FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); + FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out); // Convert coefficients to bin (within out[]). { @@ -888,7 +891,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { // Left samples are top[-5 .. -2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -904,8 +907,8 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) { //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { // U block DC8uvMode(C8DC8 + dst, left, top); VerticalPred(C8VE8 + dst, top, 8); @@ -924,8 +927,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) -static void Intra16Preds(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_SSE2(uint8_t* dst, + const uint8_t* left, const uint8_t* top) { DC16Mode(I16DC16 + dst, left, top); VerticalPred(I16VE16 + dst, top, 16); HorizontalPred(I16HE16 + dst, left, 16); @@ -973,18 +976,18 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b, return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); } -static int SSE16x16(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) { return SSE_16xN(a, b, 8); } -static int SSE16x8(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) { return SSE_16xN(a, b, 4); } #define LOAD_8x16b(ptr) \ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero) -static int SSE8x8(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) { const __m128i zero = _mm_setzero_si128(); int num_pairs = 4; __m128i sum = zero; @@ -1011,7 +1014,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) { } #undef LOAD_8x16b -static int SSE4x4(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) { const __m128i zero = _mm_setzero_si128(); // Load values. Note that we read 8 pixels instead of 4, @@ -1048,7 +1051,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { //------------------------------------------------------------------------------ -static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { +static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) { const __m128i mask = _mm_set1_epi16(0x00ff); const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]); @@ -1086,8 +1089,8 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int TTransform(const uint8_t* inA, const uint8_t* inB, - const uint16_t* const w) { +static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB, + const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); @@ -1187,19 +1190,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, return sum[0] + sum[1] + sum[2] + sum[3]; } -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { - const int diff_sum = TTransform(a, b, w); +static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int diff_sum = TTransform_SSE2(a, b, w); return abs(diff_sum) >> 5; } -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_SSE2(a + x + y, b + x + y, w); } } return D; @@ -1346,24 +1349,24 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], extern void VP8EncDspInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) { - VP8CollectHistogram = CollectHistogram; - VP8EncPredLuma16 = Intra16Preds; - VP8EncPredChroma8 = IntraChromaPreds; - VP8EncPredLuma4 = Intra4Preds; + VP8CollectHistogram = CollectHistogram_SSE2; + VP8EncPredLuma16 = Intra16Preds_SSE2; + VP8EncPredChroma8 = IntraChromaPreds_SSE2; + VP8EncPredLuma4 = Intra4Preds_SSE2; VP8EncQuantizeBlock = QuantizeBlock; VP8EncQuantize2Blocks = Quantize2Blocks; VP8EncQuantizeBlockWHT = QuantizeBlockWHT; - VP8ITransform = ITransform; - VP8FTransform = FTransform; - VP8FTransform2 = FTransform2; - VP8FTransformWHT = FTransformWHT; - VP8SSE16x16 = SSE16x16; - VP8SSE16x8 = SSE16x8; - VP8SSE8x8 = SSE8x8; - VP8SSE4x4 = SSE4x4; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8Mean16x4 = Mean16x4; + VP8ITransform = ITransform_SSE2; + VP8FTransform = FTransform_SSE2; + VP8FTransform2 = FTransform2_SSE2; + VP8FTransformWHT = FTransformWHT_SSE2; + VP8SSE16x16 = SSE16x16_SSE2; + VP8SSE16x8 = SSE16x8_SSE2; + VP8SSE8x8 = SSE8x8_SSE2; + VP8SSE4x4 = SSE4x4_SSE2; + VP8TDisto4x4 = Disto4x4_SSE2; + VP8TDisto16x16 = Disto16x16_SSE2; + VP8Mean16x4 = Mean16x4_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/src/dsp/enc_sse41.c b/src/dsp/enc_sse41.c index e32086d9..a3f4b6be 100644 --- a/src/dsp/enc_sse41.c +++ b/src/dsp/enc_sse41.c @@ -23,9 +23,9 @@ //------------------------------------------------------------------------------ // Compute susceptibility based on DCT-coeff histograms. -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; @@ -70,8 +70,8 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int TTransform(const uint8_t* inA, const uint8_t* inB, - const uint16_t* const w) { +static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB, + const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; @@ -168,19 +168,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, return sum[0] + sum[1] + sum[2] + sum[3]; } -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { - const int diff_sum = TTransform(a, b, w); +static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int diff_sum = TTransform_SSE41(a, b, w); return abs(diff_sum) >> 5; } -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_SSE41(a + x + y, b + x + y, w); } } return D; @@ -197,9 +197,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, 2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \ 2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0) -static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], - const uint16_t* const sharpen, - const VP8Matrix* const mtx) { +static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16], + const uint16_t* const sharpen, + const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i out0, out8; @@ -300,22 +300,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], #undef PSHUFB_CST -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { - return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx); +static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx); } -static int QuantizeBlockWHT(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { - return DoQuantizeBlock(in, out, NULL, mtx); +static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + return DoQuantizeBlock_SSE41(in, out, NULL, mtx); } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; const uint16_t* const sharpen = &mtx->sharpen_[0]; - nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; - nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1; + nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; + nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1; return nz; } @@ -324,12 +324,12 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], extern void VP8EncDspInitSSE41(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) { - VP8CollectHistogram = CollectHistogram; - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8EncQuantizeBlockWHT = QuantizeBlockWHT; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; + VP8CollectHistogram = CollectHistogram_SSE41; + VP8EncQuantizeBlock = QuantizeBlock_SSE41; + VP8EncQuantize2Blocks = Quantize2Blocks_SSE41; + VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41; + VP8TDisto4x4 = Disto4x4_SSE41; + VP8TDisto16x16 = Disto16x16_SSE41; } #else // !WEBP_USE_SSE41