// Patch summary (descriptive text placed ahead of the first "diff --git" header).
//
// src/dsp/enc.c
//   * FTransform: the two "<< 3" scalings of (a0 + a1) and (a0 - a1) are
//     rewritten as "* 8". The range annotation kept on the first of them,
//     [-8160,8160], includes negative values.
//   * FTransformWHT: tmp[] changes from int to int16_t. The "<< 2" pre-scaling
//     of a0..a3 and the "+ (a0 != 0)" term in the first pass are removed, and
//     each output changes from "(b + (b > 0) + 3) >> 3" to "b >> 1". New
//     trailing comments track the bit width at each stage: 12b signed input,
//     13b, 14b, 15b, 16b, then 15b output.
//     NOTE(review): the rounding terms are dropped rather than re-expressed, so
//     the new results are presumably not bit-identical to the old ones for
//     every input -- confirm this is intended.
//
// src/dsp/enc_neon.c
//   * FTransformWHT inline asm: the four "vshl.s32 ..., #2" instructions, the
//     vceq/vmvn "(a0 != 0)" adjustment and the per-output vcgt/vqsub/vqadd
//     rounding sequence (including the "vmov.s32 q0, #3" constant) are removed.
//     Each output is now produced by a single "vshrn.s32 ..., #1".
//
// src/dsp/enc_sse2.c
//   * Adds FTransformWHTSSE2. The first pass is scalar and fills int16_t
//     tmp[16] from in[0*16], in[1*16], in[2*16], in[3*16], advancing "in" by 64
//     per iteration. The second pass loads four 64-bit rows with
//     _mm_loadl_epi64, combines them with _mm_add_epi16/_mm_sub_epi16, then
//     with the saturating _mm_adds_epi16/_mm_subs_epi16, shifts right by 1 with
//     _mm_srai_epi16, and stores to out[0], out[4], out[8], out[12] with
//     _mm_storel_epi64.
//   * VP8EncDspInitSSE2 now also assigns VP8FTransformWHT = FTransformWHTSSE2.
//
// NOTE(review): the hunk headers declare multi-line hunks (e.g. "-144,9 +144,9",
// "-189,31 +189,32"), but the text below sits on four physical lines, so the
// original line breaks and indentation appear to have been lost. They must be
// restored before this patch can be applied.
diff --git a/src/dsp/enc.c b/src/dsp/enc.c index 00ee80c9..552807ad 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -144,9 +144,9 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { const int a1 = (d1 + d2); const int a2 = (d1 - d2); const int a3 = (d0 - d3); - tmp[0 + i * 4] = (a0 + a1) << 3; // 14b [-8160,8160] + tmp[0 + i * 4] = (a0 + a1) * 8; // 14b [-8160,8160] tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542] - tmp[2 + i * 4] = (a0 - a1) << 3; + tmp[2 + i * 4] = (a0 - a1) * 8; tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9; } for (i = 0; i < 4; ++i) { @@ -189,31 +189,32 @@ static void ITransformWHT(const int16_t* in, int16_t* out) { } static void FTransformWHT(const int16_t* in, int16_t* out) { - int tmp[16]; + // input is 12b signed + int16_t tmp[16]; int i; for (i = 0; i < 4; ++i, in += 64) { - const int a0 = (in[0 * 16] + in[2 * 16]) << 2; - const int a1 = (in[1 * 16] + in[3 * 16]) << 2; - const int a2 = (in[1 * 16] - in[3 * 16]) << 2; - const int a3 = (in[0 * 16] - in[2 * 16]) << 2; - tmp[0 + i * 4] = (a0 + a1) + (a0 != 0); + const int a0 = (in[0 * 16] + in[2 * 16]); // 13b + const int a1 = (in[1 * 16] + in[3 * 16]); + const int a2 = (in[1 * 16] - in[3 * 16]); + const int a3 = (in[0 * 16] - in[2 * 16]); + tmp[0 + i * 4] = a0 + a1; // 14b tmp[1 + i * 4] = a3 + a2; tmp[2 + i * 4] = a3 - a2; tmp[3 + i * 4] = a0 - a1; } for (i = 0; i < 4; ++i) { - const int a0 = (tmp[0 + i] + tmp[8 + i]); + const int a0 = (tmp[0 + i] + tmp[8 + i]); // 15b const int a1 = (tmp[4 + i] + tmp[12+ i]); const int a2 = (tmp[4 + i] - tmp[12+ i]); const int a3 = (tmp[0 + i] - tmp[8 + i]); - const int b0 = a0 + a1; + const int b0 = a0 + a1; // 16b const int b1 = a3 + a2; const int b2 = a3 - a2; const int b3 = a0 - a1; - out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3; - out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3; - out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3; - out[12 + i] = (b3 + (b3 > 0) + 3) >> 3; + out[ 0 + i] = b0 >> 1; // 15b + out[ 4 + i] = 
b1 >> 1; + out[ 8 + i] = b2 >> 1; + out[12 + i] = b3 >> 1; } } diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 6239a766..eb256e68 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -365,19 +365,12 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { "vld1.16 d2[3], [%[in]], %[kStep] \n" "vld1.16 d3[3], [%[in]], %[kStep] \n" - "vaddl.s16 q2, d0, d2 \n" - "vshl.s32 q2, q2, #2 \n" // a0=(in[0*16]+in[2*16])<<2 - "vaddl.s16 q3, d1, d3 \n" - "vshl.s32 q3, q3, #2 \n" // a1=(in[1*16]+in[3*16])<<2 - "vsubl.s16 q4, d1, d3 \n" - "vshl.s32 q4, q4, #2 \n" // a2=(in[1*16]-in[3*16])<<2 - "vsubl.s16 q5, d0, d2 \n" - "vshl.s32 q5, q5, #2 \n" // a3=(in[0*16]-in[2*16])<<2 + "vaddl.s16 q2, d0, d2 \n" // a0=(in[0*16]+in[2*16]) + "vaddl.s16 q3, d1, d3 \n" // a1=(in[1*16]+in[3*16]) + "vsubl.s16 q4, d1, d3 \n" // a2=(in[1*16]-in[3*16]) + "vsubl.s16 q5, d0, d2 \n" // a3=(in[0*16]-in[2*16]) - "vceq.s32 q10, q2, #0 \n" - "vmvn.s32 q10, q10 \n" // (a0 != 0) - "vqadd.s32 q6, q2, q3 \n" // (a0 + a1) - "vqsub.s32 q6, q6, q10 \n" // (a0 + a1) + (a0 != 0) + "vqadd.s32 q6, q2, q3 \n" // a0 + a1 "vqadd.s32 q7, q5, q4 \n" // a3 + a2 "vqsub.s32 q8, q5, q4 \n" // a3 - a2 "vqsub.s32 q9, q2, q3 \n" // a0 - a1 @@ -400,27 +393,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { "vqsub.s32 q6, q3, q2 \n" // b2 = a3 - a2 "vqsub.s32 q7, q0, q1 \n" // b3 = a0 - a1 - "vmov.s32 q0, #3 \n" // q0 = 3 - - "vcgt.s32 q1, q4, #0 \n" // (b0>0) - "vqsub.s32 q2, q4, q1 \n" // (b0+(b0>0)) - "vqadd.s32 q3, q2, q0 \n" // (b0+(b0>0)+3) - "vshrn.s32 d18, q3, #3 \n" // (b0+(b0>0)+3) >> 3 - - "vcgt.s32 q1, q5, #0 \n" // (b1>0) - "vqsub.s32 q2, q5, q1 \n" // (b1+(b1>0)) - "vqadd.s32 q3, q2, q0 \n" // (b1+(b1>0)+3) - "vshrn.s32 d19, q3, #3 \n" // (b1+(b1>0)+3) >> 3 - - "vcgt.s32 q1, q6, #0 \n" // (b2>0) - "vqsub.s32 q2, q6, q1 \n" // (b2+(b2>0)) - "vqadd.s32 q3, q2, q0 \n" // (b2+(b2>0)+3) - "vshrn.s32 d20, q3, #3 \n" // (b2+(b2>0)+3) >> 3 - - "vcgt.s32 q1, q7, #0 \n" // (b3>0) - 
"vqsub.s32 q2, q7, q1 \n" // (b3+(b3>0)) - "vqadd.s32 q3, q2, q0 \n" // (b3+(b3>0)+3) - "vshrn.s32 d21, q3, #3 \n" // (b3+(b3>0)+3) >> 3 + "vshrn.s32 d18, q4, #1 \n" // b0 >> 1 + "vshrn.s32 d19, q5, #1 \n" // b1 >> 1 + "vshrn.s32 d20, q6, #1 \n" // b2 >> 1 + "vshrn.s32 d21, q7, #1 \n" // b3 >> 1 "vst1.16 {q9, q10}, [%[out]] \n" diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index bb0b2a29..032e9907 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -455,6 +455,39 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, } } +static void FTransformWHTSSE2(const int16_t* in, int16_t* out) { + int16_t tmp[16]; + int i; + for (i = 0; i < 4; ++i, in += 64) { + const int a0 = (in[0 * 16] + in[2 * 16]); + const int a1 = (in[1 * 16] + in[3 * 16]); + const int a2 = (in[1 * 16] - in[3 * 16]); + const int a3 = (in[0 * 16] - in[2 * 16]); + tmp[0 + i * 4] = a0 + a1; + tmp[1 + i * 4] = a3 + a2; + tmp[2 + i * 4] = a3 - a2; + tmp[3 + i * 4] = a0 - a1; + } + { + const __m128i src0 = _mm_loadl_epi64((__m128i*)&tmp[0]); + const __m128i src1 = _mm_loadl_epi64((__m128i*)&tmp[4]); + const __m128i src2 = _mm_loadl_epi64((__m128i*)&tmp[8]); + const __m128i src3 = _mm_loadl_epi64((__m128i*)&tmp[12]); + const __m128i a0 = _mm_add_epi16(src0, src2); + const __m128i a1 = _mm_add_epi16(src1, src3); + const __m128i a2 = _mm_sub_epi16(src1, src3); + const __m128i a3 = _mm_sub_epi16(src0, src2); + const __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(a0, a1), 1); + const __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(a3, a2), 1); + const __m128i b2 = _mm_srai_epi16(_mm_subs_epi16(a3, a2), 1); + const __m128i b3 = _mm_srai_epi16(_mm_subs_epi16(a0, a1), 1); + _mm_storel_epi64((__m128i*)&out[ 0], b0); + _mm_storel_epi64((__m128i*)&out[ 4], b1); + _mm_storel_epi64((__m128i*)&out[ 8], b2); + _mm_storel_epi64((__m128i*)&out[12], b3); + } +} + //------------------------------------------------------------------------------ // Metric @@ -921,6 +954,7 @@ void VP8EncDspInitSSE2(void) { 
VP8EncQuantizeBlock = QuantizeBlockSSE2; VP8ITransform = ITransformSSE2; VP8FTransform = FTransformSSE2; + VP8FTransformWHT = FTransformWHTSSE2; VP8SSE16x16 = SSE16x16SSE2; VP8SSE16x8 = SSE16x8SSE2; VP8SSE8x8 = SSE8x8SSE2;