diff --git a/src/dsp/enc.c b/src/dsp/enc.c index ddf7a0e5..ae2c830a 100644 --- a/src/dsp/enc.c +++ b/src/dsp/enc.c @@ -134,25 +134,25 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { int i; int tmp[16]; for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { - const int d0 = src[0] - ref[0]; + const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255]) const int d1 = src[1] - ref[1]; const int d2 = src[2] - ref[2]; const int d3 = src[3] - ref[3]; - const int a0 = (d0 + d3) << 3; - const int a1 = (d1 + d2) << 3; - const int a2 = (d1 - d2) << 3; - const int a3 = (d0 - d3) << 3; - tmp[0 + i * 4] = (a0 + a1); - tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12; - tmp[2 + i * 4] = (a0 - a1); - tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 7500) >> 12; + const int a0 = (d0 + d3); // 10b [-510,510] + const int a1 = (d1 + d2); + const int a2 = (d1 - d2); + const int a3 = (d0 - d3); + tmp[0 + i * 4] = (a0 + a1) << 3; // 14b [-8160,8160] + tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542] + tmp[2 + i * 4] = (a0 - a1) << 3; + tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9; } for (i = 0; i < 4; ++i) { - const int a0 = (tmp[0 + i] + tmp[12 + i]); + const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b const int a1 = (tmp[4 + i] + tmp[ 8 + i]); const int a2 = (tmp[4 + i] - tmp[ 8 + i]); const int a3 = (tmp[0 + i] - tmp[12 + i]); - out[0 + i] = (a0 + a1 + 7) >> 4; + out[0 + i] = (a0 + a1 + 7) >> 4; // 12b out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0); out[8 + i] = (a0 - a1 + 7) >> 4; out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16); diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c index 65271106..29fd612d 100644 --- a/src/dsp/enc_neon.c +++ b/src/dsp/enc_neon.c @@ -222,15 +222,15 @@ static const int16_t kCoeff16[] = { 5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217 }; static const int32_t kCoeff32[] = { - 14500, 14500, 14500, 14500, - 7500, 7500, 7500, 7500, + 1812, 1812, 1812, 1812, + 937, 937, 
937, 937, 12000, 12000, 12000, 12000, 51000, 51000, 51000, 51000 }; static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { - const int kBPS = BPS; + const int kBPS = BPS; const uint8_t* src_ptr = src; const uint8_t* ref_ptr = ref; int16_t* coeff16 = kCoeff16; @@ -253,45 +253,45 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, "vtrn.32 q4, q5 \n" "vtrn.32 q6, q7 \n" - // {q0, q1} = q4 - q6 + // d[0-3] = src - ref "vsubl.u8 q0, d8, d12 \n" "vsubl.u8 q1, d9, d13 \n" // load coeff16 into q8(d16=5352, d17=2217) "vld1.16 {q8}, [%[coeff16]] \n" - // load coeff32 high half into q9 = 14500, q10 = 7500 + // load coeff32 high half into q9 = 1812, q10 = 937 "vld1.32 {q9, q10}, [%[coeff32]]! \n" // load coeff32 low half into q11=12000, q12=51000 "vld1.32 {q11,q12}, [%[coeff32]] \n" // part 1 - // transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3] + // Transpose. Register dN is the same as dN in C "vtrn.32 d0, d2 \n" "vtrn.32 d1, d3 \n" "vtrn.16 d0, d1 \n" "vtrn.16 d2, d3 \n" - "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[3] - "vadd.s16 d5, d1, d2 \n" // b1 = ip[1] + ip[2] - "vsub.s16 d6, d1, d2 \n" // c1 = ip[1] - ip[2] - "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[3] + "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3 + "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2 + "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2 + "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3 - "vshl.s16 q2, q2, #3 \n" // (a1, b1) << 3 - "vshl.s16 q3, q3, #3 \n" // (c1, d1) << 3 + "vadd.s16 d0, d4, d5 \n" // a0 + a1 + "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3 + "vsub.s16 d2, d4, d5 \n" // a0 - a1 + "vshl.s16 d2, d2, #3 \n" // temp[2+i*4] = (a0-a1) << 3 - "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 - "vsub.s16 d2, d4, d5 \n" // op[2] = a1 - b1 - "vmlal.s16 q9, d7, d16 \n" // d1*5352 + 14500 - "vmlal.s16 q10, d7, d17 \n" // d1*2217 + 7500 - "vmlal.s16 q9, d6, d17 \n" // c1*2217 + d1*5352 + 14500 - "vmlsl.s16 q10, d6, d16 \n" // d1*2217 - c1*5352 + 7500 + "vmlal.s16 q9, d7, d16 \n" // 
a3*5352 + 1812 + "vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937 + "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812 + "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352 - // op[1] = (c1*2217 + d1*5352 + 14500) >> 12 - // op[3] = (d1*2217 - c1*5352 + 7500) >> 12 - "vshrn.s32 d1, q9, #12 \n" - "vshrn.s32 d3, q10, #12 \n" + // temp[1+i*4] = (a2*2217 + a3*5352 + 1812) >> 9 + // temp[3+i*4] = (a3*2217 + 937 - a2*5352) >> 9 + "vshrn.s32 d1, q9, #9 \n" + "vshrn.s32 d3, q10, #9 \n" // part 2 // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] @@ -398,7 +398,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { "vqsub.s32 q6, q3, q2 \n" // b2 = a3 - a2 "vqsub.s32 q7, q0, q1 \n" // b3 = a0 - a1 - "vmov.32 q0, #3 \n" // q0 = 3 + "vmov.s32 q0, #3 \n" // q0 = 3 "vcgt.s32 q1, q4, #0 \n" // (b0>0) "vqsub.s32 q2, q4, q1 \n" // (b0+(b0>0)) diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c index c296e5b4..45da22d4 100644 --- a/src/dsp/enc_sse2.c +++ b/src/dsp/enc_sse2.c @@ -295,8 +295,8 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); - const __m128i k7500 = _mm_set1_epi32(7500); - const __m128i k14500 = _mm_set1_epi32(14500); + const __m128i k937 = _mm_set1_epi32(937); + const __m128i k1812 = _mm_set1_epi32(1812); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, @@ -352,32 +352,32 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, // First pass and subsequent transpose. { // Same operations are done on the (0,3) and (1,2) pairs. 
- // b0 = (a0 + a3) << 3 - // b1 = (a1 + a2) << 3 - // b3 = (a0 - a3) << 3 - // b2 = (a1 - a2) << 3 + // b0 = (a0 + a3) + // b1 = (a1 + a2) + // b3 = (a0 - a3) + // b2 = (a1 - a2) const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); - const __m128i b01 = _mm_slli_epi16(a01, 3); - const __m128i b32 = _mm_slli_epi16(a32, 3); - const __m128i b11 = _mm_unpackhi_epi64(b01, b01); - const __m128i b22 = _mm_unpackhi_epi64(b32, b32); + const __m128i b11 = _mm_unpackhi_epi64(a01, a01); + const __m128i b22 = _mm_unpackhi_epi64(a32, a32); - // e0 = b0 + b1 - // e2 = b0 - b1 - const __m128i e0 = _mm_add_epi16(b01, b11); - const __m128i e2 = _mm_sub_epi16(b01, b11); - const __m128i e02 = _mm_unpacklo_epi64(e0, e2); + // e0 = (b0 + b1) + // e2 = (b0 - b1) + const __m128i e0 = _mm_add_epi16(a01, b11); + const __m128i e2 = _mm_sub_epi16(a01, b11); + // e02 = [e0 | e2] << 3 + const __m128i e0_e2 = _mm_unpacklo_epi64(e0, e2); + const __m128i e02 = _mm_slli_epi16(e0_e2, 3); - // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12 - // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12 - const __m128i b23 = _mm_unpacklo_epi16(b22, b32); + // e1 = (b3 * 5352 + b2 * 2217 + 1812) >> 9 + // e3 = (b3 * 2217 - b2 * 5352 + 937) >> 9 + const __m128i b23 = _mm_unpacklo_epi16(b22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); - const __m128i d1 = _mm_add_epi32(c1, k14500); - const __m128i d3 = _mm_add_epi32(c3, k7500); - const __m128i e1 = _mm_srai_epi32(d1, 12); - const __m128i e3 = _mm_srai_epi32(d3, 12); + const __m128i d1 = _mm_add_epi32(c1, k1812); + const __m128i d3 = _mm_add_epi32(c3, k937); + const __m128i e1 = _mm_srai_epi32(d1, 9); + const __m128i e3 = _mm_srai_epi32(d3, 9); const __m128i e13 = _mm_packs_epi32(e1, e3); // Transpose. 
@@ -406,13 +406,12 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); + const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; - const __m128i b0 = _mm_add_epi16(a01, a11); - const __m128i b2 = _mm_sub_epi16(a01, a11); - const __m128i c0 = _mm_add_epi16(b0, seven); - const __m128i c2 = _mm_add_epi16(b2, seven); + const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); + const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); @@ -430,6 +429,7 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. + // -> f1 = f1 + 1 - (a3 == 0) const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); _mm_storel_epi64((__m128i*)&out[ 0], d0);