mirror of
https://github.com/webmproject/libwebp.git
synced 2024-12-27 06:08:21 +01:00
Perform two IDCT transforms at a time when possible
patch by Christian Duvivier (cduvivier at google dot com)
This commit is contained in:
parent
01af7b69cd
commit
96ed9ce0fb
@ -62,7 +62,7 @@ static const int kC1 = 20091 + (1 << 16);
|
|||||||
static const int kC2 = 35468;
|
static const int kC2 = 35468;
|
||||||
#define MUL(a, b) (((a) * (b)) >> 16)
|
#define MUL(a, b) (((a) * (b)) >> 16)
|
||||||
|
|
||||||
static void Transform(const int16_t* in, uint8_t* dst) {
|
static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||||
int C[4 * 4], *tmp;
|
int C[4 * 4], *tmp;
|
||||||
int i;
|
int i;
|
||||||
tmp = C;
|
tmp = C;
|
||||||
@ -102,11 +102,16 @@ static void Transform(const int16_t* in, uint8_t* dst) {
|
|||||||
}
|
}
|
||||||
#undef MUL
|
#undef MUL
|
||||||
|
|
||||||
|
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
|
||||||
|
TransformOne(in, dst);
|
||||||
|
if (do_two) {
|
||||||
|
TransformOne(in + 16, dst + 4);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void TransformUV(const int16_t* in, uint8_t* dst) {
|
static void TransformUV(const int16_t* in, uint8_t* dst) {
|
||||||
VP8Transform(in + 0 * 16, dst);
|
VP8Transform(in + 0 * 16, dst, 1);
|
||||||
VP8Transform(in + 1 * 16, dst + 4);
|
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
|
||||||
VP8Transform(in + 2 * 16, dst + 4 * BPS);
|
|
||||||
VP8Transform(in + 3 * 16, dst + 4 * BPS + 4);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void TransformDC(const int16_t *in, uint8_t* dst) {
|
static void TransformDC(const int16_t *in, uint8_t* dst) {
|
||||||
@ -129,7 +134,7 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
|
|||||||
#undef STORE
|
#undef STORE
|
||||||
|
|
||||||
// default C implementations:
|
// default C implementations:
|
||||||
VP8Idct VP8Transform = Transform;
|
VP8Idct2 VP8Transform = TransformTwo;
|
||||||
VP8Idct VP8TransformUV = TransformUV;
|
VP8Idct VP8TransformUV = TransformUV;
|
||||||
VP8Idct VP8TransformDC = TransformDC;
|
VP8Idct VP8TransformDC = TransformDC;
|
||||||
VP8Idct VP8TransformDCUV = TransformDCUV;
|
VP8Idct VP8TransformDCUV = TransformDCUV;
|
||||||
|
@ -22,7 +22,7 @@ extern "C" {
|
|||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
// Transforms (Paragraph 14.4)
|
// Transforms (Paragraph 14.4)
|
||||||
|
|
||||||
static void TransformSSE2(const int16_t* in, uint8_t* dst) {
|
static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
|
||||||
// This implementation makes use of 16-bit fixed point versions of two
|
// This implementation makes use of 16-bit fixed point versions of two
|
||||||
// multiply constants:
|
// multiply constants:
|
||||||
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
|
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
|
||||||
@ -43,8 +43,9 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst) {
|
|||||||
const __m128i k2 = _mm_set1_epi16(-30068);
|
const __m128i k2 = _mm_set1_epi16(-30068);
|
||||||
__m128i T0, T1, T2, T3;
|
__m128i T0, T1, T2, T3;
|
||||||
|
|
||||||
// Load the transform coefficients. The second half of the vectors will just
|
// Load and concatenate the transform coefficients (we'll do two transforms
|
||||||
// contain random values that we'll never use or store.
|
// in parallel). In the case of only one transform, the second half of the
|
||||||
|
// vectors will just contain random values that we'll never use or store.
|
||||||
__m128i in0, in1, in2, in3;
|
__m128i in0, in1, in2, in3;
|
||||||
{
|
{
|
||||||
in0 = _mm_loadl_epi64((__m128i*)&in[0]);
|
in0 = _mm_loadl_epi64((__m128i*)&in[0]);
|
||||||
@ -55,6 +56,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst) {
|
|||||||
// a01 a11 a21 a31 x x x x
|
// a01 a11 a21 a31 x x x x
|
||||||
// a02 a12 a22 a32 x x x x
|
// a02 a12 a22 a32 x x x x
|
||||||
// a03 a13 a23 a33 x x x x
|
// a03 a13 a23 a33 x x x x
|
||||||
|
if (do_two) {
|
||||||
|
const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
|
||||||
|
const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
|
||||||
|
const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
|
||||||
|
const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
|
||||||
|
in0 = _mm_unpacklo_epi64(in0, inB0);
|
||||||
|
in1 = _mm_unpacklo_epi64(in1, inB1);
|
||||||
|
in2 = _mm_unpacklo_epi64(in2, inB2);
|
||||||
|
in3 = _mm_unpacklo_epi64(in3, inB3);
|
||||||
|
// a00 a10 a20 a30 b00 b10 b20 b30
|
||||||
|
// a01 a11 a21 a31 b01 b11 b21 b31
|
||||||
|
// a02 a12 a22 a32 b02 b12 b22 b32
|
||||||
|
// a03 a13 a23 a33 b03 b13 b23 b33
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Vertical pass and subsequent transpose.
|
// Vertical pass and subsequent transpose.
|
||||||
@ -179,10 +194,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst) {
|
|||||||
{
|
{
|
||||||
const __m128i zero = _mm_set1_epi16(0);
|
const __m128i zero = _mm_set1_epi16(0);
|
||||||
// Load the reference(s).
|
// Load the reference(s).
|
||||||
__m128i dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
|
__m128i dst0, dst1, dst2, dst3;
|
||||||
__m128i dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
|
if (do_two) {
|
||||||
__m128i dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
|
// Load eight bytes/pixels per line.
|
||||||
__m128i dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
|
dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
|
||||||
|
dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
|
||||||
|
dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
|
||||||
|
dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
|
||||||
|
} else {
|
||||||
|
// Load four bytes/pixels per line.
|
||||||
|
dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
|
||||||
|
dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
|
||||||
|
dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
|
||||||
|
dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
|
||||||
|
}
|
||||||
// Convert to 16b.
|
// Convert to 16b.
|
||||||
dst0 = _mm_unpacklo_epi8(dst0, zero);
|
dst0 = _mm_unpacklo_epi8(dst0, zero);
|
||||||
dst1 = _mm_unpacklo_epi8(dst1, zero);
|
dst1 = _mm_unpacklo_epi8(dst1, zero);
|
||||||
@ -198,11 +223,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst) {
|
|||||||
dst1 = _mm_packus_epi16(dst1, dst1);
|
dst1 = _mm_packus_epi16(dst1, dst1);
|
||||||
dst2 = _mm_packus_epi16(dst2, dst2);
|
dst2 = _mm_packus_epi16(dst2, dst2);
|
||||||
dst3 = _mm_packus_epi16(dst3, dst3);
|
dst3 = _mm_packus_epi16(dst3, dst3);
|
||||||
// Store the results, four bytes/pixels per line.
|
// Store the results.
|
||||||
*((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
|
if (do_two) {
|
||||||
*((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
|
// Store eight bytes/pixels per line.
|
||||||
*((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
|
_mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
|
||||||
*((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
|
_mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
|
||||||
|
_mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
|
||||||
|
_mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
|
||||||
|
} else {
|
||||||
|
// Store four bytes/pixels per line.
|
||||||
|
*((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
|
||||||
|
*((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
|
||||||
|
*((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
|
||||||
|
*((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -379,7 +379,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
|
|||||||
uint8_t* const dst = y_dst + kScan[n];
|
uint8_t* const dst = y_dst + kScan[n];
|
||||||
VP8PredLuma4[dec->imodes_[n]](dst);
|
VP8PredLuma4[dec->imodes_[n]](dst);
|
||||||
if (dec->non_zero_ac_ & (1 << n)) {
|
if (dec->non_zero_ac_ & (1 << n)) {
|
||||||
VP8Transform(coeffs + n * 16, dst);
|
VP8Transform(coeffs + n * 16, dst, 0);
|
||||||
} else if (dec->non_zero_ & (1 << n)) { // only DC is present
|
} else if (dec->non_zero_ & (1 << n)) { // only DC is present
|
||||||
VP8TransformDC(coeffs + n * 16, dst);
|
VP8TransformDC(coeffs + n * 16, dst);
|
||||||
}
|
}
|
||||||
@ -391,7 +391,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
|
|||||||
for (n = 0; n < 16; n++) {
|
for (n = 0; n < 16; n++) {
|
||||||
uint8_t* const dst = y_dst + kScan[n];
|
uint8_t* const dst = y_dst + kScan[n];
|
||||||
if (dec->non_zero_ac_ & (1 << n)) {
|
if (dec->non_zero_ac_ & (1 << n)) {
|
||||||
VP8Transform(coeffs + n * 16, dst);
|
VP8Transform(coeffs + n * 16, dst, 0);
|
||||||
} else if (dec->non_zero_ & (1 << n)) { // only DC is present
|
} else if (dec->non_zero_ & (1 << n)) { // only DC is present
|
||||||
VP8TransformDC(coeffs + n * 16, dst);
|
VP8TransformDC(coeffs + n * 16, dst);
|
||||||
}
|
}
|
||||||
|
@ -297,7 +297,9 @@ int VP8DecodeLayer(VP8Decoder* const dec);
|
|||||||
|
|
||||||
// in dsp.c
|
// in dsp.c
|
||||||
typedef void (*VP8Idct)(const int16_t* coeffs, uint8_t* dst);
|
typedef void (*VP8Idct)(const int16_t* coeffs, uint8_t* dst);
|
||||||
extern VP8Idct VP8Transform;
|
// When doing two transforms, coeffs actually points to an int16_t[2][16] array.
|
||||||
|
typedef void (*VP8Idct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
|
||||||
|
extern VP8Idct2 VP8Transform;
|
||||||
extern VP8Idct VP8TransformUV;
|
extern VP8Idct VP8TransformUV;
|
||||||
extern VP8Idct VP8TransformDC;
|
extern VP8Idct VP8TransformDC;
|
||||||
extern VP8Idct VP8TransformDCUV;
|
extern VP8Idct VP8TransformDCUV;
|
||||||
|
Loading…
Reference in New Issue
Block a user