rescaler: ~20% faster SSE2 implementation for lossless ImportRowExpand

lossy (1-channel) speed-up is more on the 5% side.

Change-Id: Id19d97b9e9a34804b59604a5b48f94a37fdafd62
This commit is contained in:
Pascal Massimino 2015-10-14 07:32:12 +02:00
parent 99e3f8128a
commit 67c547fdcd

View File

@ -25,6 +25,89 @@
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0
const __m128i C = _mm_srli_si128(B, 8); // E0F0G0H0
*out = _mm_unpacklo_epi16(B, C);
}
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
static void LoadHeightPixels(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
*out = _mm_unpacklo_epi8(A, zero);
}
static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
const uint8_t* src) {
rescaler_t* frow = wrk->frow;
const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
int accum = x_add;
__m128i cur_pixels;
assert(!WebPRescalerInputDone(wrk));
assert(wrk->x_expand);
if (wrk->num_channels == 4) {
if (wrk->src_width < 2) {
WebPRescalerImportRowExpandC(wrk, src);
return;
}
LoadTwoPixels(src, &cur_pixels);
src += 4;
while (1) {
const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
const __m128i out = _mm_madd_epi16(cur_pixels, mult);
_mm_storeu_si128((__m128i*)frow, out);
frow += 4;
if (frow >= frow_end) break;
accum -= wrk->x_sub;
if (accum < 0) {
LoadTwoPixels(src, &cur_pixels);
src += 4;
accum += x_add;
}
}
} else {
int left;
const uint8_t* const src_limit = src + wrk->src_width - 8;
if (wrk->src_width < 8) {
WebPRescalerImportRowExpandC(wrk, src);
return;
}
LoadHeightPixels(src, &cur_pixels);
src += 7;
left = 7;
while (1) {
const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
const __m128i out = _mm_madd_epi16(cur_pixels, mult);
*(uint32_t*)frow = _mm_cvtsi128_si32(out);
frow += 1;
if (frow >= frow_end) break;
accum -= wrk->x_sub;
if (accum < 0) {
if (--left) {
cur_pixels = _mm_srli_si128(cur_pixels, 2);
} else if (src <= src_limit) {
LoadHeightPixels(src, &cur_pixels);
src += 7;
left = 7;
} else { // tail
cur_pixels = _mm_srli_si128(cur_pixels, 2);
cur_pixels = _mm_insert_epi16(cur_pixels, src[1], 1);
src += 1;
left = 1;
}
accum += x_add;
}
}
}
assert(accum == 0);
}
static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_sub = wrk->x_sub;
@ -277,6 +360,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;