mirror of
https://github.com/webmproject/libwebp.git
synced 2024-11-20 04:18:26 +01:00
rescaler: ~20% faster SSE2 implementation for lossless ImportRowExpand
lossy (1-channel) speed-up is more on the 5% side. Change-Id: Id19d97b9e9a34804b59604a5b48f94a37fdafd62
This commit is contained in:
parent
99e3f8128a
commit
67c547fdcd
@ -25,6 +25,89 @@
|
||||
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
|
||||
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
|
||||
|
||||
// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
|
||||
static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
|
||||
const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0
|
||||
const __m128i C = _mm_srli_si128(B, 8); // E0F0G0H0
|
||||
*out = _mm_unpacklo_epi16(B, C);
|
||||
}
|
||||
|
||||
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
|
||||
static void LoadHeightPixels(const uint8_t* const src, __m128i* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
|
||||
*out = _mm_unpacklo_epi8(A, zero);
|
||||
}
|
||||
|
||||
static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
|
||||
const uint8_t* src) {
|
||||
rescaler_t* frow = wrk->frow;
|
||||
const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
|
||||
const int x_add = wrk->x_add;
|
||||
int accum = x_add;
|
||||
__m128i cur_pixels;
|
||||
|
||||
assert(!WebPRescalerInputDone(wrk));
|
||||
assert(wrk->x_expand);
|
||||
if (wrk->num_channels == 4) {
|
||||
if (wrk->src_width < 2) {
|
||||
WebPRescalerImportRowExpandC(wrk, src);
|
||||
return;
|
||||
}
|
||||
LoadTwoPixels(src, &cur_pixels);
|
||||
src += 4;
|
||||
while (1) {
|
||||
const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
|
||||
const __m128i out = _mm_madd_epi16(cur_pixels, mult);
|
||||
_mm_storeu_si128((__m128i*)frow, out);
|
||||
frow += 4;
|
||||
if (frow >= frow_end) break;
|
||||
accum -= wrk->x_sub;
|
||||
if (accum < 0) {
|
||||
LoadTwoPixels(src, &cur_pixels);
|
||||
src += 4;
|
||||
accum += x_add;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int left;
|
||||
const uint8_t* const src_limit = src + wrk->src_width - 8;
|
||||
if (wrk->src_width < 8) {
|
||||
WebPRescalerImportRowExpandC(wrk, src);
|
||||
return;
|
||||
}
|
||||
LoadHeightPixels(src, &cur_pixels);
|
||||
src += 7;
|
||||
left = 7;
|
||||
while (1) {
|
||||
const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
|
||||
const __m128i out = _mm_madd_epi16(cur_pixels, mult);
|
||||
*(uint32_t*)frow = _mm_cvtsi128_si32(out);
|
||||
frow += 1;
|
||||
if (frow >= frow_end) break;
|
||||
accum -= wrk->x_sub;
|
||||
if (accum < 0) {
|
||||
if (--left) {
|
||||
cur_pixels = _mm_srli_si128(cur_pixels, 2);
|
||||
} else if (src <= src_limit) {
|
||||
LoadHeightPixels(src, &cur_pixels);
|
||||
src += 7;
|
||||
left = 7;
|
||||
} else { // tail
|
||||
cur_pixels = _mm_srli_si128(cur_pixels, 2);
|
||||
cur_pixels = _mm_insert_epi16(cur_pixels, src[1], 1);
|
||||
src += 1;
|
||||
left = 1;
|
||||
}
|
||||
accum += x_add;
|
||||
}
|
||||
}
|
||||
}
|
||||
assert(accum == 0);
|
||||
}
|
||||
|
||||
static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
|
||||
const uint8_t* src) {
|
||||
const int x_sub = wrk->x_sub;
|
||||
@ -277,6 +360,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
|
||||
extern void WebPRescalerDspInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
|
||||
WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
|
||||
WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
|
||||
WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
|
||||
WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
|
||||
|
Loading…
Reference in New Issue
Block a user