mirror of
https://github.com/webmproject/libwebp.git
synced 2025-02-13 23:42:53 +01:00
Merge "rescaler: ~20% faster SSE2 implementation for lossless ImportRowExpand"
This commit is contained in:
commit
a26408389a
@ -25,6 +25,89 @@
|
|||||||
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
|
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
|
||||||
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
|
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
|
||||||
|
|
||||||
|
// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
|
||||||
|
static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
|
||||||
|
const __m128i zero = _mm_setzero_si128();
|
||||||
|
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
|
||||||
|
const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0
|
||||||
|
const __m128i C = _mm_srli_si128(B, 8); // E0F0G0H0
|
||||||
|
*out = _mm_unpacklo_epi16(B, C);
|
||||||
|
}
|
||||||
|
|
||||||
|
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
|
||||||
|
static void LoadHeightPixels(const uint8_t* const src, __m128i* out) {
|
||||||
|
const __m128i zero = _mm_setzero_si128();
|
||||||
|
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
|
||||||
|
*out = _mm_unpacklo_epi8(A, zero);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
|
||||||
|
const uint8_t* src) {
|
||||||
|
rescaler_t* frow = wrk->frow;
|
||||||
|
const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
|
||||||
|
const int x_add = wrk->x_add;
|
||||||
|
int accum = x_add;
|
||||||
|
__m128i cur_pixels;
|
||||||
|
|
||||||
|
assert(!WebPRescalerInputDone(wrk));
|
||||||
|
assert(wrk->x_expand);
|
||||||
|
if (wrk->num_channels == 4) {
|
||||||
|
if (wrk->src_width < 2) {
|
||||||
|
WebPRescalerImportRowExpandC(wrk, src);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
LoadTwoPixels(src, &cur_pixels);
|
||||||
|
src += 4;
|
||||||
|
while (1) {
|
||||||
|
const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
|
||||||
|
const __m128i out = _mm_madd_epi16(cur_pixels, mult);
|
||||||
|
_mm_storeu_si128((__m128i*)frow, out);
|
||||||
|
frow += 4;
|
||||||
|
if (frow >= frow_end) break;
|
||||||
|
accum -= wrk->x_sub;
|
||||||
|
if (accum < 0) {
|
||||||
|
LoadTwoPixels(src, &cur_pixels);
|
||||||
|
src += 4;
|
||||||
|
accum += x_add;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int left;
|
||||||
|
const uint8_t* const src_limit = src + wrk->src_width - 8;
|
||||||
|
if (wrk->src_width < 8) {
|
||||||
|
WebPRescalerImportRowExpandC(wrk, src);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
LoadHeightPixels(src, &cur_pixels);
|
||||||
|
src += 7;
|
||||||
|
left = 7;
|
||||||
|
while (1) {
|
||||||
|
const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
|
||||||
|
const __m128i out = _mm_madd_epi16(cur_pixels, mult);
|
||||||
|
*(uint32_t*)frow = _mm_cvtsi128_si32(out);
|
||||||
|
frow += 1;
|
||||||
|
if (frow >= frow_end) break;
|
||||||
|
accum -= wrk->x_sub;
|
||||||
|
if (accum < 0) {
|
||||||
|
if (--left) {
|
||||||
|
cur_pixels = _mm_srli_si128(cur_pixels, 2);
|
||||||
|
} else if (src <= src_limit) {
|
||||||
|
LoadHeightPixels(src, &cur_pixels);
|
||||||
|
src += 7;
|
||||||
|
left = 7;
|
||||||
|
} else { // tail
|
||||||
|
cur_pixels = _mm_srli_si128(cur_pixels, 2);
|
||||||
|
cur_pixels = _mm_insert_epi16(cur_pixels, src[1], 1);
|
||||||
|
src += 1;
|
||||||
|
left = 1;
|
||||||
|
}
|
||||||
|
accum += x_add;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert(accum == 0);
|
||||||
|
}
|
||||||
|
|
||||||
static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
|
static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
|
||||||
const uint8_t* src) {
|
const uint8_t* src) {
|
||||||
const int x_sub = wrk->x_sub;
|
const int x_sub = wrk->x_sub;
|
||||||
@ -277,6 +360,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
|
|||||||
extern void WebPRescalerDspInitSSE2(void);
|
extern void WebPRescalerDspInitSSE2(void);
|
||||||
|
|
||||||
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
|
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
|
||||||
|
WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
|
||||||
WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
|
WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
|
||||||
WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
|
WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
|
||||||
WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
|
WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user