mirror of
https://github.com/webmproject/libwebp.git
synced 2025-04-19 23:36:45 +02:00
Speed DispatchAlpha_SSE2 up
On some dataset, this was taking 2.5%. It goes down to 2% when switching to _mm_maskmoveu_si128, and to 1.7% when using _mm_loadu_si128. Confirmed by IACA: going from a throughput of 4.26 to 3.5, and then to 6.26 for twice the input. Change-Id: I409f901aaad9d39bf55a1aac28cc25f126876b01
This commit is contained in:
parent
ee8e8c620f
commit
44f91b0ddd
@ -16,6 +16,8 @@
|
|||||||
#if defined(WEBP_USE_SSE2)
|
#if defined(WEBP_USE_SSE2)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
|
#include "src/dsp/cpu.h"
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
|
static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
|
||||||
@ -26,38 +28,44 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
|
|||||||
uint32_t alpha_and = 0xff;
|
uint32_t alpha_and = 0xff;
|
||||||
int i, j;
|
int i, j;
|
||||||
const __m128i zero = _mm_setzero_si128();
|
const __m128i zero = _mm_setzero_si128();
|
||||||
const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00); // to preserve RGB
|
const __m128i alpha_mask = _mm_set1_epi32((int)0xff); // to preserve A
|
||||||
const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0);
|
const __m128i all_0xff = _mm_set1_epi8(0xff);
|
||||||
__m128i all_alphas = all_0xff;
|
__m128i all_alphas16 = all_0xff;
|
||||||
|
__m128i all_alphas8 = all_0xff;
|
||||||
|
|
||||||
// We must be able to access 3 extra bytes after the last written byte
|
// We must be able to access 3 extra bytes after the last written byte
|
||||||
// 'dst[4 * width - 4]', because we don't know if alpha is the first or the
|
// 'dst[4 * width - 4]', because we don't know if alpha is the first or the
|
||||||
// last byte of the quadruplet.
|
// last byte of the quadruplet.
|
||||||
const int limit = (width - 1) & ~7;
|
|
||||||
|
|
||||||
for (j = 0; j < height; ++j) {
|
for (j = 0; j < height; ++j) {
|
||||||
__m128i* out = (__m128i*)dst;
|
char* ptr = (char*)dst;
|
||||||
for (i = 0; i < limit; i += 8) {
|
for (i = 0; i + 16 <= width - 1; i += 16) {
|
||||||
|
// load 16 alpha bytes
|
||||||
|
const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
|
||||||
|
const __m128i a1_lo = _mm_unpacklo_epi8(a0, zero);
|
||||||
|
const __m128i a1_hi = _mm_unpackhi_epi8(a0, zero);
|
||||||
|
const __m128i a2_lo_lo = _mm_unpacklo_epi16(a1_lo, zero);
|
||||||
|
const __m128i a2_lo_hi = _mm_unpackhi_epi16(a1_lo, zero);
|
||||||
|
const __m128i a2_hi_lo = _mm_unpacklo_epi16(a1_hi, zero);
|
||||||
|
const __m128i a2_hi_hi = _mm_unpackhi_epi16(a1_hi, zero);
|
||||||
|
_mm_maskmoveu_si128(a2_lo_lo, alpha_mask, ptr + 0);
|
||||||
|
_mm_maskmoveu_si128(a2_lo_hi, alpha_mask, ptr + 16);
|
||||||
|
_mm_maskmoveu_si128(a2_hi_lo, alpha_mask, ptr + 32);
|
||||||
|
_mm_maskmoveu_si128(a2_hi_hi, alpha_mask, ptr + 48);
|
||||||
|
// accumulate 16 alpha 'and' in parallel
|
||||||
|
all_alphas16 = _mm_and_si128(all_alphas16, a0);
|
||||||
|
ptr += 64;
|
||||||
|
}
|
||||||
|
if (i + 8 <= width - 1) {
|
||||||
// load 8 alpha bytes
|
// load 8 alpha bytes
|
||||||
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
|
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
|
||||||
const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
|
const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
|
||||||
const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
|
const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
|
||||||
const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
|
const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
|
||||||
// load 8 dst pixels (32 bytes)
|
_mm_maskmoveu_si128(a2_lo, alpha_mask, ptr);
|
||||||
const __m128i b0_lo = _mm_loadu_si128(out + 0);
|
_mm_maskmoveu_si128(a2_hi, alpha_mask, ptr + 16);
|
||||||
const __m128i b0_hi = _mm_loadu_si128(out + 1);
|
// accumulate 8 alpha 'and' in parallel
|
||||||
// mask dst alpha values
|
all_alphas8 = _mm_and_si128(all_alphas8, a0);
|
||||||
const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
|
i += 8;
|
||||||
const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
|
|
||||||
// combine
|
|
||||||
const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
|
|
||||||
const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
|
|
||||||
// store
|
|
||||||
_mm_storeu_si128(out + 0, b2_lo);
|
|
||||||
_mm_storeu_si128(out + 1, b2_hi);
|
|
||||||
// accumulate eight alpha 'and' in parallel
|
|
||||||
all_alphas = _mm_and_si128(all_alphas, a0);
|
|
||||||
out += 2;
|
|
||||||
}
|
}
|
||||||
for (; i < width; ++i) {
|
for (; i < width; ++i) {
|
||||||
const uint32_t alpha_value = alpha[i];
|
const uint32_t alpha_value = alpha[i];
|
||||||
@ -68,8 +76,9 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
|
|||||||
dst += dst_stride;
|
dst += dst_stride;
|
||||||
}
|
}
|
||||||
// Combine the eight alpha 'and' into a 8-bit mask.
|
// Combine the eight alpha 'and' into a 8-bit mask.
|
||||||
alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
|
alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas8, all_0xff)) & 0xff;
|
||||||
return (alpha_and != 0xff);
|
return (alpha_and != 0xff ||
|
||||||
|
_mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas16, all_0xff)) != 0xffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
|
static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user