Compare commits

...

2 Commits

Author SHA1 Message Date
Vincent Rabaud
7c70ff7a3b Clean dsp/lossless includes
Change-Id: I47a405a9c402095b440404fe57ac08b5293ea71b
2025-03-25 12:38:00 +01:00
Vincent Rabaud
9dd5ae819b Use the full register in PredictorSub13_SSE2
No more than 15 registers are used at a time

Change-Id: I40f77d9df8500e5e0d52ff6b206d765e8be62ae1
2025-03-25 11:07:15 +01:00
6 changed files with 71 additions and 32 deletions

View File

@@ -13,15 +13,21 @@
// Jyrki Alakuijala (jyrki@google.com) // Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com) // Urvang Joshi (urvang@google.com)
#include "src/dsp/dsp.h" #include "src/dsp/lossless.h"
#include <assert.h> #include <assert.h>
#include <math.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
#include "src/dec/vp8li_dec.h" #include "src/dec/vp8li_dec.h"
#include "src/utils/endian_inl_utils.h" #include "src/dsp/cpu.h"
#include "src/dsp/lossless.h" #include "src/dsp/dsp.h"
#include "src/dsp/lossless_common.h" #include "src/dsp/lossless_common.h"
#include "src/utils/endian_inl_utils.h"
#include "src/utils/utils.h"
#include "src/webp/decode.h"
#include "src/webp/format_constants.h"
#include "src/webp/types.h"
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Image transforms. // Image transforms.

View File

@@ -13,16 +13,19 @@
// Jyrki Alakuijala (jyrki@google.com) // Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com) // Urvang Joshi (urvang@google.com)
#include "src/dsp/dsp.h"
#include <assert.h> #include <assert.h>
#include <math.h> #include <math.h>
#include <stdlib.h> #include <stdlib.h>
#include "src/dec/vp8li_dec.h" #include <string.h>
#include "src/utils/endian_inl_utils.h"
#include "src/dsp/cpu.h"
#include "src/dsp/dsp.h"
#include "src/dsp/lossless.h" #include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h" #include "src/dsp/lossless_common.h"
#include "src/dsp/yuv.h" #include "src/enc/histogram_enc.h"
#include "src/utils/utils.h"
#include "src/webp/format_constants.h"
#include "src/webp/types.h"
// lookup table for small values of log2(int) * (1 << LOG_2_PRECISION_BITS). // lookup table for small values of log2(int) * (1 << LOG_2_PRECISION_BITS).
// Obtained in Python with: // Obtained in Python with:

View File

@@ -14,11 +14,15 @@
#include "src/dsp/dsp.h" #include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2) #if defined(WEBP_USE_SSE2)
#include <assert.h> #include <assert.h>
#include <emmintrin.h> #include <emmintrin.h>
#include "src/dsp/cpu.h"
#include "src/dsp/lossless.h" #include "src/dsp/lossless.h"
#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless_common.h" #include "src/dsp/lossless_common.h"
#include "src/utils/utils.h"
#include "src/webp/types.h"
// For sign-extended multiplying constants, pre-shifted by 5: // For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) #define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
@@ -645,25 +649,43 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* WEBP_RESTRICT out) { int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i; int i;
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 2 <= num_pixels; i += 2) { for (i = 0; i + 4 <= num_pixels; i += 4) {
// we can only process two pixels at a time const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]); const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]); const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]); const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]); __m128i A4_lo, A4_hi;
const __m128i L_lo = _mm_unpacklo_epi8(L, zero); // lo.
const __m128i T_lo = _mm_unpacklo_epi8(T, zero); {
const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
const __m128i sum = _mm_add_epi16(T_lo, L_lo); const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
const __m128i avg = _mm_srli_epi16(sum, 1); const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
const __m128i A1 = _mm_sub_epi16(avg, TL_lo); const __m128i sum_lo = _mm_add_epi16(T_lo, L_lo);
const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg); const __m128i avg_lo = _mm_srli_epi16(sum_lo, 1);
const __m128i A2 = _mm_sub_epi16(A1, bit_fix); const __m128i A1_lo = _mm_sub_epi16(avg_lo, TL_lo);
const __m128i A3 = _mm_srai_epi16(A2, 1); const __m128i bit_fix_lo = _mm_cmpgt_epi16(TL_lo, avg_lo);
const __m128i A4 = _mm_add_epi16(avg, A3); const __m128i A2_lo = _mm_sub_epi16(A1_lo, bit_fix_lo);
const __m128i pred = _mm_packus_epi16(A4, A4); const __m128i A3_lo = _mm_srai_epi16(A2_lo, 1);
const __m128i res = _mm_sub_epi8(src, pred); A4_lo = _mm_add_epi16(avg_lo, A3_lo);
_mm_storel_epi64((__m128i*)&out[i], res); }
// hi.
{
const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
const __m128i sum_hi = _mm_add_epi16(T_hi, L_hi);
const __m128i avg_hi = _mm_srli_epi16(sum_hi, 1);
const __m128i A1_hi = _mm_sub_epi16(avg_hi, TL_hi);
const __m128i bit_fix_hi = _mm_cmpgt_epi16(TL_hi, avg_hi);
const __m128i A2_hi = _mm_sub_epi16(A1_hi, bit_fix_hi);
const __m128i A3_hi = _mm_srai_epi16(A2_hi, 1);
A4_hi = _mm_add_epi16(avg_hi, A3_hi);
}
{
const __m128i pred = _mm_packus_epi16(A4_lo, A4_hi);
const __m128i res = _mm_sub_epi8(src, pred);
_mm_storeu_si128((__m128i*)&out[i], res);
}
} }
if (i != num_pixels) { if (i != num_pixels) {
VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i); VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);

View File

@@ -14,9 +14,13 @@
#include "src/dsp/dsp.h" #include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41) #if defined(WEBP_USE_SSE41)
#include <assert.h> #include <assert.h>
#include <smmintrin.h> #include <smmintrin.h>
#include "src/dsp/cpu.h"
#include "src/dsp/lossless.h" #include "src/dsp/lossless.h"
#include "src/webp/types.h"
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Cost operations. // Cost operations.

View File

@@ -15,10 +15,13 @@
#if defined(WEBP_USE_SSE2) #if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
#include "src/dsp/common_sse2.h" #include "src/dsp/common_sse2.h"
#include "src/dsp/cpu.h"
#include "src/dsp/lossless.h" #include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h" #include "src/dsp/lossless_common.h"
#include <emmintrin.h> #include "src/webp/types.h"
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Predictor Transform // Predictor Transform

View File

@@ -13,9 +13,10 @@
#if defined(WEBP_USE_SSE41) #if defined(WEBP_USE_SSE41)
#include "src/dsp/common_sse41.h" #include <smmintrin.h>
#include "src/dsp/cpu.h"
#include "src/dsp/lossless.h" #include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Color-space conversion functions // Color-space conversion functions