summaryrefslogtreecommitdiffstats
path: root/contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c
diff options
context:
space:
mode:
authorrobot-piglet <[email protected]>2025-07-24 10:07:25 +0300
committerrobot-piglet <[email protected]>2025-07-24 10:16:27 +0300
commit1c295121fa6a70a55c0ed79beb993761eac1fadc (patch)
tree0a3af4cf839ddc14d3d2829c3b224c1da409d80c /contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c
parent026ffc40392187f03308f5ae7445365ad4a1ef7f (diff)
Intermediate changes
commit_hash:9e9c04347de10235f77fcdaf62119e9b89e8bc59
Diffstat (limited to 'contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c')
-rw-r--r--contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c81
1 files changed, 57 insertions, 24 deletions
diff --git a/contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c b/contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c
index b2fa6480f23..88276f4f6bc 100644
--- a/contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c
+++ b/contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c
@@ -14,11 +14,17 @@
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
-#include <assert.h>
#include <emmintrin.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include "src/dsp/cpu.h"
#include "src/dsp/lossless.h"
-#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless_common.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
+#include "src/webp/types.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
@@ -52,9 +58,9 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
static void TransformColor_SSE2(const VP8LMultipliers* WEBP_RESTRICT const m,
uint32_t* WEBP_RESTRICT argb_data,
int num_pixels) {
- const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
- CST_5b(m->green_to_blue_));
- const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
+ const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red),
+ CST_5b(m->green_to_blue));
+ const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue), 0);
const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks
const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks
int i;
@@ -645,25 +651,43 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* WEBP_RESTRICT out) {
int i;
const __m128i zero = _mm_setzero_si128();
- for (i = 0; i + 2 <= num_pixels; i += 2) {
- // we can only process two pixels at a time
- const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
- const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
- const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
- const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
- const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
- const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
- const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
- const __m128i sum = _mm_add_epi16(T_lo, L_lo);
- const __m128i avg = _mm_srli_epi16(sum, 1);
- const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
- const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
- const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
- const __m128i A3 = _mm_srai_epi16(A2, 1);
- const __m128i A4 = _mm_add_epi16(avg, A3);
- const __m128i pred = _mm_packus_epi16(A4, A4);
- const __m128i res = _mm_sub_epi8(src, pred);
- _mm_storel_epi64((__m128i*)&out[i], res);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ __m128i A4_lo, A4_hi;
+ // lo.
+ {
+ const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
+ const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+ const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+ const __m128i sum_lo = _mm_add_epi16(T_lo, L_lo);
+ const __m128i avg_lo = _mm_srli_epi16(sum_lo, 1);
+ const __m128i A1_lo = _mm_sub_epi16(avg_lo, TL_lo);
+ const __m128i bit_fix_lo = _mm_cmpgt_epi16(TL_lo, avg_lo);
+ const __m128i A2_lo = _mm_sub_epi16(A1_lo, bit_fix_lo);
+ const __m128i A3_lo = _mm_srai_epi16(A2_lo, 1);
+ A4_lo = _mm_add_epi16(avg_lo, A3_lo);
+ }
+ // hi.
+ {
+ const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
+ const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
+ const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
+ const __m128i sum_hi = _mm_add_epi16(T_hi, L_hi);
+ const __m128i avg_hi = _mm_srli_epi16(sum_hi, 1);
+ const __m128i A1_hi = _mm_sub_epi16(avg_hi, TL_hi);
+ const __m128i bit_fix_hi = _mm_cmpgt_epi16(TL_hi, avg_hi);
+ const __m128i A2_hi = _mm_sub_epi16(A1_hi, bit_fix_hi);
+ const __m128i A3_hi = _mm_srai_epi16(A2_hi, 1);
+ A4_hi = _mm_add_epi16(avg_hi, A3_hi);
+ }
+ {
+ const __m128i pred = _mm_packus_epi16(A4_lo, A4_hi);
+ const __m128i res = _mm_sub_epi8(src, pred);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
}
if (i != num_pixels) {
VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
@@ -704,6 +728,15 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
VP8LPredictorsSub[13] = PredictorSub13_SSE2;
VP8LPredictorsSub[14] = PredictorSub0_SSE2; // <- padding security sentinels
VP8LPredictorsSub[15] = PredictorSub0_SSE2;
+
+ // SSE exports for AVX and above.
+ VP8LSubtractGreenFromBlueAndRed_SSE = SubtractGreenFromBlueAndRed_SSE2;
+ VP8LTransformColor_SSE = TransformColor_SSE2;
+ VP8LCollectColorBlueTransforms_SSE = CollectColorBlueTransforms_SSE2;
+ VP8LCollectColorRedTransforms_SSE = CollectColorRedTransforms_SSE2;
+ VP8LBundleColorMap_SSE = BundleColorMap_SSE2;
+
+ memcpy(VP8LPredictorsSub_SSE, VP8LPredictorsSub, sizeof(VP8LPredictorsSub));
}
#else // !WEBP_USE_SSE2