diff options
author | robot-piglet <[email protected]> | 2025-07-24 10:07:25 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2025-07-24 10:16:27 +0300 |
commit | 1c295121fa6a70a55c0ed79beb993761eac1fadc (patch) | |
tree | 0a3af4cf839ddc14d3d2829c3b224c1da409d80c /contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c | |
parent | 026ffc40392187f03308f5ae7445365ad4a1ef7f (diff) |
Intermediate changes
commit_hash:9e9c04347de10235f77fcdaf62119e9b89e8bc59
Diffstat (limited to 'contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c')
-rw-r--r-- | contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c | 81 |
1 files changed, 57 insertions, 24 deletions
diff --git a/contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c b/contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c index b2fa6480f23..88276f4f6bc 100644 --- a/contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c +++ b/contrib/libs/libwebp/src/dsp/lossless_enc_sse2.c @@ -14,11 +14,17 @@ #include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE2) -#include <assert.h> #include <emmintrin.h> + +#include <assert.h> +#include <string.h> + +#include "src/dsp/cpu.h" #include "src/dsp/lossless.h" -#include "src/dsp/common_sse2.h" #include "src/dsp/lossless_common.h" +#include "src/utils/utils.h" +#include "src/webp/format_constants.h" +#include "src/webp/types.h" // For sign-extended multiplying constants, pre-shifted by 5: #define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) @@ -52,9 +58,9 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data, static void TransformColor_SSE2(const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT argb_data, int num_pixels) { - const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_), - CST_5b(m->green_to_blue_)); - const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0); + const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red), + CST_5b(m->green_to_blue)); + const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue), 0); const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks int i; @@ -645,25 +651,43 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* WEBP_RESTRICT out) { int i; const __m128i zero = _mm_setzero_si128(); - for (i = 0; i + 2 <= num_pixels; i += 2) { - // we can only process two pixels at a time - const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]); - const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]); - const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]); - const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]); - const __m128i L_lo = _mm_unpacklo_epi8(L, zero); - const __m128i T_lo = _mm_unpacklo_epi8(T, zero); - const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); - const __m128i sum = _mm_add_epi16(T_lo, L_lo); - const __m128i avg = _mm_srli_epi16(sum, 1); - const __m128i A1 = _mm_sub_epi16(avg, TL_lo); - const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg); - const __m128i A2 = _mm_sub_epi16(A1, bit_fix); - const __m128i A3 = _mm_srai_epi16(A2, 1); - const __m128i A4 = _mm_add_epi16(avg, A3); - const __m128i pred = _mm_packus_epi16(A4, A4); - const __m128i res = _mm_sub_epi8(src, pred); - _mm_storel_epi64((__m128i*)&out[i], res); + for (i = 0; i + 4 <= num_pixels; i += 4) { + const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); + const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); + const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); + const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); + __m128i A4_lo, A4_hi; + // lo. + { + const __m128i L_lo = _mm_unpacklo_epi8(L, zero); + const __m128i T_lo = _mm_unpacklo_epi8(T, zero); + const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); + const __m128i sum_lo = _mm_add_epi16(T_lo, L_lo); + const __m128i avg_lo = _mm_srli_epi16(sum_lo, 1); + const __m128i A1_lo = _mm_sub_epi16(avg_lo, TL_lo); + const __m128i bit_fix_lo = _mm_cmpgt_epi16(TL_lo, avg_lo); + const __m128i A2_lo = _mm_sub_epi16(A1_lo, bit_fix_lo); + const __m128i A3_lo = _mm_srai_epi16(A2_lo, 1); + A4_lo = _mm_add_epi16(avg_lo, A3_lo); + } + // hi. + { + const __m128i L_hi = _mm_unpackhi_epi8(L, zero); + const __m128i T_hi = _mm_unpackhi_epi8(T, zero); + const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero); + const __m128i sum_hi = _mm_add_epi16(T_hi, L_hi); + const __m128i avg_hi = _mm_srli_epi16(sum_hi, 1); + const __m128i A1_hi = _mm_sub_epi16(avg_hi, TL_hi); + const __m128i bit_fix_hi = _mm_cmpgt_epi16(TL_hi, avg_hi); + const __m128i A2_hi = _mm_sub_epi16(A1_hi, bit_fix_hi); + const __m128i A3_hi = _mm_srai_epi16(A2_hi, 1); + A4_hi = _mm_add_epi16(avg_hi, A3_hi); + } + { + const __m128i pred = _mm_packus_epi16(A4_lo, A4_hi); + const __m128i res = _mm_sub_epi8(src, pred); + _mm_storeu_si128((__m128i*)&out[i], res); + } } if (i != num_pixels) { VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i); @@ -704,6 +728,15 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { VP8LPredictorsSub[13] = PredictorSub13_SSE2; VP8LPredictorsSub[14] = PredictorSub0_SSE2; // <- padding security sentinels VP8LPredictorsSub[15] = PredictorSub0_SSE2; + + // SSE exports for AVX and above. + VP8LSubtractGreenFromBlueAndRed_SSE = SubtractGreenFromBlueAndRed_SSE2; + VP8LTransformColor_SSE = TransformColor_SSE2; + VP8LCollectColorBlueTransforms_SSE = CollectColorBlueTransforms_SSE2; + VP8LCollectColorRedTransforms_SSE = CollectColorRedTransforms_SSE2; + VP8LBundleColorMap_SSE = BundleColorMap_SSE2; + + memcpy(VP8LPredictorsSub_SSE, VP8LPredictorsSub, sizeof(VP8LPredictorsSub)); } #else // !WEBP_USE_SSE2 |