diff options
author | robot-piglet <[email protected]> | 2025-07-24 10:07:25 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2025-07-24 10:16:27 +0300 |
commit | 1c295121fa6a70a55c0ed79beb993761eac1fadc (patch) | |
tree | 0a3af4cf839ddc14d3d2829c3b224c1da409d80c /contrib/libs/libwebp/src/dsp/alpha_processing_sse2.c | |
parent | 026ffc40392187f03308f5ae7445365ad4a1ef7f (diff) |
Intermediate changes
commit_hash:9e9c04347de10235f77fcdaf62119e9b89e8bc59
Diffstat (limited to 'contrib/libs/libwebp/src/dsp/alpha_processing_sse2.c')
-rw-r--r-- | contrib/libs/libwebp/src/dsp/alpha_processing_sse2.c | 58 |
1 files changed, 34 insertions, 24 deletions
```diff
diff --git a/contrib/libs/libwebp/src/dsp/alpha_processing_sse2.c b/contrib/libs/libwebp/src/dsp/alpha_processing_sse2.c
index aa0cc2848ae..1a6bfcb917b 100644
--- a/contrib/libs/libwebp/src/dsp/alpha_processing_sse2.c
+++ b/contrib/libs/libwebp/src/dsp/alpha_processing_sse2.c
@@ -16,6 +16,9 @@
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 
+#include "src/webp/types.h"
+#include "src/dsp/cpu.h"
+
 //------------------------------------------------------------------------------
 
 static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
@@ -26,38 +29,44 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
   uint32_t alpha_and = 0xff;
   int i, j;
   const __m128i zero = _mm_setzero_si128();
-  const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00);  // to preserve RGB
-  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0);
-  __m128i all_alphas = all_0xff;
+  const __m128i alpha_mask = _mm_set1_epi32((int)0xff);  // to preserve A
+  const __m128i all_0xff = _mm_set1_epi8((char)0xff);
+  __m128i all_alphas16 = all_0xff;
+  __m128i all_alphas8 = all_0xff;
 
   // We must be able to access 3 extra bytes after the last written byte
   // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
   // last byte of the quadruplet.
-  const int limit = (width - 1) & ~7;
-
   for (j = 0; j < height; ++j) {
-    __m128i* out = (__m128i*)dst;
-    for (i = 0; i < limit; i += 8) {
+    char* ptr = (char*)dst;
+    for (i = 0; i + 16 <= width - 1; i += 16) {
+      // load 16 alpha bytes
+      const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
+      const __m128i a1_lo = _mm_unpacklo_epi8(a0, zero);
+      const __m128i a1_hi = _mm_unpackhi_epi8(a0, zero);
+      const __m128i a2_lo_lo = _mm_unpacklo_epi16(a1_lo, zero);
+      const __m128i a2_lo_hi = _mm_unpackhi_epi16(a1_lo, zero);
+      const __m128i a2_hi_lo = _mm_unpacklo_epi16(a1_hi, zero);
+      const __m128i a2_hi_hi = _mm_unpackhi_epi16(a1_hi, zero);
+      _mm_maskmoveu_si128(a2_lo_lo, alpha_mask, ptr + 0);
+      _mm_maskmoveu_si128(a2_lo_hi, alpha_mask, ptr + 16);
+      _mm_maskmoveu_si128(a2_hi_lo, alpha_mask, ptr + 32);
+      _mm_maskmoveu_si128(a2_hi_hi, alpha_mask, ptr + 48);
+      // accumulate 16 alpha 'and' in parallel
+      all_alphas16 = _mm_and_si128(all_alphas16, a0);
+      ptr += 64;
+    }
+    if (i + 8 <= width - 1) {
       // load 8 alpha bytes
       const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
       const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
       const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
       const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
-      // load 8 dst pixels (32 bytes)
-      const __m128i b0_lo = _mm_loadu_si128(out + 0);
-      const __m128i b0_hi = _mm_loadu_si128(out + 1);
-      // mask dst alpha values
-      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
-      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
-      // combine
-      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
-      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
-      // store
-      _mm_storeu_si128(out + 0, b2_lo);
-      _mm_storeu_si128(out + 1, b2_hi);
-      // accumulate eight alpha 'and' in parallel
-      all_alphas = _mm_and_si128(all_alphas, a0);
-      out += 2;
+      _mm_maskmoveu_si128(a2_lo, alpha_mask, ptr);
+      _mm_maskmoveu_si128(a2_hi, alpha_mask, ptr + 16);
+      // accumulate 8 alpha 'and' in parallel
+      all_alphas8 = _mm_and_si128(all_alphas8, a0);
+      i += 8;
     }
     for (; i < width; ++i) {
       const uint32_t alpha_value = alpha[i];
@@ -68,8 +77,9 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
     dst += dst_stride;
   }
   // Combine the eight alpha 'and' into a 8-bit mask.
-  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
-  return (alpha_and != 0xff);
+  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas8, all_0xff)) & 0xff;
+  return (alpha_and != 0xff ||
+          _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas16, all_0xff)) != 0xffff);
 }
 
 static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
```