diff options
author | robot-piglet <[email protected]> | 2025-06-15 15:44:41 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2025-06-15 15:55:30 +0300 |
commit | ea626d7b15346c0da649291483f80f1ae6e1d7e7 (patch) | |
tree | 24ae3c2aa7f259f3ba95af8450b5bce9a4bdb10d /contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c | |
parent | 726087f32fb38c191ff0c3ef8c6646aa940d987e (diff) |
Intermediate changes
commit_hash:79edafb911368bba0a4d2f7f151a6c8a37c349f3
Diffstat (limited to 'contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c')
-rw-r--r-- | contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c | 197 |
1 file changed, 99 insertions, 98 deletions
diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c index b91c5db478a..435f96ee968 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c @@ -2,6 +2,8 @@ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon) * * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. + * Copyright (C) 2022, Matthieu Darbois. All Rights Reserved. + * Copyright (C) 2022, 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -21,12 +23,11 @@ */ #define JPEG_INTERNALS -#include "jconfigint.h" -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "neon-compat.h" @@ -41,10 +42,10 @@ void jsimd_encode_mcu_AC_first_prepare_neon (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *values, size_t *zerobits) + UJCOEF *values, size_t *zerobits) { - JCOEF *values_ptr = values; - JCOEF *diff_values_ptr = values + DCTSIZE2; + UJCOEF *values_ptr = values; + UJCOEF *diff_values_ptr = values + DCTSIZE2; /* Rows of coefficients to zero (since they haven't been processed) */ int i, rows_to_zero = 8; @@ -68,23 +69,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7); /* Isolate sign of coefficients. 
*/ - int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15); - int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15); + uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)); + uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); /* Compute diff values. */ - int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1); - int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2); + uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1); + uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2); /* Store transformed coefficients and diff values. */ - vst1q_s16(values_ptr, coefs1); - vst1q_s16(values_ptr + DCTSIZE, coefs2); - vst1q_s16(diff_values_ptr, diff1); - vst1q_s16(diff_values_ptr + DCTSIZE, diff2); + vst1q_u16(values_ptr, abs_coefs1); + vst1q_u16(values_ptr + DCTSIZE, abs_coefs2); + vst1q_u16(diff_values_ptr, diff1); + vst1q_u16(diff_values_ptr + DCTSIZE, diff2); values_ptr += 16; diff_values_ptr += 16; jpeg_natural_order_start += 16; @@ -130,23 +131,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon } /* Isolate sign of coefficients. */ - int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15); - int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15); + uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)); + uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)); /* Compute absolute value of coefficients and apply point transform Al. 
*/ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); /* Compute diff values. */ - int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1); - int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2); + uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1); + uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2); /* Store transformed coefficients and diff values. */ - vst1q_s16(values_ptr, coefs1); - vst1q_s16(values_ptr + DCTSIZE, coefs2); - vst1q_s16(diff_values_ptr, diff1); - vst1q_s16(diff_values_ptr + DCTSIZE, diff2); + vst1q_u16(values_ptr, abs_coefs1); + vst1q_u16(values_ptr + DCTSIZE, abs_coefs2); + vst1q_u16(diff_values_ptr, diff1); + vst1q_u16(diff_values_ptr + DCTSIZE, diff2); values_ptr += 16; diff_values_ptr += 16; rows_to_zero -= 2; @@ -184,17 +185,17 @@ void jsimd_encode_mcu_AC_first_prepare_neon } /* Isolate sign of coefficients. */ - int16x8_t sign_coefs = vshrq_n_s16(coefs, 15); + uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs = vabsq_s16(coefs); - coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al)); + uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs)); + abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al)); /* Compute diff values. */ - int16x8_t diff = veorq_s16(coefs, sign_coefs); + uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs); /* Store transformed coefficients and diff values. 
*/ - vst1q_s16(values_ptr, coefs); - vst1q_s16(diff_values_ptr, diff); + vst1q_u16(values_ptr, abs_coefs); + vst1q_u16(diff_values_ptr, diff); values_ptr += 8; diff_values_ptr += 8; rows_to_zero--; @@ -202,8 +203,8 @@ void jsimd_encode_mcu_AC_first_prepare_neon /* Zero remaining memory in the values and diff_values blocks. */ for (i = 0; i < rows_to_zero; i++) { - vst1q_s16(values_ptr, vdupq_n_s16(0)); - vst1q_s16(diff_values_ptr, vdupq_n_s16(0)); + vst1q_u16(values_ptr, vdupq_n_u16(0)); + vst1q_u16(diff_values_ptr, vdupq_n_u16(0)); values_ptr += 8; diff_values_ptr += 8; } @@ -211,23 +212,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon /* Construct zerobits bitmap. A set bit means that the corresponding * coefficient != 0. */ - int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE); - int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE); - int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE); - int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE); - int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE); - int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE); - int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE); - int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE); - - uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0))); - uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0))); - uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0))); - uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0))); - uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0))); - uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0))); - uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0))); - uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0))); + uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE); + uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE); + uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE); + uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE); + uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE); + uint16x8_t row5 = vld1q_u16(values + 5 * 
DCTSIZE); + uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE); + uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE); + + uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0))); + uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0))); + uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0))); + uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0))); + uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0))); + uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0))); + uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0))); + uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0))); /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */ const uint8x8_t bitmap_mask = @@ -274,7 +275,7 @@ void jsimd_encode_mcu_AC_first_prepare_neon int jsimd_encode_mcu_AC_refine_prepare_neon (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *absvalues, size_t *bits) + UJCOEF *absvalues, size_t *bits) { /* Temporary storage buffers for data used to compute the signbits bitmap and * the end-of-block (EOB) position @@ -282,7 +283,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon uint8_t coef_sign_bits[64]; uint8_t coef_eq1_bits[64]; - JCOEF *absvalues_ptr = absvalues; + UJCOEF *absvalues_ptr = absvalues; uint8_t *coef_sign_bits_ptr = coef_sign_bits; uint8_t *eq1_bits_ptr = coef_eq1_bits; @@ -316,18 +317,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2); /* Compute absolute value of coefficients and apply point transform Al. 
*/ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); - vst1q_s16(absvalues_ptr, coefs1); - vst1q_s16(absvalues_ptr + DCTSIZE, coefs2); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); + vst1q_u16(absvalues_ptr, abs_coefs1); + vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2); /* Test whether transformed coefficient values == 1 (used to find EOB * position.) */ - uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1))); - uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1))); + uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1))); + uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1))); vst1_u8(eq1_bits_ptr, coefs_eq11); vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12); @@ -385,18 +386,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); - vst1q_s16(absvalues_ptr, coefs1); - vst1q_s16(absvalues_ptr + DCTSIZE, coefs2); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); + vst1q_u16(absvalues_ptr, abs_coefs1); + vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2); /* Test whether transformed coefficient values == 1 (used to find EOB * position.) 
*/ - uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1))); - uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1))); + uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1))); + uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1))); vst1_u8(eq1_bits_ptr, coefs_eq11); vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12); @@ -444,14 +445,14 @@ int jsimd_encode_mcu_AC_refine_prepare_neon vst1_u8(coef_sign_bits_ptr, sign_coefs); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs = vabsq_s16(coefs); - coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al)); - vst1q_s16(absvalues_ptr, coefs); + uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs)); + abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al)); + vst1q_u16(absvalues_ptr, abs_coefs); /* Test whether transformed coefficient values == 1 (used to find EOB * position.) */ - uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1))); + uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1))); vst1_u8(eq1_bits_ptr, coefs_eq1); absvalues_ptr += 8; @@ -462,7 +463,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon /* Zero remaining memory in blocks. */ for (i = 0; i < rows_to_zero; i++) { - vst1q_s16(absvalues_ptr, vdupq_n_s16(0)); + vst1q_u16(absvalues_ptr, vdupq_n_u16(0)); vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0)); vst1_u8(eq1_bits_ptr, vdup_n_u8(0)); absvalues_ptr += 8; @@ -471,23 +472,23 @@ int jsimd_encode_mcu_AC_refine_prepare_neon } /* Construct zerobits bitmap. 
*/ - int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE); - int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE); - int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE); - int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE); - int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE); - int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE); - int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE); - int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE); - - uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0))); - uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0))); - uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0))); - uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0))); - uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0))); - uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0))); - uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0))); - uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0))); + uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE); + uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE); + uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE); + uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE); + uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE); + uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE); + uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE); + uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE); + + uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0))); + uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0))); + uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0))); + uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0))); + uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0))); + uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, 
vdupq_n_u16(0))); + uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0))); + uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0))); /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */ const uint8x8_t bitmap_mask = |