diff options
author | robot-piglet <[email protected]> | 2025-06-15 15:44:41 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2025-06-15 15:55:30 +0300 |
commit | ea626d7b15346c0da649291483f80f1ae6e1d7e7 (patch) | |
tree | 24ae3c2aa7f259f3ba95af8450b5bce9a4bdb10d /contrib/libs/libjpeg-turbo/simd | |
parent | 726087f32fb38c191ff0c3ef8c6646aa940d987e (diff) |
Intermediate changes
commit_hash:79edafb911368bba0a4d2f7f151a6c8a37c349f3
Diffstat (limited to 'contrib/libs/libjpeg-turbo/simd')
114 files changed, 1417 insertions, 1602 deletions
diff --git a/contrib/libs/libjpeg-turbo/simd/arm/aarch32/jchuff-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/aarch32/jchuff-neon.c index 19d94f720da..153da1f1c11 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/aarch32/jchuff-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/aarch32/jchuff-neon.c @@ -2,6 +2,7 @@ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -24,11 +25,11 @@ */ #define JPEG_INTERNALS -#include "../../../jinclude.h" -#include "../../../jpeglib.h" -#include "../../../jsimd.h" -#include "../../../jdct.h" -#include "../../../jsimddct.h" +#include "../../../src/jinclude.h" +#include "../../../src/jpeglib.h" +#include "../../../src/jsimd.h" +#include "../../../src/jdct.h" +#include "../../../src/jsimddct.h" #include "../../jsimd.h" #include "../jchuff.h" #include "neon-compat.h" diff --git a/contrib/libs/libjpeg-turbo/simd/arm/aarch32/jsimd.c b/contrib/libs/libjpeg-turbo/simd/arm/aarch32/jsimd.c index 920f7656ebf..7c8ea306bd1 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/aarch32/jsimd.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/aarch32/jsimd.c @@ -3,8 +3,8 @@ * * Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). - * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, 2024, D. R. Commander. + * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois. * Copyright (C) 2019, Google LLC. * Copyright (C) 2020, Arm Limited. 
* @@ -18,17 +18,17 @@ */ #define JPEG_INTERNALS -#include "../../../jinclude.h" -#include "../../../jpeglib.h" -#include "../../../jsimd.h" -#include "../../../jdct.h" -#include "../../../jsimddct.h" +#include "../../../src/jinclude.h" +#include "../../../src/jpeglib.h" +#include "../../../src/jsimd.h" +#include "../../../src/jdct.h" +#include "../../../src/jsimddct.h" #include "../../jsimd.h" #include <ctype.h> -static unsigned int simd_support = ~0; -static unsigned int simd_huffman = 1; +static THREAD_LOCAL unsigned int simd_support = ~0; +static THREAD_LOCAL unsigned int simd_huffman = 1; #if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) @@ -96,8 +96,6 @@ parse_proc_cpuinfo(int bufsize) /* * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. */ LOCAL(void) init_simd(void) @@ -945,7 +943,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void) GLOBAL(void) jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) + int Al, UJCOEF *values, size_t *zerobits) { jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start, Sl, Al, values, zerobits); @@ -970,7 +968,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void) GLOBAL(int) jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) + int Al, UJCOEF *absvalues, size_t *bits) { return jsimd_encode_mcu_AC_refine_prepare_neon(block, jpeg_natural_order_start, Sl, diff --git a/contrib/libs/libjpeg-turbo/simd/arm/aarch64/jchuff-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/aarch64/jchuff-neon.c index 607a116070c..11bf6dab130 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/aarch64/jchuff-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/aarch64/jchuff-neon.c @@ -2,7 +2,7 @@ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon) * * Copyright (C) 
2020-2021, Arm Limited. All Rights Reserved. - * Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved. + * Copyright (C) 2020, 2022, 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -25,11 +25,11 @@ */ #define JPEG_INTERNALS -#include "../../../jinclude.h" -#include "../../../jpeglib.h" -#include "../../../jsimd.h" -#include "../../../jdct.h" -#include "../../../jsimddct.h" +#include "../../../src/jinclude.h" +#include "../../../src/jpeglib.h" +#include "../../../src/jsimd.h" +#include "../../../src/jdct.h" +#include "../../../src/jsimddct.h" #include "../../jsimd.h" #include "../align.h" #include "../jchuff.h" diff --git a/contrib/libs/libjpeg-turbo/simd/arm/aarch64/jsimd.c b/contrib/libs/libjpeg-turbo/simd/arm/aarch64/jsimd.c index 41c06d31801..8a6f30a1a89 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/aarch64/jsimd.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/aarch64/jsimd.c @@ -3,8 +3,9 @@ * * Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). - * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, 2024, + * D. R. Commander. + * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois. * Copyright (C) 2020, Arm Limited. 
* * Based on the x86 SIMD extension for IJG JPEG library, @@ -17,13 +18,12 @@ */ #define JPEG_INTERNALS -#include "../../../jinclude.h" -#include "../../../jpeglib.h" -#include "../../../jsimd.h" -#include "../../../jdct.h" -#include "../../../jsimddct.h" +#include "../../../src/jinclude.h" +#include "../../../src/jpeglib.h" +#include "../../../src/jsimd.h" +#include "../../../src/jdct.h" +#include "../../../src/jsimddct.h" #include "../../jsimd.h" -#include "jconfigint.h" #include <ctype.h> @@ -31,10 +31,10 @@ #define JSIMD_FASTST3 2 #define JSIMD_FASTTBL 4 -static unsigned int simd_support = ~0; -static unsigned int simd_huffman = 1; -static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 | - JSIMD_FASTTBL; +static THREAD_LOCAL unsigned int simd_support = ~0; +static THREAD_LOCAL unsigned int simd_huffman = 1; +static THREAD_LOCAL unsigned int simd_features = JSIMD_FASTLD3 | + JSIMD_FASTST3 | JSIMD_FASTTBL; #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) @@ -109,8 +109,6 @@ parse_proc_cpuinfo(int bufsize) /* * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. 
*/ /* @@ -1021,7 +1019,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void) GLOBAL(void) jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) + int Al, UJCOEF *values, size_t *zerobits) { jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start, Sl, Al, values, zerobits); @@ -1048,7 +1046,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void) GLOBAL(int) jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) + int Al, UJCOEF *absvalues, size_t *bits) { return jsimd_encode_mcu_AC_refine_prepare_neon(block, jpeg_natural_order_start, diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jccolor-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jccolor-neon.c index 9fcc62dd25c..d14a7bf5018 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jccolor-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jccolor-neon.c @@ -2,7 +2,7 @@ * jccolor-neon.c - colorspace conversion (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. - * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2020, 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. 
In no event will the authors be held liable for any damages @@ -22,11 +22,11 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "align.h" #include "neon-compat.h" diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jcgray-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jcgray-neon.c index 71c7b2de218..fbcf8214057 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jcgray-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jcgray-neon.c @@ -2,6 +2,7 @@ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -21,13 +22,14 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "align.h" +#include "neon-compat.h" #include <arm_neon.h> diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c index b91c5db478a..435f96ee968 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jcphuff-neon.c @@ -2,6 +2,8 @@ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon) * * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. + * Copyright (C) 2022, Matthieu Darbois. All Rights Reserved. 
+ * Copyright (C) 2022, 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -21,12 +23,11 @@ */ #define JPEG_INTERNALS -#include "jconfigint.h" -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "neon-compat.h" @@ -41,10 +42,10 @@ void jsimd_encode_mcu_AC_first_prepare_neon (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *values, size_t *zerobits) + UJCOEF *values, size_t *zerobits) { - JCOEF *values_ptr = values; - JCOEF *diff_values_ptr = values + DCTSIZE2; + UJCOEF *values_ptr = values; + UJCOEF *diff_values_ptr = values + DCTSIZE2; /* Rows of coefficients to zero (since they haven't been processed) */ int i, rows_to_zero = 8; @@ -68,23 +69,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7); /* Isolate sign of coefficients. */ - int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15); - int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15); + uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)); + uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)); /* Compute absolute value of coefficients and apply point transform Al. 
*/ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); /* Compute diff values. */ - int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1); - int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2); + uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1); + uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2); /* Store transformed coefficients and diff values. */ - vst1q_s16(values_ptr, coefs1); - vst1q_s16(values_ptr + DCTSIZE, coefs2); - vst1q_s16(diff_values_ptr, diff1); - vst1q_s16(diff_values_ptr + DCTSIZE, diff2); + vst1q_u16(values_ptr, abs_coefs1); + vst1q_u16(values_ptr + DCTSIZE, abs_coefs2); + vst1q_u16(diff_values_ptr, diff1); + vst1q_u16(diff_values_ptr + DCTSIZE, diff2); values_ptr += 16; diff_values_ptr += 16; jpeg_natural_order_start += 16; @@ -130,23 +131,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon } /* Isolate sign of coefficients. */ - int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15); - int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15); + uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)); + uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)); /* Compute absolute value of coefficients and apply point transform Al. 
*/ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); /* Compute diff values. */ - int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1); - int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2); + uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1); + uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2); /* Store transformed coefficients and diff values. */ - vst1q_s16(values_ptr, coefs1); - vst1q_s16(values_ptr + DCTSIZE, coefs2); - vst1q_s16(diff_values_ptr, diff1); - vst1q_s16(diff_values_ptr + DCTSIZE, diff2); + vst1q_u16(values_ptr, abs_coefs1); + vst1q_u16(values_ptr + DCTSIZE, abs_coefs2); + vst1q_u16(diff_values_ptr, diff1); + vst1q_u16(diff_values_ptr + DCTSIZE, diff2); values_ptr += 16; diff_values_ptr += 16; rows_to_zero -= 2; @@ -184,17 +185,17 @@ void jsimd_encode_mcu_AC_first_prepare_neon } /* Isolate sign of coefficients. */ - int16x8_t sign_coefs = vshrq_n_s16(coefs, 15); + uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs = vabsq_s16(coefs); - coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al)); + uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs)); + abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al)); /* Compute diff values. */ - int16x8_t diff = veorq_s16(coefs, sign_coefs); + uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs); /* Store transformed coefficients and diff values. 
*/ - vst1q_s16(values_ptr, coefs); - vst1q_s16(diff_values_ptr, diff); + vst1q_u16(values_ptr, abs_coefs); + vst1q_u16(diff_values_ptr, diff); values_ptr += 8; diff_values_ptr += 8; rows_to_zero--; @@ -202,8 +203,8 @@ void jsimd_encode_mcu_AC_first_prepare_neon /* Zero remaining memory in the values and diff_values blocks. */ for (i = 0; i < rows_to_zero; i++) { - vst1q_s16(values_ptr, vdupq_n_s16(0)); - vst1q_s16(diff_values_ptr, vdupq_n_s16(0)); + vst1q_u16(values_ptr, vdupq_n_u16(0)); + vst1q_u16(diff_values_ptr, vdupq_n_u16(0)); values_ptr += 8; diff_values_ptr += 8; } @@ -211,23 +212,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon /* Construct zerobits bitmap. A set bit means that the corresponding * coefficient != 0. */ - int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE); - int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE); - int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE); - int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE); - int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE); - int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE); - int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE); - int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE); - - uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0))); - uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0))); - uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0))); - uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0))); - uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0))); - uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0))); - uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0))); - uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0))); + uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE); + uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE); + uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE); + uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE); + uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE); + uint16x8_t row5 = vld1q_u16(values + 5 * 
DCTSIZE); + uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE); + uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE); + + uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0))); + uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0))); + uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0))); + uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0))); + uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0))); + uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0))); + uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0))); + uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0))); /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */ const uint8x8_t bitmap_mask = @@ -274,7 +275,7 @@ void jsimd_encode_mcu_AC_first_prepare_neon int jsimd_encode_mcu_AC_refine_prepare_neon (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *absvalues, size_t *bits) + UJCOEF *absvalues, size_t *bits) { /* Temporary storage buffers for data used to compute the signbits bitmap and * the end-of-block (EOB) position @@ -282,7 +283,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon uint8_t coef_sign_bits[64]; uint8_t coef_eq1_bits[64]; - JCOEF *absvalues_ptr = absvalues; + UJCOEF *absvalues_ptr = absvalues; uint8_t *coef_sign_bits_ptr = coef_sign_bits; uint8_t *eq1_bits_ptr = coef_eq1_bits; @@ -316,18 +317,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2); /* Compute absolute value of coefficients and apply point transform Al. 
*/ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); - vst1q_s16(absvalues_ptr, coefs1); - vst1q_s16(absvalues_ptr + DCTSIZE, coefs2); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); + vst1q_u16(absvalues_ptr, abs_coefs1); + vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2); /* Test whether transformed coefficient values == 1 (used to find EOB * position.) */ - uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1))); - uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1))); + uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1))); + uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1))); vst1_u8(eq1_bits_ptr, coefs_eq11); vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12); @@ -385,18 +386,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); - vst1q_s16(absvalues_ptr, coefs1); - vst1q_s16(absvalues_ptr + DCTSIZE, coefs2); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); + vst1q_u16(absvalues_ptr, abs_coefs1); + vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2); /* Test whether transformed coefficient values == 1 (used to find EOB * position.) 
*/ - uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1))); - uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1))); + uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1))); + uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1))); vst1_u8(eq1_bits_ptr, coefs_eq11); vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12); @@ -444,14 +445,14 @@ int jsimd_encode_mcu_AC_refine_prepare_neon vst1_u8(coef_sign_bits_ptr, sign_coefs); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs = vabsq_s16(coefs); - coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al)); - vst1q_s16(absvalues_ptr, coefs); + uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs)); + abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al)); + vst1q_u16(absvalues_ptr, abs_coefs); /* Test whether transformed coefficient values == 1 (used to find EOB * position.) */ - uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1))); + uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1))); vst1_u8(eq1_bits_ptr, coefs_eq1); absvalues_ptr += 8; @@ -462,7 +463,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon /* Zero remaining memory in blocks. */ for (i = 0; i < rows_to_zero; i++) { - vst1q_s16(absvalues_ptr, vdupq_n_s16(0)); + vst1q_u16(absvalues_ptr, vdupq_n_u16(0)); vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0)); vst1_u8(eq1_bits_ptr, vdup_n_u8(0)); absvalues_ptr += 8; @@ -471,23 +472,23 @@ int jsimd_encode_mcu_AC_refine_prepare_neon } /* Construct zerobits bitmap. 
*/ - int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE); - int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE); - int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE); - int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE); - int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE); - int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE); - int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE); - int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE); - - uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0))); - uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0))); - uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0))); - uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0))); - uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0))); - uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0))); - uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0))); - uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0))); + uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE); + uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE); + uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE); + uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE); + uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE); + uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE); + uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE); + uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE); + + uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0))); + uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0))); + uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0))); + uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0))); + uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0))); + uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, 
vdupq_n_u16(0))); + uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0))); + uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0))); /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */ const uint8x8_t bitmap_mask = diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jcsample-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jcsample-neon.c index 8a3e237838e..fd8a93e520b 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jcsample-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jcsample-neon.c @@ -2,6 +2,7 @@ * jcsample-neon.c - downsampling (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -21,13 +22,14 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "align.h" +#include "neon-compat.h" #include <arm_neon.h> diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jdcolor-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jdcolor-neon.c index ea4668f1d30..97bb02a1ed7 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jdcolor-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jdcolor-neon.c @@ -2,6 +2,7 @@ * jdcolor-neon.c - colorspace conversion (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. 
In no event will the authors be held liable for any damages @@ -21,14 +22,14 @@ */ #define JPEG_INTERNALS -#include "jconfigint.h" -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "align.h" +#include "neon-compat.h" #include <arm_neon.h> diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jdmerge-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jdmerge-neon.c index e4f91fdc0ef..95e6d32830c 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jdmerge-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jdmerge-neon.c @@ -2,6 +2,7 @@ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -21,14 +22,14 @@ */ #define JPEG_INTERNALS -#include "jconfigint.h" -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "align.h" +#include "neon-compat.h" #include <arm_neon.h> diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jdsample-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jdsample-neon.c index 90ec6782c47..a130b1a9581 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jdsample-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jdsample-neon.c @@ -2,7 +2,7 @@ * jdsample-neon.c - upsampling (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. - * Copyright (C) 2020, D. R. 
Commander. All Rights Reserved. + * Copyright (C) 2020, 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -22,12 +22,13 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" +#include "neon-compat.h" #include <arm_neon.h> diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jfdctfst-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jfdctfst-neon.c index bb371be3999..d6109f11d34 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jfdctfst-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jfdctfst-neon.c @@ -2,6 +2,7 @@ * jfdctfst-neon.c - fast integer FDCT (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. 
In no event will the authors be held liable for any damages @@ -21,13 +22,14 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "align.h" +#include "neon-compat.h" #include <arm_neon.h> diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jfdctint-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jfdctint-neon.c index ccfc07b15d9..bb290ea45d2 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jfdctint-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jfdctint-neon.c @@ -2,7 +2,7 @@ * jfdctint-neon.c - accurate integer FDCT (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. - * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2020, 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -22,11 +22,11 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "align.h" #include "neon-compat.h" diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jidctfst-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jidctfst-neon.c index a91be5362eb..e789125344a 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jidctfst-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jidctfst-neon.c @@ -2,6 +2,7 @@ * jidctfst-neon.c - fast integer IDCT (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2024, D. R. 
Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -21,13 +22,14 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "align.h" +#include "neon-compat.h" #include <arm_neon.h> diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jidctint-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jidctint-neon.c index 043b652e6c5..709e0eaf4e9 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jidctint-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jidctint-neon.c @@ -2,7 +2,7 @@ * jidctint-neon.c - accurate integer IDCT (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. - * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2020, 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. 
In no event will the authors be held liable for any damages @@ -22,12 +22,11 @@ */ #define JPEG_INTERNALS -#include "jconfigint.h" -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "align.h" #include "neon-compat.h" diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jidctred-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jidctred-neon.c index be9627e61d4..25b1addc6a9 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jidctred-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jidctred-neon.c @@ -2,7 +2,7 @@ * jidctred-neon.c - reduced-size IDCT (Arm Neon) * * Copyright (C) 2020, Arm Limited. All Rights Reserved. - * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2020, 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -22,11 +22,11 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" #include "align.h" #include "neon-compat.h" diff --git a/contrib/libs/libjpeg-turbo/simd/arm/jquanti-neon.c b/contrib/libs/libjpeg-turbo/simd/arm/jquanti-neon.c index d5d95d89f67..e44fb3d4131 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/jquanti-neon.c +++ b/contrib/libs/libjpeg-turbo/simd/arm/jquanti-neon.c @@ -2,6 +2,7 @@ * jquanti-neon.c - sample data conversion and quantization (Arm Neon) * * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. 
+ * Copyright (C) 2024, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -21,12 +22,13 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" +#include "neon-compat.h" #include <arm_neon.h> diff --git a/contrib/libs/libjpeg-turbo/simd/arm/neon-compat.h b/contrib/libs/libjpeg-turbo/simd/arm/neon-compat.h index 069c62d8290..992aa5a4f97 100644 --- a/contrib/libs/libjpeg-turbo/simd/arm/neon-compat.h +++ b/contrib/libs/libjpeg-turbo/simd/arm/neon-compat.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2020, 2024, D. R. Commander. All Rights Reserved. * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. * * This software is provided 'as-is', without any express or implied @@ -35,3 +35,11 @@ #else #error "Unknown compiler" #endif + +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#pragma clang diagnostic ignored "-Wc99-extensions" +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wdeclaration-after-statement" +#pragma GCC diagnostic ignored "-Wpedantic" +#endif diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jccolext-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jccolext-avx2.asm index c46d684436d..28ac9528079 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jccolext-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jccolext-avx2.asm @@ -2,17 +2,13 @@ ; jccolext.asm - colorspace conversion (AVX2) ; ; Copyright (C) 2015, Intel Corporation. -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. 
; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jcolsamp.inc" @@ -49,15 +45,15 @@ EXTN(jsimd_rgb_ycc_convert_avx2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [img_width(eax)] test ecx, ecx @@ -80,9 +76,9 @@ EXTN(jsimd_rgb_ycc_convert_avx2): mov eax, INT [num_rows(eax)] test eax, eax jle near .return - alignx 16, 7 + ALIGNX 16, 7 .rowloop: - pushpic eax + PUSHPIC eax push edx push ebx push edi @@ -93,11 +89,11 @@ EXTN(jsimd_rgb_ycc_convert_avx2): mov edi, JSAMPROW [edi] ; outptr0 mov ebx, JSAMPROW [ebx] ; outptr1 mov edx, JSAMPROW [edx] ; outptr2 - movpic eax, POINTER [gotptr] ; load GOT address (eax) + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) cmp ecx, byte SIZEOF_YMMWORD jae near .columnloop - alignx 16, 7 + ALIGNX 16, 7 %if RGB_PIXELSIZE == 3 ; --------------- @@ -154,7 +150,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2): vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] jmp short .rgb_ycc_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] @@ -278,7 +274,7 @@ 
EXTN(jsimd_rgb_ycc_convert_avx2): vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] jmp short .rgb_ycc_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] @@ -552,7 +548,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2): pop edi pop ebx pop edx - poppic eax + POPPIC eax add esi, byte SIZEOF_JSAMPROW ; input_buf add edi, byte SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jccolext-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jccolext-mmx.asm index 6357a42b2cf..44b62512e91 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jccolext-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jccolext-mmx.asm @@ -2,17 +2,13 @@ ; jccolext.asm - colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -49,15 +45,15 @@ EXTN(jsimd_rgb_ycc_convert_mmx): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [img_width(eax)] ; num_cols test ecx, ecx @@ -80,9 +76,9 @@ EXTN(jsimd_rgb_ycc_convert_mmx): mov eax, INT [num_rows(eax)] test eax, eax jle near .return - alignx 16, 7 + ALIGNX 16, 7 .rowloop: - pushpic eax + PUSHPIC eax push edx push ebx push edi @@ -93,11 +89,11 @@ EXTN(jsimd_rgb_ycc_convert_mmx): mov edi, JSAMPROW [edi] ; outptr0 mov ebx, JSAMPROW [ebx] ; outptr1 mov edx, JSAMPROW [edx] ; outptr2 - movpic eax, POINTER [gotptr] ; load GOT address (eax) + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) cmp ecx, byte SIZEOF_MMWORD jae short .columnloop - alignx 16, 7 + ALIGNX 16, 7 %if RGB_PIXELSIZE == 3 ; --------------- @@ -143,7 +139,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx): movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] jmp short .rgb_ycc_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] @@ -211,7 +207,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx): movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] jmp short .rgb_ycc_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] @@ -449,7 +445,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx): pop edi pop ebx pop edx - poppic eax + POPPIC eax add esi, byte SIZEOF_JSAMPROW ; input_buf add edi, byte SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jccolext-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jccolext-sse2.asm index c6c80852ac5..1d8d5f5a205 100644 --- 
a/contrib/libs/libjpeg-turbo/simd/i386/jccolext-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jccolext-sse2.asm @@ -1,17 +1,13 @@ ; ; jccolext.asm - colorspace conversion (SSE2) ; -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jcolsamp.inc" @@ -48,15 +44,15 @@ EXTN(jsimd_rgb_ycc_convert_sse2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [img_width(eax)] test ecx, ecx @@ -79,9 +75,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2): mov eax, INT [num_rows(eax)] test eax, eax jle near .return - alignx 16, 7 + ALIGNX 16, 7 .rowloop: - pushpic eax + PUSHPIC eax push edx push ebx push edi @@ -92,11 +88,11 @@ EXTN(jsimd_rgb_ycc_convert_sse2): mov edi, JSAMPROW [edi] ; outptr0 mov ebx, JSAMPROW [ebx] ; outptr1 mov edx, JSAMPROW [edx] ; outptr2 - movpic eax, POINTER [gotptr] ; load GOT address (eax) + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) cmp ecx, byte SIZEOF_XMMWORD jae near .columnloop - alignx 16, 7 + ALIGNX 16, 7 %if RGB_PIXELSIZE == 3 ; --------------- @@ -147,7 +143,7 @@ 
EXTN(jsimd_rgb_ycc_convert_sse2): movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] jmp short .rgb_ycc_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] @@ -232,7 +228,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2): movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] jmp short .rgb_ycc_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] @@ -478,7 +474,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2): pop edi pop ebx pop edx - poppic eax + POPPIC eax add esi, byte SIZEOF_JSAMPROW ; input_buf add edi, byte SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jccolor-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jccolor-avx2.asm index 14944e952f1..9ad5ea95f80 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jccolor-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jccolor-avx2.asm @@ -1,18 +1,14 @@ ; ; jccolor.asm - colorspace conversion (AVX2) ; -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -33,7 +29,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_rgb_ycc_convert_avx2) EXTN(jconst_rgb_ycc_convert_avx2): @@ -46,7 +42,7 @@ PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \ (CENTERJSAMPLE << SCALEBITS) PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jccolor-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jccolor-mmx.asm index 8cb399bdc43..0dbec54817e 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jccolor-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jccolor-mmx.asm @@ -2,17 +2,13 @@ ; jccolor.asm - colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -33,7 +29,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_rgb_ycc_convert_mmx) EXTN(jconst_rgb_ycc_convert_mmx): @@ -46,7 +42,7 @@ PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS - 1)) - 1 + \ (CENTERJSAMPLE << SCALEBITS) PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1)) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jccolor-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jccolor-sse2.asm index 686d222ff70..678306a10c3 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jccolor-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jccolor-sse2.asm @@ -1,17 +1,13 @@ ; ; jccolor.asm - colorspace conversion (SSE2) ; -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -32,7 +28,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_rgb_ycc_convert_sse2) EXTN(jconst_rgb_ycc_convert_sse2): @@ -45,7 +41,7 @@ PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \ (CENTERJSAMPLE << SCALEBITS) PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jcgray-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jcgray-avx2.asm index 560ee0c71e2..ded39567df2 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jcgray-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jcgray-avx2.asm @@ -1,18 +1,14 @@ ; ; jcgray.asm - grayscale colorspace conversion (AVX2) ; -; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2011, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -29,7 +25,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_rgb_gray_convert_avx2) EXTN(jconst_rgb_gray_convert_avx2): @@ -38,7 +34,7 @@ PW_F0299_F0337 times 8 dw F_0_299, F_0_337 PW_F0114_F0250 times 8 dw F_0_114, F_0_250 PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jcgray-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jcgray-mmx.asm index 79fdf082a84..d6f031869a0 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jcgray-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jcgray-mmx.asm @@ -2,17 +2,13 @@ ; jcgray.asm - grayscale colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2011, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -29,7 +25,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_rgb_gray_convert_mmx) EXTN(jconst_rgb_gray_convert_mmx): @@ -38,7 +34,7 @@ PW_F0299_F0337 times 2 dw F_0_299, F_0_337 PW_F0114_F0250 times 2 dw F_0_114, F_0_250 PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1)) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jcgray-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jcgray-sse2.asm index cb4b28e8f49..ecc7fa08abb 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jcgray-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jcgray-sse2.asm @@ -1,17 +1,13 @@ ; ; jcgray.asm - grayscale colorspace conversion (SSE2) ; -; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2011, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -28,7 +24,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_rgb_gray_convert_sse2) EXTN(jconst_rgb_gray_convert_sse2): @@ -37,7 +33,7 @@ PW_F0299_F0337 times 4 dw F_0_299, F_0_337 PW_F0114_F0250 times 4 dw F_0_114, F_0_250 PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-avx2.asm index 3fa7973d72b..70df8f80ba4 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-avx2.asm @@ -1,18 +1,14 @@ ; ; jcgryext.asm - grayscale colorspace conversion (AVX2) ; -; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2011, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -49,15 +45,15 @@ EXTN(jsimd_rgb_gray_convert_avx2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [img_width(eax)] test ecx, ecx @@ -76,20 +72,20 @@ EXTN(jsimd_rgb_gray_convert_avx2): mov eax, INT [num_rows(eax)] test eax, eax jle near .return - alignx 16, 7 + ALIGNX 16, 7 .rowloop: - pushpic eax + PUSHPIC eax push edi push esi push ecx ; col mov esi, JSAMPROW [esi] ; inptr mov edi, JSAMPROW [edi] ; outptr0 - movpic eax, POINTER [gotptr] ; load GOT address (eax) + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) cmp ecx, byte SIZEOF_YMMWORD jae near .columnloop - alignx 16, 7 + ALIGNX 16, 7 %if RGB_PIXELSIZE == 3 ; --------------- @@ -146,7 +142,7 @@ EXTN(jsimd_rgb_gray_convert_avx2): vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] jmp short .rgb_gray_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] @@ -270,7 +266,7 @@ EXTN(jsimd_rgb_gray_convert_avx2): vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] jmp short .rgb_gray_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] @@ -433,7 +429,7 @@ EXTN(jsimd_rgb_gray_convert_avx2): pop ecx ; col pop esi pop edi - poppic eax + POPPIC eax add esi, byte SIZEOF_JSAMPROW ; input_buf add edi, byte SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-mmx.asm index 8af42e5a332..dd90c3dfb08 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-mmx.asm +++ 
b/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-mmx.asm @@ -2,17 +2,13 @@ ; jcgryext.asm - grayscale colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2011, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jcolsamp.inc" @@ -49,15 +45,15 @@ EXTN(jsimd_rgb_gray_convert_mmx): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [img_width(eax)] ; num_cols test ecx, ecx @@ -76,20 +72,20 @@ EXTN(jsimd_rgb_gray_convert_mmx): mov eax, INT [num_rows(eax)] test eax, eax jle near .return - alignx 16, 7 + ALIGNX 16, 7 .rowloop: - pushpic eax + PUSHPIC eax push edi push esi push ecx ; col mov esi, JSAMPROW [esi] ; inptr mov edi, JSAMPROW [edi] ; outptr0 - movpic eax, POINTER [gotptr] ; load GOT address (eax) + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) cmp ecx, byte SIZEOF_MMWORD jae short .columnloop - alignx 16, 7 + ALIGNX 16, 7 %if RGB_PIXELSIZE == 3 ; --------------- @@ -135,7 +131,7 @@ EXTN(jsimd_rgb_gray_convert_mmx): movq mmA, MMWORD 
[esi+0*SIZEOF_MMWORD] movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] jmp short .rgb_gray_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] @@ -203,7 +199,7 @@ EXTN(jsimd_rgb_gray_convert_mmx): movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] jmp short .rgb_gray_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] @@ -330,7 +326,7 @@ EXTN(jsimd_rgb_gray_convert_mmx): pop ecx ; col pop esi pop edi - poppic eax + POPPIC eax add esi, byte SIZEOF_JSAMPROW ; input_buf add edi, byte SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-sse2.asm index c9d6ff1e351..227295f3072 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jcgryext-sse2.asm @@ -1,17 +1,13 @@ ; ; jcgryext.asm - grayscale colorspace conversion (SSE2) ; -; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2011, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -48,15 +44,15 @@ EXTN(jsimd_rgb_gray_convert_sse2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [img_width(eax)] test ecx, ecx @@ -75,20 +71,20 @@ EXTN(jsimd_rgb_gray_convert_sse2): mov eax, INT [num_rows(eax)] test eax, eax jle near .return - alignx 16, 7 + ALIGNX 16, 7 .rowloop: - pushpic eax + PUSHPIC eax push edi push esi push ecx ; col mov esi, JSAMPROW [esi] ; inptr mov edi, JSAMPROW [edi] ; outptr0 - movpic eax, POINTER [gotptr] ; load GOT address (eax) + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) cmp ecx, byte SIZEOF_XMMWORD jae near .columnloop - alignx 16, 7 + ALIGNX 16, 7 %if RGB_PIXELSIZE == 3 ; --------------- @@ -139,7 +135,7 @@ EXTN(jsimd_rgb_gray_convert_sse2): movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] jmp short .rgb_gray_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] @@ -224,7 +220,7 @@ EXTN(jsimd_rgb_gray_convert_sse2): movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] jmp short .rgb_gray_cnv - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] @@ -359,7 +355,7 @@ EXTN(jsimd_rgb_gray_convert_sse2): pop ecx ; col pop esi pop edi - poppic eax + POPPIC eax add esi, byte SIZEOF_JSAMPROW ; input_buf add edi, byte SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jchuff-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jchuff-sse2.asm index 278cf5e83af..ed194dd383d 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jchuff-sse2.asm +++ 
b/contrib/libs/libjpeg-turbo/simd/i386/jchuff-sse2.asm @@ -1,7 +1,7 @@ ; ; jchuff-sse2.asm - Huffman entropy encoding (SSE2) ; -; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander. +; Copyright (C) 2009-2011, 2014-2017, 2019, 2024, D. R. Commander. ; Copyright (C) 2015, Matthieu Darbois. ; Copyright (C) 2018, Matthias Räncker. ; @@ -9,11 +9,7 @@ ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains an SSE2 implementation for Huffman coding of one block. ; The following code is based on jchuff.c; see jchuff.c for more details. 
@@ -42,7 +38,7 @@ endstruc EXTN(jconst_huff_encode_one_block): - alignz 32 + ALIGNZ 32 jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007 dq 0x000f, 0x001f, 0x003f, 0x007f @@ -65,7 +61,8 @@ times 1 << 2 db 3 times 1 << 1 db 2 times 1 << 0 db 1 times 1 db 0 -jpeg_nbits_table: +GLOBAL_DATA(jpeg_nbits_table) +EXTN(jpeg_nbits_table): times 1 db 0 times 1 << 0 db 1 times 1 << 1 db 2 @@ -83,14 +80,14 @@ times 1 << 12 db 13 times 1 << 13 db 14 times 1 << 14 db 15 - alignz 32 + ALIGNZ 32 %ifdef PIC %define NBITS(x) nbits_base + x %else -%define NBITS(x) jpeg_nbits_table + x +%define NBITS(x) EXTN(jpeg_nbits_table) + x %endif -%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table) +%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - EXTN(jpeg_nbits_table)) ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -235,7 +232,7 @@ times 1 << 14 db 15 ; If PIC is defined, load the address of a symbol defined in this file into a ; register. Equivalent to -; get_GOT %1 +; GET_GOT %1 ; lea %1, [GOTOFF(%1, %2)] ; without using the GOT. ; @@ -469,7 +466,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF; %endmacro - GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER + GET_SYM nbits_base, EXTN(jpeg_nbits_table), GET_SYM_BEFORE, GET_SYM_AFTER psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 -- shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59 diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jcphuff-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jcphuff-sse2.asm index c26b48a47d8..19a183fcd83 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jcphuff-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jcphuff-sse2.asm @@ -7,11 +7,7 @@ ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains an SSE2 implementation of data preparation for progressive ; Huffman encoding. See jcphuff.c for more details. diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jcsample-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jcsample-avx2.asm index 0a20802dd89..5019829c9ae 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jcsample-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jcsample-avx2.asm @@ -3,17 +3,13 @@ ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB ; Copyright (C) 2015, Intel Corporation. -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -70,7 +66,7 @@ EXTN(jsimd_h2v1_downsample_avx2): cld mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16, 7 + ALIGNX 16, 7 .expandloop: push eax push ecx @@ -106,7 +102,7 @@ EXTN(jsimd_h2v1_downsample_avx2): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push ecx push edi @@ -117,7 +113,7 @@ EXTN(jsimd_h2v1_downsample_avx2): cmp ecx, byte SIZEOF_YMMWORD jae short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .columnloop_r24: ; ecx can possibly be 8, 16, 24 @@ -141,7 +137,7 @@ EXTN(jsimd_h2v1_downsample_avx2): vpxor ymm1, ymm1, ymm1 mov ecx, SIZEOF_YMMWORD jmp short .downsample - alignx 16, 7 + ALIGNX 16, 7 .columnloop: vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] @@ -243,7 +239,7 @@ EXTN(jsimd_h2v2_downsample_avx2): cld mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16, 7 + ALIGNX 16, 7 .expandloop: push eax push ecx @@ -279,7 +275,7 @@ EXTN(jsimd_h2v2_downsample_avx2): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push ecx push edi @@ -291,7 +287,7 @@ EXTN(jsimd_h2v2_downsample_avx2): cmp ecx, byte SIZEOF_YMMWORD jae short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .columnloop_r24: cmp ecx, 24 @@ -320,7 +316,7 @@ EXTN(jsimd_h2v2_downsample_avx2): vpxor ymm3, ymm3, ymm3 mov ecx, SIZEOF_YMMWORD jmp short .downsample - alignx 16, 7 + ALIGNX 16, 7 .columnloop: vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD] diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jcsample-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jcsample-mmx.asm index 2c223eebe81..94dd88870a3 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jcsample-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jcsample-mmx.asm @@ -2,17 +2,13 @@ ; jcsample.asm - downsampling (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. 
Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" @@ -69,7 +65,7 @@ EXTN(jsimd_h2v1_downsample_mmx): cld mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16, 7 + ALIGNX 16, 7 .expandloop: push eax push ecx @@ -104,7 +100,7 @@ EXTN(jsimd_h2v1_downsample_mmx): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push ecx push edi @@ -112,7 +108,7 @@ EXTN(jsimd_h2v1_downsample_mmx): mov esi, JSAMPROW [esi] ; inptr mov edi, JSAMPROW [edi] ; outptr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] @@ -212,7 +208,7 @@ EXTN(jsimd_h2v2_downsample_mmx): cld mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16, 7 + ALIGNX 16, 7 .expandloop: push eax push ecx @@ -247,7 +243,7 @@ EXTN(jsimd_h2v2_downsample_mmx): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push ecx push edi @@ -256,7 +252,7 @@ EXTN(jsimd_h2v2_downsample_mmx): mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 mov edi, JSAMPROW [edi] ; outptr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jcsample-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jcsample-sse2.asm 
index 4fea60d2e21..eb8808bea84 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jcsample-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jcsample-sse2.asm @@ -2,17 +2,13 @@ ; jcsample.asm - downsampling (SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" @@ -69,7 +65,7 @@ EXTN(jsimd_h2v1_downsample_sse2): cld mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16, 7 + ALIGNX 16, 7 .expandloop: push eax push ecx @@ -104,7 +100,7 @@ EXTN(jsimd_h2v1_downsample_sse2): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push ecx push edi @@ -115,14 +111,14 @@ EXTN(jsimd_h2v1_downsample_sse2): cmp ecx, byte SIZEOF_XMMWORD jae short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .columnloop_r8: movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] pxor xmm1, xmm1 mov ecx, SIZEOF_XMMWORD jmp short .downsample - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] @@ -225,7 +221,7 @@ EXTN(jsimd_h2v2_downsample_sse2): cld mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16, 7 + ALIGNX 16, 7 .expandloop: push eax push ecx @@ -260,7 +256,7 @@ EXTN(jsimd_h2v2_downsample_sse2): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, JSAMPARRAY [output_data(ebp)] ; 
output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push ecx push edi @@ -272,7 +268,7 @@ EXTN(jsimd_h2v2_downsample_sse2): cmp ecx, byte SIZEOF_XMMWORD jae short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .columnloop_r8: movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] @@ -281,7 +277,7 @@ EXTN(jsimd_h2v2_downsample_sse2): pxor xmm3, xmm3 mov ecx, SIZEOF_XMMWORD jmp short .downsample - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-avx2.asm index 015be0416c5..fd79b79568e 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-avx2.asm @@ -2,18 +2,14 @@ ; jdcolext.asm - colorspace conversion (AVX2) ; ; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2012, 2016, D. R. Commander. +; Copyright (C) 2012, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -50,15 +46,15 @@ EXTN(jsimd_ycc_rgb_convert_avx2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [out_width(eax)] ; num_cols test ecx, ecx @@ -81,7 +77,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2): mov eax, INT [num_rows(eax)] test eax, eax jle near .return - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push eax push edi @@ -94,8 +90,8 @@ EXTN(jsimd_ycc_rgb_convert_avx2): mov ebx, JSAMPROW [ebx] ; inptr1 mov edx, JSAMPROW [edx] ; inptr2 mov edi, JSAMPROW [edi] ; outptr - movpic eax, POINTER [gotptr] ; load GOT address (eax) - alignx 16, 7 + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) + ALIGNX 16, 7 .columnloop: vmovdqu ymm5, YMMWORD [ebx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) @@ -295,7 +291,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2): add ebx, byte SIZEOF_YMMWORD ; inptr1 add edx, byte SIZEOF_YMMWORD ; inptr2 jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st64: lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE @@ -436,7 +432,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2): add ebx, byte SIZEOF_YMMWORD ; inptr1 add edx, byte SIZEOF_YMMWORD ; inptr2 jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st64: cmp ecx, byte SIZEOF_YMMWORD/2 @@ -479,7 +475,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2): %endif ; RGB_PIXELSIZE ; --------------- - alignx 16, 7 + ALIGNX 16, 7 .nextrow: pop ecx diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-mmx.asm index 5813cfcb66f..636bd6d3fdc 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-mmx.asm @@ 
-2,17 +2,13 @@ ; jdcolext.asm - colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jcolsamp.inc" @@ -49,15 +45,15 @@ EXTN(jsimd_ycc_rgb_convert_mmx): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [out_width(eax)] ; num_cols test ecx, ecx @@ -80,7 +76,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx): mov eax, INT [num_rows(eax)] test eax, eax jle near .return - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push eax push edi @@ -93,8 +89,8 @@ EXTN(jsimd_ycc_rgb_convert_mmx): mov ebx, JSAMPROW [ebx] ; inptr1 mov edx, JSAMPROW [edx] ; inptr2 mov edi, JSAMPROW [edi] ; outptr - movpic eax, POINTER [gotptr] ; load GOT address (eax) - alignx 16, 7 + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) + ALIGNX 16, 7 .columnloop: movq mm5, MMWORD [ebx] ; mm5=Cb(01234567) @@ -255,7 +251,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx): add edx, byte SIZEOF_MMWORD ; inptr2 add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr jmp near .columnloop 
- alignx 16, 7 + ALIGNX 16, 7 .column_st16: lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE @@ -344,7 +340,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx): add edx, byte SIZEOF_MMWORD ; inptr2 add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st16: cmp ecx, byte SIZEOF_MMWORD/2 @@ -369,7 +365,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx): %endif ; RGB_PIXELSIZE ; --------------- - alignx 16, 7 + ALIGNX 16, 7 .nextrow: pop ecx diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-sse2.asm index d5572b32946..0150f2cb69c 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdcolext-sse2.asm @@ -2,17 +2,13 @@ ; jdcolext.asm - colorspace conversion (SSE2) ; ; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2012, 2016, D. R. Commander. +; Copyright (C) 2012, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -49,15 +45,15 @@ EXTN(jsimd_ycc_rgb_convert_sse2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [out_width(eax)] ; num_cols test ecx, ecx @@ -80,7 +76,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): mov eax, INT [num_rows(eax)] test eax, eax jle near .return - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push eax push edi @@ -93,8 +89,8 @@ EXTN(jsimd_ycc_rgb_convert_sse2): mov ebx, JSAMPROW [ebx] ; inptr1 mov edx, JSAMPROW [edx] ; inptr2 mov edi, JSAMPROW [edi] ; outptr - movpic eax, POINTER [gotptr] ; load GOT address (eax) - alignx 16, 7 + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) + ALIGNX 16, 7 .columnloop: movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) @@ -275,7 +271,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): add ebx, byte SIZEOF_XMMWORD ; inptr1 add edx, byte SIZEOF_XMMWORD ; inptr2 jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st32: lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE @@ -387,7 +383,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): add ebx, byte SIZEOF_XMMWORD ; inptr1 add edx, byte SIZEOF_XMMWORD ; inptr2 jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st32: cmp ecx, byte SIZEOF_XMMWORD/2 @@ -423,7 +419,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): %endif ; RGB_PIXELSIZE ; --------------- - alignx 16, 7 + ALIGNX 16, 7 .nextrow: pop ecx diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-avx2.asm index e05b60d0017..d3a30d63a71 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-avx2.asm @@ -3,17 +3,13 @@ ; ; 
Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB ; Copyright (C) 2015, Intel Corporation. -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" @@ -32,7 +28,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_ycc_rgb_convert_avx2) EXTN(jconst_ycc_rgb_convert_avx2): @@ -43,7 +39,7 @@ PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 PW_ONE times 16 dw 1 PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-mmx.asm index fb7e7bcce4b..6e67e4b72ea 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-mmx.asm @@ -2,17 +2,13 @@ ; jdcolor.asm - colorspace conversion (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" @@ -31,7 +27,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_ycc_rgb_convert_mmx) EXTN(jconst_ycc_rgb_convert_mmx): @@ -42,7 +38,7 @@ PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 PW_ONE times 4 dw 1 PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-sse2.asm index b736255317e..79c9c6821a4 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdcolor-sse2.asm @@ -2,17 +2,13 @@ ; jdcolor.asm - colorspace conversion (SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" @@ -31,7 +27,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_ycc_rgb_convert_sse2) EXTN(jconst_ycc_rgb_convert_sse2): @@ -42,7 +38,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-avx2.asm index 711e6792d0f..90493fd023b 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-avx2.asm @@ -2,18 +2,14 @@ ; jdmerge.asm - merged upsampling/color conversion (AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -32,7 +28,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_merged_upsample_avx2) EXTN(jconst_merged_upsample_avx2): @@ -43,7 +39,7 @@ PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 PW_ONE times 16 dw 1 PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-mmx.asm index 6e8311d4081..0dc204aa8b4 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-mmx.asm @@ -2,17 +2,13 @@ ; jdmerge.asm - merged upsampling/color conversion (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -31,7 +27,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_merged_upsample_mmx) EXTN(jconst_merged_upsample_mmx): @@ -42,7 +38,7 @@ PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 PW_ONE times 4 dw 1 PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-sse2.asm index e32f90aa177..06f07627421 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdmerge-sse2.asm @@ -2,17 +2,13 @@ ; jdmerge.asm - merged upsampling/color conversion (SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -31,7 +27,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_merged_upsample_sse2) EXTN(jconst_merged_upsample_sse2): @@ -42,7 +38,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-avx2.asm index e35f7282bc4..a7aa930e346 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-avx2.asm @@ -2,18 +2,14 @@ ; jdmrgext.asm - merged upsampling/color conversion (AVX2) ; ; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2012, 2016, D. R. Commander. +; Copyright (C) 2012, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -50,15 +46,15 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [output_width(eax)] ; col test ecx, ecx @@ -79,9 +75,9 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): pop ecx ; col - alignx 16, 7 + ALIGNX 16, 7 .columnloop: - movpic eax, POINTER [gotptr] ; load GOT address (eax) + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) vmovdqu ymm6, YMMWORD [ebx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) vmovdqu ymm7, YMMWORD [edx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV) @@ -168,13 +164,13 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): mov al, 2 ; Yctr jmp short .Yloop_1st - alignx 16, 7 + ALIGNX 16, 7 .Yloop_2nd: vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H - alignx 16, 7 + ALIGNX 16, 7 .Yloop_1st: vmovdqu ymm7, YMMWORD [esi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV) @@ -301,7 +297,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): add ebx, byte SIZEOF_YMMWORD ; inptr1 add edx, byte SIZEOF_YMMWORD ; inptr2 jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st64: lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE @@ -445,7 +441,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): add ebx, byte SIZEOF_YMMWORD ; inptr1 add edx, byte SIZEOF_YMMWORD ; inptr2 jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st64: cmp ecx, byte SIZEOF_YMMWORD/2 diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-mmx.asm index eb3e36b4759..562758146c6 100644 --- 
a/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-mmx.asm @@ -2,17 +2,13 @@ ; jdmrgext.asm - merged upsampling/color conversion (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jcolsamp.inc" @@ -47,15 +43,15 @@ EXTN(jsimd_h2v1_merged_upsample_mmx): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [output_width(eax)] ; col test ecx, ecx @@ -76,9 +72,9 @@ EXTN(jsimd_h2v1_merged_upsample_mmx): pop ecx ; col - alignx 16, 7 + ALIGNX 16, 7 .columnloop: - movpic eax, POINTER [gotptr] ; load GOT address (eax) + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) movq mm6, MMWORD [ebx] ; mm6=Cb(01234567) movq mm7, MMWORD [edx] ; mm7=Cr(01234567) @@ -171,13 +167,13 @@ EXTN(jsimd_h2v1_merged_upsample_mmx): mov al, 2 ; Yctr jmp short .Yloop_1st - alignx 16, 7 + ALIGNX 16, 7 .Yloop_2nd: movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H movq mm4, MMWORD 
[wk(0)] ; mm4=(B-Y)H - alignx 16, 7 + ALIGNX 16, 7 .Yloop_1st: movq mm7, MMWORD [esi] ; mm7=Y(01234567) @@ -258,7 +254,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx): add ebx, byte SIZEOF_MMWORD ; inptr1 add edx, byte SIZEOF_MMWORD ; inptr2 jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st16: lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE @@ -350,7 +346,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx): add ebx, byte SIZEOF_MMWORD ; inptr1 add edx, byte SIZEOF_MMWORD ; inptr2 jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st16: cmp ecx, byte SIZEOF_MMWORD/2 diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-sse2.asm index c113dc4d27e..13e7d980fa6 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdmrgext-sse2.asm @@ -2,17 +2,13 @@ ; jdmrgext.asm - merged upsampling/color conversion (SSE2) ; ; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2012, 2016, D. R. Commander. +; Copyright (C) 2012, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -49,15 +45,15 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov ecx, JDIMENSION [output_width(eax)] ; col test ecx, ecx @@ -78,9 +74,9 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): pop ecx ; col - alignx 16, 7 + ALIGNX 16, 7 .columnloop: - movpic eax, POINTER [gotptr] ; load GOT address (eax) + MOVPIC eax, POINTER [gotptr] ; load GOT address (eax) movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) @@ -173,13 +169,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): mov al, 2 ; Yctr jmp short .Yloop_1st - alignx 16, 7 + ALIGNX 16, 7 .Yloop_2nd: movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H - alignx 16, 7 + ALIGNX 16, 7 .Yloop_1st: movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) @@ -280,7 +276,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): add ebx, byte SIZEOF_XMMWORD ; inptr1 add edx, byte SIZEOF_XMMWORD ; inptr2 jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st32: lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE @@ -395,7 +391,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): add ebx, byte SIZEOF_XMMWORD ; inptr1 add edx, byte SIZEOF_XMMWORD ; inptr2 jmp near .columnloop - alignx 16, 7 + ALIGNX 16, 7 .column_st32: cmp ecx, byte SIZEOF_XMMWORD/2 diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdsample-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdsample-avx2.asm index a800c35e083..eba53ef7574 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdsample-avx2.asm +++ 
b/contrib/libs/libjpeg-turbo/simd/i386/jdsample-avx2.asm @@ -3,24 +3,20 @@ ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB ; Copyright (C) 2015, Intel Corporation. -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fancy_upsample_avx2) EXTN(jconst_fancy_upsample_avx2): @@ -31,7 +27,7 @@ PW_THREE times 16 dw 3 PW_SEVEN times 16 dw 7 PW_EIGHT times 16 dw 8 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -62,13 +58,13 @@ PW_EIGHT times 16 dw 8 EXTN(jsimd_h2v1_fancy_upsample_avx2): push ebp mov ebp, esp - pushpic ebx + PUSHPIC ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr test eax, eax @@ -81,7 +77,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, POINTER [output_data_ptr(ebp)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push eax ; colctr push edi @@ -104,7 +100,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2): and eax, byte -SIZEOF_YMMWORD cmp eax, byte SIZEOF_YMMWORD ja short .columnloop - alignx 16, 
7 + ALIGNX 16, 7 .columnloop_last: vpcmpeqb xmm6, xmm6, xmm6 @@ -112,7 +108,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2): vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... ---- ---- ff) MSB is ff vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD] jmp short .upsample - alignx 16, 7 + ALIGNX 16, 7 .columnloop: vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD] @@ -196,7 +192,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2): pop esi ; pop edx ; need not be preserved ; pop ecx ; need not be preserved - poppic ebx + POPPIC ebx pop ebp ret @@ -234,15 +230,15 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov edx, eax ; edx = original ebp mov eax, JDIMENSION [downsamp_width(edx)] ; colctr @@ -256,7 +252,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): mov esi, JSAMPARRAY [input_data(edx)] ; input_data mov edi, POINTER [output_data_ptr(edx)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push eax ; colctr push ecx @@ -286,8 +282,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0] vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + PUSHPIC ebx + MOVPIC ebx, POINTER [gotptr] ; load GOT address vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's) @@ -328,19 +324,19 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): vmovdqa YMMWORD [wk(0)], ymm1 vmovdqa YMMWORD [wk(1)], ymm2 - poppic ebx + POPPIC ebx add eax, byte SIZEOF_YMMWORD-1 and eax, byte -SIZEOF_YMMWORD cmp eax, byte SIZEOF_YMMWORD ja short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .columnloop_last: ; -- 
process the last column block - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + PUSHPIC ebx + MOVPIC ebx, POINTER [gotptr] ; load GOT address vpcmpeqb xmm1, xmm1, xmm1 vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2) @@ -353,7 +349,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31) jmp near .upsample - alignx 16, 7 + ALIGNX 16, 7 .columnloop: ; -- process the next column block @@ -362,8 +358,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1] vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + PUSHPIC ebx + MOVPIC ebx, POINTER [gotptr] ; load GOT address vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's) @@ -516,7 +512,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1 vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0 - poppic ebx + POPPIC ebx sub eax, byte SIZEOF_YMMWORD add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above) @@ -590,7 +586,7 @@ EXTN(jsimd_h2v1_upsample_avx2): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, POINTER [output_data_ptr(ebp)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push edi push esi @@ -598,7 +594,7 @@ EXTN(jsimd_h2v1_upsample_avx2): mov esi, JSAMPROW [esi] ; inptr mov edi, JSAMPROW [edi] ; outptr mov eax, edx ; colctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: cmp eax, byte SIZEOF_YMMWORD @@ -629,7 +625,7 @@ EXTN(jsimd_h2v1_upsample_avx2): add esi, byte SIZEOF_YMMWORD ; inptr add edi, byte 2*SIZEOF_YMMWORD ; outptr jmp short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .nextrow: pop esi @@ -689,7 +685,7 @@ EXTN(jsimd_h2v2_upsample_avx2): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, POINTER [output_data_ptr(ebp)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push edi push esi @@ -698,7 +694,7 @@ 
EXTN(jsimd_h2v2_upsample_avx2): mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 mov eax, edx ; colctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: cmp eax, byte SIZEOF_YMMWORD @@ -734,7 +730,7 @@ EXTN(jsimd_h2v2_upsample_avx2): add ebx, 2*SIZEOF_YMMWORD ; outptr0 add edi, 2*SIZEOF_YMMWORD ; outptr1 jmp short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .nextrow: pop esi diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdsample-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdsample-mmx.asm index 12c49f0eab5..01d09e62d10 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdsample-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdsample-mmx.asm @@ -2,24 +2,20 @@ ; jdsample.asm - upsampling (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fancy_upsample_mmx) EXTN(jconst_fancy_upsample_mmx): @@ -30,7 +26,7 @@ PW_THREE times 4 dw 3 PW_SEVEN times 4 dw 7 PW_EIGHT times 4 dw 8 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -61,13 +57,13 @@ PW_EIGHT times 4 dw 8 EXTN(jsimd_h2v1_fancy_upsample_mmx): push ebp mov ebp, esp - pushpic ebx + PUSHPIC ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr test eax, eax @@ -80,7 +76,7 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, POINTER [output_data_ptr(ebp)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push eax ; colctr push edi @@ -103,14 +99,14 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx): and eax, byte -SIZEOF_MMWORD cmp eax, byte SIZEOF_MMWORD ja short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .columnloop_last: pcmpeqb mm6, mm6 psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] jmp short .upsample - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] @@ -187,7 +183,7 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx): pop esi ; pop edx ; need not be preserved ; pop ecx ; need not be preserved - poppic ebx + POPPIC ebx pop ebp ret @@ -224,15 +220,15 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC 
POINTER [gotptr], ebx ; save GOT address mov edx, eax ; edx = original ebp mov eax, JDIMENSION [downsamp_width(edx)] ; colctr @@ -246,7 +242,7 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx): mov esi, JSAMPARRAY [input_data(edx)] ; input_data mov edi, POINTER [output_data_ptr(edx)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push eax ; colctr push ecx @@ -276,8 +272,8 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx): movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0] movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + PUSHPIC ebx + MOVPIC ebx, POINTER [gotptr] ; load GOT address pxor mm3, mm3 ; mm3=(all 0's) movq mm4, mm0 @@ -312,19 +308,19 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx): movq MMWORD [wk(0)], mm1 movq MMWORD [wk(1)], mm2 - poppic ebx + POPPIC ebx add eax, byte SIZEOF_MMWORD-1 and eax, byte -SIZEOF_MMWORD cmp eax, byte SIZEOF_MMWORD ja short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .columnloop_last: ; -- process the last column block - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + PUSHPIC ebx + MOVPIC ebx, POINTER [gotptr] ; load GOT address pcmpeqb mm1, mm1 psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT @@ -337,7 +333,7 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx): movq MMWORD [wk(3)], mm2 jmp short .upsample - alignx 16, 7 + ALIGNX 16, 7 .columnloop: ; -- process the next column block @@ -346,8 +342,8 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx): movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1] movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + PUSHPIC ebx + MOVPIC ebx, POINTER [gotptr] ; load GOT address pxor mm3, mm3 ; mm3=(all 0's) movq mm4, mm0 @@ -486,7 +482,7 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx): movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 movq MMWORD [edi+1*SIZEOF_MMWORD], mm0 - poppic ebx + POPPIC ebx sub eax, byte SIZEOF_MMWORD add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above) @@ -561,7 
+557,7 @@ EXTN(jsimd_h2v1_upsample_mmx): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, POINTER [output_data_ptr(ebp)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push edi push esi @@ -569,7 +565,7 @@ EXTN(jsimd_h2v1_upsample_mmx): mov esi, JSAMPROW [esi] ; inptr mov edi, JSAMPROW [edi] ; outptr mov eax, edx ; colctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] @@ -599,7 +595,7 @@ EXTN(jsimd_h2v1_upsample_mmx): add esi, byte 2*SIZEOF_MMWORD ; inptr add edi, byte 4*SIZEOF_MMWORD ; outptr jmp short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .nextrow: pop esi @@ -660,7 +656,7 @@ EXTN(jsimd_h2v2_upsample_mmx): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, POINTER [output_data_ptr(ebp)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push edi push esi @@ -669,7 +665,7 @@ EXTN(jsimd_h2v2_upsample_mmx): mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 mov eax, edx ; colctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] @@ -704,7 +700,7 @@ EXTN(jsimd_h2v2_upsample_mmx): add ebx, byte 4*SIZEOF_MMWORD ; outptr0 add edi, byte 4*SIZEOF_MMWORD ; outptr1 jmp short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .nextrow: pop esi diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jdsample-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jdsample-sse2.asm index 4e28d2f4b80..b10d9227987 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jdsample-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jdsample-sse2.asm @@ -2,24 +2,20 @@ ; jdsample.asm - upsampling (SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fancy_upsample_sse2) EXTN(jconst_fancy_upsample_sse2): @@ -30,7 +26,7 @@ PW_THREE times 8 dw 3 PW_SEVEN times 8 dw 7 PW_EIGHT times 8 dw 8 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -61,13 +57,13 @@ PW_EIGHT times 8 dw 8 EXTN(jsimd_h2v1_fancy_upsample_sse2): push ebp mov ebp, esp - pushpic ebx + PUSHPIC ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr test eax, eax @@ -80,7 +76,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, POINTER [output_data_ptr(ebp)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push eax ; colctr push edi @@ -103,14 +99,14 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): and eax, byte -SIZEOF_XMMWORD cmp eax, byte SIZEOF_XMMWORD ja short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .columnloop_last: pcmpeqb xmm6, xmm6 pslldq xmm6, (SIZEOF_XMMWORD-1) pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] jmp short .upsample - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] @@ -185,7 +181,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): pop esi ; pop edx ; need not be preserved ; pop ecx ; need not be preserved - 
poppic ebx + POPPIC ebx pop ebp ret @@ -223,15 +219,15 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic eax ; make a room for GOT address + PUSHPIC eax ; make a room for GOT address push ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address + GET_GOT ebx ; get GOT address + MOVPIC POINTER [gotptr], ebx ; save GOT address mov edx, eax ; edx = original ebp mov eax, JDIMENSION [downsamp_width(edx)] ; colctr @@ -245,7 +241,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): mov esi, JSAMPARRAY [input_data(edx)] ; input_data mov edi, POINTER [output_data_ptr(edx)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push eax ; colctr push ecx @@ -275,8 +271,8 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + PUSHPIC ebx + MOVPIC ebx, POINTER [gotptr] ; load GOT address pxor xmm3, xmm3 ; xmm3=(all 0's) movdqa xmm4, xmm0 @@ -311,19 +307,19 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): movdqa XMMWORD [wk(0)], xmm1 movdqa XMMWORD [wk(1)], xmm2 - poppic ebx + POPPIC ebx add eax, byte SIZEOF_XMMWORD-1 and eax, byte -SIZEOF_XMMWORD cmp eax, byte SIZEOF_XMMWORD ja short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .columnloop_last: ; -- process the last column block - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + PUSHPIC ebx + MOVPIC ebx, POINTER [gotptr] ; load GOT address pcmpeqb xmm1, xmm1 pslldq xmm1, (SIZEOF_XMMWORD-2) @@ -336,7 +332,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) jmp near .upsample - alignx 16, 7 + ALIGNX 16, 7 .columnloop: ; -- process the next column block @@ -345,8 +341,8 @@ 
EXTN(jsimd_h2v2_fancy_upsample_sse2): movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + PUSHPIC ebx + MOVPIC ebx, POINTER [gotptr] ; load GOT address pxor xmm3, xmm3 ; xmm3=(all 0's) movdqa xmm4, xmm0 @@ -485,7 +481,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 - poppic ebx + POPPIC ebx sub eax, byte SIZEOF_XMMWORD add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) @@ -558,7 +554,7 @@ EXTN(jsimd_h2v1_upsample_sse2): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, POINTER [output_data_ptr(ebp)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push edi push esi @@ -566,7 +562,7 @@ EXTN(jsimd_h2v1_upsample_sse2): mov esi, JSAMPROW [esi] ; inptr mov edi, JSAMPROW [edi] ; outptr mov eax, edx ; colctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] @@ -596,7 +592,7 @@ EXTN(jsimd_h2v1_upsample_sse2): add esi, byte 2*SIZEOF_XMMWORD ; inptr add edi, byte 4*SIZEOF_XMMWORD ; outptr jmp short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .nextrow: pop esi @@ -655,7 +651,7 @@ EXTN(jsimd_h2v2_upsample_sse2): mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov edi, POINTER [output_data_ptr(ebp)] mov edi, JSAMPARRAY [edi] ; output_data - alignx 16, 7 + ALIGNX 16, 7 .rowloop: push edi push esi @@ -664,7 +660,7 @@ EXTN(jsimd_h2v2_upsample_sse2): mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 mov eax, edx ; colctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] @@ -699,7 +695,7 @@ EXTN(jsimd_h2v2_upsample_sse2): add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 add edi, byte 4*SIZEOF_XMMWORD ; outptr1 jmp short .columnloop - alignx 16, 7 + ALIGNX 16, 7 .nextrow: pop esi diff --git 
a/contrib/libs/libjpeg-turbo/simd/i386/jfdctflt-3dn.asm b/contrib/libs/libjpeg-turbo/simd/i386/jfdctflt-3dn.asm index 322ab163252..0cedc6caf40 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jfdctflt-3dn.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jfdctflt-3dn.asm @@ -2,17 +2,13 @@ ; jfdctflt.asm - floating-point FDCT (3DNow!) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a floating-point implementation of the forward DCT ; (Discrete Cosine Transform). 
The following code is based directly on @@ -24,7 +20,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_float_3dnow) EXTN(jconst_fdct_float_3dnow): @@ -34,7 +30,7 @@ PD_0_707 times 2 dd 0.707106781186547524400844 PD_0_541 times 2 dd 0.541196100146196984399723 PD_1_306 times 2 dd 1.306562964876376527856643 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -63,19 +59,19 @@ EXTN(jsimd_fdct_float_3dnow): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic ebx + PUSHPIC ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved ; push esi ; unused ; push edi ; unused - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process rows. mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov ecx, DCTSIZE/2 - alignx 16, 7 + ALIGNX 16, 7 .rowloop: movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] @@ -190,7 +186,7 @@ EXTN(jsimd_fdct_float_3dnow): mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov ecx, DCTSIZE/2 - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] @@ -307,7 +303,7 @@ EXTN(jsimd_fdct_float_3dnow): ; pop esi ; unused ; pop edx ; need not be preserved ; pop ecx ; need not be preserved - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jfdctflt-sse.asm b/contrib/libs/libjpeg-turbo/simd/i386/jfdctflt-sse.asm index 86952c6499c..2cb95335869 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jfdctflt-sse.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jfdctflt-sse.asm @@ -2,17 +2,13 @@ ; jfdctflt.asm - floating-point FDCT (SSE) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. 
; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a floating-point implementation of the forward DCT ; (Discrete Cosine Transform). The following code is based directly on @@ -34,7 +30,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_float_sse) EXTN(jconst_fdct_float_sse): @@ -44,7 +40,7 @@ PD_0_707 times 4 dd 0.707106781186547524400844 PD_0_541 times 4 dd 0.541196100146196984399723 PD_1_306 times 4 dd 1.306562964876376527856643 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -74,19 +70,19 @@ EXTN(jsimd_fdct_float_sse): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic ebx + PUSHPIC ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved ; push esi ; unused ; push edi ; unused - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process rows. 
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov ecx, DCTSIZE/4 - alignx 16, 7 + ALIGNX 16, 7 .rowloop: movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] @@ -222,7 +218,7 @@ EXTN(jsimd_fdct_float_sse): mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov ecx, DCTSIZE/4 - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] @@ -358,7 +354,7 @@ EXTN(jsimd_fdct_float_sse): ; pop esi ; unused ; pop edx ; need not be preserved ; pop ecx ; need not be preserved - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jfdctfst-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jfdctfst-mmx.asm index 80645a50d7e..fe16e83ee24 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jfdctfst-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jfdctfst-mmx.asm @@ -2,17 +2,13 @@ ; jfdctfst.asm - fast integer FDCT (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a fast, not so accurate integer implementation of ; the forward DCT (Discrete Cosine Transform). 
The following code is @@ -49,7 +45,7 @@ F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_ifast_mmx) EXTN(jconst_fdct_ifast_mmx): @@ -59,7 +55,7 @@ PW_F0382 times 4 dw F_0_382 << CONST_SHIFT PW_F0541 times 4 dw F_0_541 << CONST_SHIFT PW_F1306 times 4 dw F_1_306 << CONST_SHIFT - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -88,19 +84,19 @@ EXTN(jsimd_fdct_ifast_mmx): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic ebx + PUSHPIC ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved ; push esi ; unused ; push edi ; unused - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process rows. mov edx, POINTER [data(eax)] ; (DCTELEM *) mov ecx, DCTSIZE/4 - alignx 16, 7 + ALIGNX 16, 7 .rowloop: movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] @@ -241,7 +237,7 @@ EXTN(jsimd_fdct_ifast_mmx): mov edx, POINTER [data(eax)] ; (DCTELEM *) mov ecx, DCTSIZE/4 - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] @@ -384,7 +380,7 @@ EXTN(jsimd_fdct_ifast_mmx): ; pop esi ; unused ; pop edx ; need not be preserved ; pop ecx ; need not be preserved - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jfdctfst-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jfdctfst-sse2.asm index 446fa7a68f7..890482e0067 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jfdctfst-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jfdctfst-sse2.asm @@ -2,17 +2,13 @@ ; jfdctfst.asm - fast integer FDCT (SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. 
; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a fast, not so accurate integer implementation of ; the forward DCT (Discrete Cosine Transform). The following code is @@ -49,7 +45,7 @@ F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_ifast_sse2) EXTN(jconst_fdct_ifast_sse2): @@ -59,7 +55,7 @@ PW_F0382 times 8 dw F_0_382 << CONST_SHIFT PW_F0541 times 8 dw F_0_541 << CONST_SHIFT PW_F1306 times 8 dw F_1_306 << CONST_SHIFT - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -89,13 +85,13 @@ EXTN(jsimd_fdct_ifast_sse2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic ebx + PUSHPIC ebx ; push ecx ; unused ; push edx ; need not be preserved ; push esi ; unused ; push edi ; unused - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process rows. 
@@ -392,7 +388,7 @@ EXTN(jsimd_fdct_ifast_sse2): ; pop esi ; unused ; pop edx ; need not be preserved ; pop ecx ; unused - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-avx2.asm index 23cf733135b..05ea8654850 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-avx2.asm @@ -2,17 +2,13 @@ ; jfdctint.asm - accurate integer FDCT (AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a slower but more accurate integer implementation of the ; forward DCT (Discrete Cosine Transform). 
The following code is based @@ -65,7 +61,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; %1-%4: Input/output registers ; %5-%8: Temp registers -%macro dotranspose 8 +%macro DOTRANSPOSE 8 ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) @@ -108,7 +104,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; %5-%8: Temp registers ; %9: Pass (1 or 2) -%macro dodct 9 +%macro DODCT 9 vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7 vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0 vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2 @@ -223,7 +219,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_islow_avx2) EXTN(jconst_fdct_islow_avx2): @@ -242,7 +238,7 @@ PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1) PW_1_NEG1 times 8 dw 1 times 8 dw -1 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -262,13 +258,13 @@ PW_1_NEG1 times 8 dw 1 EXTN(jsimd_fdct_islow_avx2): push ebp mov ebp, esp - pushpic ebx + PUSHPIC ebx ; push ecx ; unused ; push edx ; need not be preserved ; push esi ; unused ; push edi ; unused - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process rows. @@ -292,9 +288,9 @@ EXTN(jsimd_fdct_islow_avx2): ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) - dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 - dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1 + DODCT ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1 ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5 ; ---- Pass 2: process columns. 
@@ -302,9 +298,9 @@ EXTN(jsimd_fdct_islow_avx2): vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7 vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5 - dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 - dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2 + DODCT ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2 ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5 vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1 @@ -322,7 +318,7 @@ EXTN(jsimd_fdct_islow_avx2): ; pop esi ; unused ; pop edx ; need not be preserved ; pop ecx ; unused - poppic ebx + POPPIC ebx pop ebp ret diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-mmx.asm index 34a43b9e5ef..7d4c61cd7d3 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-mmx.asm @@ -2,17 +2,13 @@ ; jfdctint.asm - accurate integer FDCT (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, 2020, D. R. Commander. +; Copyright (C) 2016, 2020, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a slower but more accurate integer implementation of the ; forward DCT (Discrete Cosine Transform). 
The following code is based @@ -63,7 +59,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_islow_mmx) EXTN(jconst_fdct_islow_mmx): @@ -80,7 +76,7 @@ PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1) PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1) PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -109,19 +105,19 @@ EXTN(jsimd_fdct_islow_mmx): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic ebx + PUSHPIC ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved ; push esi ; unused ; push edi ; unused - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process rows. mov edx, POINTER [data(eax)] ; (DCTELEM *) mov ecx, DCTSIZE/4 - alignx 16, 7 + ALIGNX 16, 7 .rowloop: movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] @@ -363,7 +359,7 @@ EXTN(jsimd_fdct_islow_mmx): mov edx, POINTER [data(eax)] ; (DCTELEM *) mov ecx, DCTSIZE/4 - alignx 16, 7 + ALIGNX 16, 7 .columnloop: movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] @@ -609,7 +605,7 @@ EXTN(jsimd_fdct_islow_mmx): ; pop esi ; unused ; pop edx ; need not be preserved ; pop ecx ; need not be preserved - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-sse2.asm index 6f8e18cb9d0..7ed5c9501ac 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jfdctint-sse2.asm @@ -2,17 +2,13 @@ ; jfdctint.asm - accurate integer FDCT (SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, 2020, D. R. Commander. 
+; Copyright (C) 2016, 2020, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a slower but more accurate integer implementation of the ; forward DCT (Discrete Cosine Transform). The following code is based @@ -63,7 +59,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_islow_sse2) EXTN(jconst_fdct_islow_sse2): @@ -80,7 +76,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -110,13 +106,13 @@ EXTN(jsimd_fdct_islow_sse2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic ebx + PUSHPIC ebx ; push ecx ; unused ; push edx ; need not be preserved ; push esi ; unused ; push edi ; unused - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process rows. 
@@ -622,7 +618,7 @@ EXTN(jsimd_fdct_islow_sse2): ; pop esi ; unused ; pop edx ; need not be preserved ; pop ecx ; unused - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-3dn.asm b/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-3dn.asm index 87951910d8e..8612eee3a5f 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-3dn.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-3dn.asm @@ -2,17 +2,13 @@ ; jidctflt.asm - floating-point IDCT (3DNow! & MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a floating-point implementation of the inverse DCT ; (Discrete Cosine Transform). 
The following code is based directly on @@ -24,7 +20,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_float_3dnow) EXTN(jconst_idct_float_3dnow): @@ -36,7 +32,7 @@ PD_2_613 times 2 dd 2.613125929752753055713286 PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -78,7 +74,7 @@ EXTN(jsimd_idct_float_3dnow): push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input, store into work array. @@ -87,21 +83,21 @@ EXTN(jsimd_idct_float_3dnow): mov esi, JCOEFPTR [coef_block(eax)] ; inptr lea edi, [workspace] ; FAST_FLOAT *wsptr mov ecx, DCTSIZE/2 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz short .columnDCT - pushpic ebx ; save GOT address + PUSHPIC ebx ; save GOT address mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] or eax, ebx - poppic ebx ; restore GOT address + POPPIC ebx ; restore GOT address jnz short .columnDCT ; -- AC terms all zero @@ -127,7 +123,7 @@ EXTN(jsimd_idct_float_3dnow): movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1 movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 jmp near .nextcolumn - alignx 16, 7 + ALIGNX 16, 7 %endif .columnDCT: @@ -293,7 +289,7 @@ EXTN(jsimd_idct_float_3dnow): mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov eax, JDIMENSION [output_col(eax)] mov ecx, DCTSIZE/2 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .rowloop: ; -- Even part @@ -420,14 +416,14 @@ 
EXTN(jsimd_idct_float_3dnow): punpckldq mm6, mm4 ; mm6=(00 01 02 03 04 05 06 07) punpckhdq mm7, mm4 ; mm7=(10 11 12 13 14 15 16 17) - pushpic ebx ; save GOT address + PUSHPIC ebx ; save GOT address mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 - poppic ebx ; restore GOT address + POPPIC ebx ; restore GOT address add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr add edi, byte 2*SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-sse.asm b/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-sse.asm index b27ecfdf46a..caf636b5106 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-sse.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-sse.asm @@ -2,17 +2,13 @@ ; jidctflt.asm - floating-point IDCT (SSE & MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a floating-point implementation of the inverse DCT ; (Discrete Cosine Transform). 
The following code is based directly on @@ -23,18 +19,18 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) +%macro UNPCKLPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) shufps %1, %2, 0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) +%macro UNPCKHPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) shufps %1, %2, 0xEE %endmacro ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_float_sse) EXTN(jconst_idct_float_sse): @@ -46,7 +42,7 @@ PD_M2_613 times 4 dd -2.613125929752753055713286 PD_0_125 times 4 dd 0.125 ; 1/8 PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -88,7 +84,7 @@ EXTN(jsimd_idct_float_sse): push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input, store into work array. 
@@ -97,7 +93,7 @@ EXTN(jsimd_idct_float_sse): mov esi, JCOEFPTR [coef_block(eax)] ; inptr lea edi, [workspace] ; FAST_FLOAT *wsptr mov ecx, DCTSIZE/4 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] @@ -149,7 +145,7 @@ EXTN(jsimd_idct_float_sse): movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 jmp near .nextcolumn - alignx 16, 7 + ALIGNX 16, 7 %endif .columnDCT: @@ -325,11 +321,11 @@ EXTN(jsimd_idct_float_sse): unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) movaps xmm3, xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) - unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + UNPCKLPS2 xmm6, xmm7 ; xmm6=(00 10 20 30) + UNPCKHPS2 xmm3, xmm7 ; xmm3=(01 11 21 31) movaps xmm0, xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) - unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + UNPCKLPS2 xmm1, xmm2 ; xmm1=(02 12 22 32) + UNPCKHPS2 xmm0, xmm2 ; xmm0=(03 13 23 33) movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) @@ -340,11 +336,11 @@ EXTN(jsimd_idct_float_sse): movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps xmm6, xmm5 ; transpose coefficients(phase 2) - unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) - unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + UNPCKLPS2 xmm5, xmm7 ; xmm5=(40 50 60 70) + UNPCKHPS2 xmm6, xmm7 ; xmm6=(41 51 61 71) movaps xmm3, xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) - unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + UNPCKLPS2 xmm4, xmm2 ; xmm4=(42 52 62 72) + UNPCKHPS2 xmm3, xmm2 ; xmm3=(43 53 63 73) movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 @@ -372,7 +368,7 @@ EXTN(jsimd_idct_float_sse): mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov eax, JDIMENSION [output_col(eax)] mov ecx, 
DCTSIZE/4 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .rowloop: ; -- Even part @@ -536,7 +532,7 @@ EXTN(jsimd_idct_float_sse): punpckldq mm5, mm6 ; mm5=(20 21 22 23 24 25 26 27) punpckhdq mm4, mm6 ; mm4=(30 31 32 33 34 35 36 37) - pushpic ebx ; save GOT address + PUSHPIC ebx ; save GOT address mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] @@ -547,7 +543,7 @@ EXTN(jsimd_idct_float_sse): movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 - poppic ebx ; restore GOT address + POPPIC ebx ; restore GOT address add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr add edi, byte 4*SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-sse2.asm index c646eaef76e..42703a8efd7 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jidctflt-sse2.asm @@ -2,17 +2,13 @@ ; jidctflt.asm - floating-point IDCT (SSE & SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a floating-point implementation of the inverse DCT ; (Discrete Cosine Transform). 
The following code is based directly on @@ -23,18 +19,18 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) +%macro UNPCKLPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) shufps %1, %2, 0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) +%macro UNPCKHPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) shufps %1, %2, 0xEE %endmacro ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_float_sse2) EXTN(jconst_idct_float_sse2): @@ -46,7 +42,7 @@ PD_M2_613 times 4 dd -2.613125929752753055713286 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -88,7 +84,7 @@ EXTN(jsimd_idct_float_sse2): push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input, store into work array. 
@@ -97,7 +93,7 @@ EXTN(jsimd_idct_float_sse2): mov esi, JCOEFPTR [coef_block(eax)] ; inptr lea edi, [workspace] ; FAST_FLOAT *wsptr mov ecx, DCTSIZE/4 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] @@ -150,7 +146,7 @@ EXTN(jsimd_idct_float_sse2): movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 jmp near .nextcolumn - alignx 16, 7 + ALIGNX 16, 7 %endif .columnDCT: @@ -287,11 +283,11 @@ EXTN(jsimd_idct_float_sse2): unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) movaps xmm3, xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) - unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + UNPCKLPS2 xmm6, xmm7 ; xmm6=(00 10 20 30) + UNPCKHPS2 xmm3, xmm7 ; xmm3=(01 11 21 31) movaps xmm0, xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) - unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + UNPCKLPS2 xmm1, xmm2 ; xmm1=(02 12 22 32) + UNPCKHPS2 xmm0, xmm2 ; xmm0=(03 13 23 33) movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) @@ -302,11 +298,11 @@ EXTN(jsimd_idct_float_sse2): movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps xmm6, xmm5 ; transpose coefficients(phase 2) - unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) - unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + UNPCKLPS2 xmm5, xmm7 ; xmm5=(40 50 60 70) + UNPCKHPS2 xmm6, xmm7 ; xmm6=(41 51 61 71) movaps xmm3, xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) - unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + UNPCKLPS2 xmm4, xmm2 ; xmm4=(42 52 62 72) + UNPCKHPS2 xmm3, xmm2 ; xmm3=(43 53 63 73) movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 @@ -334,7 +330,7 @@ EXTN(jsimd_idct_float_sse2): mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov eax, JDIMENSION [output_col(eax)] mov 
ecx, DCTSIZE/4 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .rowloop: ; -- Even part @@ -464,7 +460,7 @@ EXTN(jsimd_idct_float_sse2): pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pushpic ebx ; save GOT address + PUSHPIC ebx ; save GOT address mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] @@ -475,7 +471,7 @@ EXTN(jsimd_idct_float_sse2): movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 - poppic ebx ; restore GOT address + POPPIC ebx ; restore GOT address add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr add edi, byte 4*SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jidctfst-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jidctfst-mmx.asm index 24622d43693..77d4613d23b 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jidctfst-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jidctfst-mmx.asm @@ -2,17 +2,13 @@ ; jidctfst.asm - fast integer IDCT (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a fast, not so accurate integer implementation of ; the inverse DCT (Discrete Cosine Transform). 
The following code is @@ -56,7 +52,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_ifast_mmx) EXTN(jconst_idct_ifast_mmx): @@ -67,7 +63,7 @@ PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT PW_F1082 times 4 dw F_1_082 << CONST_SHIFT PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -109,7 +105,7 @@ EXTN(jsimd_idct_ifast_mmx): push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input, store into work array. @@ -118,7 +114,7 @@ EXTN(jsimd_idct_ifast_mmx): mov esi, JCOEFPTR [coef_block(eax)] ; inptr lea edi, [workspace] ; JCOEF *wsptr mov ecx, DCTSIZE/4 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] @@ -163,7 +159,7 @@ EXTN(jsimd_idct_ifast_mmx): movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 jmp near .nextcolumn - alignx 16, 7 + ALIGNX 16, 7 %endif .columnDCT: @@ -326,7 +322,7 @@ EXTN(jsimd_idct_ifast_mmx): mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov eax, JDIMENSION [output_col(eax)] mov ecx, DCTSIZE/4 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .rowloop: ; -- Even part @@ -464,7 +460,7 @@ EXTN(jsimd_idct_ifast_mmx): punpckldq mm5, mm4 ; mm5=(20 21 22 23 24 25 26 27) punpckhdq mm1, mm4 ; mm1=(30 31 32 33 34 35 36 37) - pushpic ebx ; save GOT address + PUSHPIC ebx ; save GOT address mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] @@ -475,7 +471,7 @@ EXTN(jsimd_idct_ifast_mmx): movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 - poppic ebx ; restore GOT address + POPPIC ebx ; restore GOT address add esi, byte 
4*SIZEOF_JCOEF ; wsptr add edi, byte 4*SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jidctfst-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jidctfst-sse2.asm index 19704ffa48f..c2fe34ba8c6 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jidctfst-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jidctfst-sse2.asm @@ -2,17 +2,13 @@ ; jidctfst.asm - fast integer IDCT (SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a fast, not so accurate integer implementation of ; the inverse DCT (Discrete Cosine Transform). 
The following code is @@ -56,7 +52,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_ifast_sse2) EXTN(jconst_idct_ifast_sse2): @@ -67,7 +63,7 @@ PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT PW_F1082 times 8 dw F_1_082 << CONST_SHIFT PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -101,13 +97,13 @@ EXTN(jsimd_idct_ifast_sse2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic ebx + PUSHPIC ebx ; push ecx ; unused ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input. @@ -155,7 +151,7 @@ EXTN(jsimd_idct_ifast_sse2): movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 jmp near .column_end - alignx 16, 7 + ALIGNX 16, 7 %endif .columnDCT: @@ -490,7 +486,7 @@ EXTN(jsimd_idct_ifast_sse2): pop esi ; pop edx ; need not be preserved ; pop ecx ; unused - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jidctint-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jidctint-avx2.asm index 199c7df3b69..cb119d3f06d 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jidctint-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jidctint-avx2.asm @@ -2,17 +2,13 @@ ; jidctint.asm - accurate integer IDCT (AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a slower but more accurate integer implementation of the ; inverse DCT (Discrete Cosine Transform). The following code is based @@ -65,7 +61,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; %1-%4: Input/output registers ; %5-%8: Temp registers -%macro dotranspose 8 +%macro DOTRANSPOSE 8 ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71) ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) @@ -118,7 +114,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; %5-%12: Temp registers ; %9: Pass (1 or 2) -%macro dodct 13 +%macro DODCT 13 ; -- Even part ; (Original) @@ -250,7 +246,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_islow_avx2) EXTN(jconst_idct_islow_avx2): @@ -269,7 +265,7 @@ PB_CENTERJSAMP times 32 db CENTERJSAMPLE PW_1_NEG1 times 8 dw 1 times 8 dw -1 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -303,13 +299,13 @@ EXTN(jsimd_idct_islow_avx2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic ebx + PUSHPIC ebx ; push ecx ; unused ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns. 
@@ -353,7 +349,7 @@ EXTN(jsimd_idct_islow_avx2): vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07) jmp near .column_end - alignx 16, 7 + ALIGNX 16, 7 %endif .columnDCT: @@ -371,10 +367,10 @@ EXTN(jsimd_idct_islow_avx2): vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6 vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5 - dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1 + DODCT ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1 ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6 - dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7 .column_end: @@ -395,10 +391,10 @@ EXTN(jsimd_idct_islow_avx2): vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm3=in7_5 vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1 - dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2 + DODCT ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2 ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6 - dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7 vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45 @@ -442,7 +438,7 @@ EXTN(jsimd_idct_islow_avx2): pop esi ; pop edx ; need not be preserved ; pop ecx ; unused - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jidctint-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jidctint-mmx.asm index f15c8d34bcb..c2c17f441b8 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jidctint-mmx.asm +++ 
b/contrib/libs/libjpeg-turbo/simd/i386/jidctint-mmx.asm @@ -2,17 +2,13 @@ ; jidctint.asm - accurate integer IDCT (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, 2020, D. R. Commander. +; Copyright (C) 2016, 2020, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a slower but more accurate integer implementation of the ; inverse DCT (Discrete Cosine Transform). The following code is based @@ -63,7 +59,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_islow_mmx) EXTN(jconst_idct_islow_mmx): @@ -80,7 +76,7 @@ PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1) PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1) PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -122,7 +118,7 @@ EXTN(jsimd_idct_islow_mmx): push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input, store into work array. 
@@ -131,7 +127,7 @@ EXTN(jsimd_idct_islow_mmx): mov esi, JCOEFPTR [coef_block(eax)] ; inptr lea edi, [workspace] ; JCOEF *wsptr mov ecx, DCTSIZE/4 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] @@ -178,7 +174,7 @@ EXTN(jsimd_idct_islow_mmx): movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 jmp near .nextcolumn - alignx 16, 7 + ALIGNX 16, 7 %endif .columnDCT: @@ -513,7 +509,7 @@ EXTN(jsimd_idct_islow_mmx): mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov eax, JDIMENSION [output_col(eax)] mov ecx, DCTSIZE/4 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .rowloop: ; -- Even part @@ -816,7 +812,7 @@ EXTN(jsimd_idct_islow_mmx): punpckldq mm7, mm5 ; mm7=(20 21 22 23 24 25 26 27) punpckhdq mm4, mm5 ; mm4=(30 31 32 33 34 35 36 37) - pushpic ebx ; save GOT address + PUSHPIC ebx ; save GOT address mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] @@ -827,7 +823,7 @@ EXTN(jsimd_idct_islow_mmx): movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 - poppic ebx ; restore GOT address + POPPIC ebx ; restore GOT address add esi, byte 4*SIZEOF_JCOEF ; wsptr add edi, byte 4*SIZEOF_JSAMPROW diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jidctint-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jidctint-sse2.asm index 43e320189b4..70516cadcef 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jidctint-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jidctint-sse2.asm @@ -2,17 +2,13 @@ ; jidctint.asm - accurate integer IDCT (SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, 2020, D. R. Commander. +; Copyright (C) 2016, 2020, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a slower but more accurate integer implementation of the ; inverse DCT (Discrete Cosine Transform). The following code is based @@ -63,7 +59,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_islow_sse2) EXTN(jconst_idct_islow_sse2): @@ -80,7 +76,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -114,13 +110,13 @@ EXTN(jsimd_idct_islow_sse2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic ebx + PUSHPIC ebx ; push ecx ; unused ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input. 
@@ -172,7 +168,7 @@ EXTN(jsimd_idct_islow_sse2): movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 jmp near .column_end - alignx 16, 7 + ALIGNX 16, 7 %endif .columnDCT: @@ -847,7 +843,7 @@ EXTN(jsimd_idct_islow_sse2): pop esi ; pop edx ; need not be preserved ; pop ecx ; unused - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jidctred-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jidctred-mmx.asm index e2307e1cb6c..96cda657133 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jidctred-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jidctred-mmx.asm @@ -2,17 +2,13 @@ ; jidctred.asm - reduced-size IDCT (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains inverse-DCT routines that produce reduced-size ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. 
@@ -69,7 +65,7 @@ F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_red_mmx) EXTN(jconst_idct_red_mmx): @@ -87,7 +83,7 @@ PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2 - 1) PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2 - 1) PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -124,13 +120,13 @@ EXTN(jsimd_idct_4x4_mmx): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [workspace] - pushpic ebx + PUSHPIC ebx ; push ecx ; need not be preserved ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input, store into work array. @@ -139,7 +135,7 @@ EXTN(jsimd_idct_4x4_mmx): mov esi, JCOEFPTR [coef_block(eax)] ; inptr lea edi, [workspace] ; JCOEF *wsptr mov ecx, DCTSIZE/4 ; ctr - alignx 16, 7 + ALIGNX 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_4X4_MMX mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] @@ -181,7 +177,7 @@ EXTN(jsimd_idct_4x4_mmx): movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 jmp near .nextcolumn - alignx 16, 7 + ALIGNX 16, 7 %endif .columnDCT: @@ -479,7 +475,7 @@ EXTN(jsimd_idct_4x4_mmx): pop esi ; pop edx ; need not be preserved ; pop ecx ; need not be preserved - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp @@ -512,7 +508,7 @@ EXTN(jsimd_idct_2x2_mmx): push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input. 
diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jidctred-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jidctred-sse2.asm index 6e56494e975..1fe967db199 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jidctred-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jidctred-sse2.asm @@ -2,17 +2,13 @@ ; jidctred.asm - reduced-size IDCT (SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains inverse-DCT routines that produce reduced-size ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. 
@@ -69,7 +65,7 @@ F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_red_sse2) EXTN(jconst_idct_red_sse2): @@ -87,7 +83,7 @@ PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1) PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -122,13 +118,13 @@ EXTN(jsimd_idct_4x4_sse2): mov [esp], eax mov ebp, esp ; ebp = aligned ebp lea esp, [wk(0)] - pushpic ebx + PUSHPIC ebx ; push ecx ; unused ; push edx ; need not be preserved push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input. @@ -171,7 +167,7 @@ EXTN(jsimd_idct_4x4_sse2): pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) jmp near .column_end - alignx 16, 7 + ALIGNX 16, 7 %endif .columnDCT: @@ -400,7 +396,7 @@ EXTN(jsimd_idct_4x4_sse2): pop esi ; pop edx ; need not be preserved ; pop ecx ; unused - poppic ebx + POPPIC ebx mov esp, ebp ; esp <- aligned ebp pop esp ; esp <- original ebp pop ebp @@ -433,7 +429,7 @@ EXTN(jsimd_idct_2x2_sse2): push esi push edi - get_GOT ebx ; get GOT address + GET_GOT ebx ; get GOT address ; ---- Pass 1: process columns from input. diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jquant-3dn.asm b/contrib/libs/libjpeg-turbo/simd/i386/jquant-3dn.asm index 5cb60caa947..58e0011f70e 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jquant-3dn.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jquant-3dn.asm @@ -2,17 +2,13 @@ ; jquant.asm - sample data conversion and quantization (3DNow! & MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. 
; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" %include "jdct.inc" @@ -52,7 +48,7 @@ EXTN(jsimd_convsamp_float_3dnow): mov eax, JDIMENSION [start_col] mov edi, POINTER [workspace] ; (DCTELEM *) mov ecx, DCTSIZE/2 - alignx 16, 7 + ALIGNX 16, 7 .convloop: mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) @@ -154,7 +150,7 @@ EXTN(jsimd_quantize_float_3dnow): mov edx, POINTER [divisors] mov edi, JCOEFPTR [coef_block] mov eax, DCTSIZE2/16 - alignx 16, 7 + ALIGNX 16, 7 .quantloop: movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jquant-mmx.asm b/contrib/libs/libjpeg-turbo/simd/i386/jquant-mmx.asm index 61305c625de..4eda95ce12f 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jquant-mmx.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jquant-mmx.asm @@ -2,17 +2,13 @@ ; jquant.asm - sample data conversion and quantization (MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" %include "jdct.inc" @@ -52,7 +48,7 @@ EXTN(jsimd_convsamp_mmx): mov eax, JDIMENSION [start_col] mov edi, POINTER [workspace] ; (DCTELEM *) mov ecx, DCTSIZE/4 - alignx 16, 7 + ALIGNX 16, 7 .convloop: mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) @@ -120,8 +116,8 @@ EXTN(jsimd_convsamp_mmx): ; Quantize/descale the coefficients, and store into coef_block ; ; This implementation is based on an algorithm described in -; "How to optimize for the Pentium family of microprocessors" -; (http://www.agner.org/assem/). +; "Optimizing subroutines in assembly language: +; An optimization guide for x86 platforms" (https://agner.org/optimize). 
; ; GLOBAL(void) ; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors, @@ -157,10 +153,10 @@ EXTN(jsimd_quantize_mmx): mov edx, POINTER [divisors] mov edi, JCOEFPTR [coef_block] mov ah, 2 - alignx 16, 7 + ALIGNX 16, 7 .quantloop1: mov al, DCTSIZE2/8/2 - alignx 16, 7 + ALIGNX 16, 7 .quantloop2: movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)] diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jquant-sse.asm b/contrib/libs/libjpeg-turbo/simd/i386/jquant-sse.asm index 218adc976f3..6cb5f79c215 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jquant-sse.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jquant-sse.asm @@ -2,17 +2,13 @@ ; jquant.asm - sample data conversion and quantization (SSE & MMX) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" %include "jdct.inc" @@ -52,7 +48,7 @@ EXTN(jsimd_convsamp_float_sse): mov eax, JDIMENSION [start_col] mov edi, POINTER [workspace] ; (DCTELEM *) mov ecx, DCTSIZE/2 - alignx 16, 7 + ALIGNX 16, 7 .convloop: mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) @@ -150,7 +146,7 @@ EXTN(jsimd_quantize_float_sse): mov edx, POINTER [divisors] mov edi, JCOEFPTR [coef_block] mov eax, DCTSIZE2/16 - alignx 16, 7 + ALIGNX 16, 7 .quantloop: movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jquantf-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jquantf-sse2.asm index a881ab50f92..5668f8cb396 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jquantf-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jquantf-sse2.asm @@ -2,17 +2,13 @@ ; jquantf.asm - sample data conversion and quantization (SSE & SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" %include "jdct.inc" @@ -52,7 +48,7 @@ EXTN(jsimd_convsamp_float_sse2): mov eax, JDIMENSION [start_col] mov edi, POINTER [workspace] ; (DCTELEM *) mov ecx, DCTSIZE/2 - alignx 16, 7 + ALIGNX 16, 7 .convloop: mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) @@ -127,7 +123,7 @@ EXTN(jsimd_quantize_float_sse2): mov edx, POINTER [divisors] mov edi, JCOEFPTR [coef_block] mov eax, DCTSIZE2/16 - alignx 16, 7 + ALIGNX 16, 7 .quantloop: movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jquanti-avx2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jquanti-avx2.asm index 5ed6bec246c..60ae098e9c4 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jquanti-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jquanti-avx2.asm @@ -2,18 +2,14 @@ ; jquanti.asm - sample data conversion and quantization (AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, 2018, D. R. Commander. +; Copyright (C) 2016, 2018, 2024, D. R. Commander. ; Copyright (C) 2016, Matthieu Darbois. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" %include "jdct.inc" @@ -107,8 +103,8 @@ EXTN(jsimd_convsamp_avx2): ; Quantize/descale the coefficients, and store into coef_block ; ; This implementation is based on an algorithm described in -; "How to optimize for the Pentium family of microprocessors" -; (http://www.agner.org/assem/). +; "Optimizing subroutines in assembly language: +; An optimization guide for x86 platforms" (https://agner.org/optimize). ; ; GLOBAL(void) ; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors, diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jquanti-sse2.asm b/contrib/libs/libjpeg-turbo/simd/i386/jquanti-sse2.asm index 0a509408aa1..c1edde996e9 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jquanti-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jquanti-sse2.asm @@ -2,17 +2,13 @@ ; jquanti.asm - sample data conversion and quantization (SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" %include "jdct.inc" @@ -52,7 +48,7 @@ EXTN(jsimd_convsamp_sse2): mov eax, JDIMENSION [start_col] mov edi, POINTER [workspace] ; (DCTELEM *) mov ecx, DCTSIZE/4 - alignx 16, 7 + ALIGNX 16, 7 .convloop: mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) @@ -98,8 +94,8 @@ EXTN(jsimd_convsamp_sse2): ; Quantize/descale the coefficients, and store into coef_block ; ; This implementation is based on an algorithm described in -; "How to optimize for the Pentium family of microprocessors" -; (http://www.agner.org/assem/). +; "Optimizing subroutines in assembly language: +; An optimization guide for x86 platforms" (https://agner.org/optimize). ; ; GLOBAL(void) ; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors, @@ -133,7 +129,7 @@ EXTN(jsimd_quantize_sse2): mov edx, POINTER [divisors] mov edi, JCOEFPTR [coef_block] mov eax, DCTSIZE2/32 - alignx 16, 7 + ALIGNX 16, 7 .quantloop: movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jsimd.c b/contrib/libs/libjpeg-turbo/simd/i386/jsimd.c index 80bc821ff4e..d4786b155b7 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jsimd.c +++ b/contrib/libs/libjpeg-turbo/simd/i386/jsimd.c @@ -2,8 +2,8 @@ * jsimd_i386.c * * Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB - * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022-2024, D. R. Commander. + * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois. * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -15,13 +15,12 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" -#include "jconfigint.h" /* * In the PIC cases, we have no guarantee that constants will keep @@ -32,13 +31,11 @@ #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ #define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */ -static unsigned int simd_support = (unsigned int)(~0); -static unsigned int simd_huffman = 1; +static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0); +static THREAD_LOCAL unsigned int simd_huffman = 1; /* * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. */ LOCAL(void) init_simd(void) @@ -161,6 +158,9 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->in_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_extrgb_ycc_convert_avx2; @@ -220,6 +220,9 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->in_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_extrgb_gray_convert_avx2; @@ -279,6 +282,9 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + if (simd_support == ~0U) + init_simd(); + switch 
(cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_ycc_extrgb_convert_avx2; @@ -382,6 +388,9 @@ GLOBAL(void) jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, compptr->v_samp_factor, @@ -402,6 +411,9 @@ GLOBAL(void) jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, compptr->v_samp_factor, @@ -464,6 +476,9 @@ GLOBAL(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, input_data, output_data_ptr); @@ -479,6 +494,9 @@ GLOBAL(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, input_data, output_data_ptr); @@ -540,6 +558,9 @@ GLOBAL(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor, compptr->downsampled_width, input_data, @@ -558,6 +579,9 @@ GLOBAL(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) 
jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor, compptr->downsampled_width, input_data, @@ -626,6 +650,9 @@ jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2; @@ -684,6 +711,9 @@ jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2; @@ -788,6 +818,9 @@ GLOBAL(void) jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_convsamp_avx2(sample_data, start_col, workspace); else if (simd_support & JSIMD_SSE2) @@ -800,6 +833,9 @@ GLOBAL(void) jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_SSE2) jsimd_convsamp_float_sse2(sample_data, start_col, workspace); else if (simd_support & JSIMD_SSE) @@ -870,6 +906,9 @@ jsimd_can_fdct_float(void) GLOBAL(void) jsimd_fdct_islow(DCTELEM *data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_fdct_islow_avx2(data); else if (simd_support & JSIMD_SSE2) @@ -881,6 +920,9 @@ jsimd_fdct_islow(DCTELEM *data) GLOBAL(void) jsimd_fdct_ifast(DCTELEM *data) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) jsimd_fdct_ifast_sse2(data); else @@ -890,6 +932,9 @@ jsimd_fdct_ifast(DCTELEM *data) GLOBAL(void) jsimd_fdct_float(FAST_FLOAT 
*data) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) jsimd_fdct_float_sse(data); else if (simd_support & JSIMD_3DNOW) @@ -945,6 +990,9 @@ jsimd_can_quantize_float(void) GLOBAL(void) jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_quantize_avx2(coef_block, divisors, workspace); else if (simd_support & JSIMD_SSE2) @@ -957,6 +1005,9 @@ GLOBAL(void) jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_SSE2) jsimd_quantize_float_sse2(coef_block, divisors, workspace); else if (simd_support & JSIMD_SSE) @@ -1020,6 +1071,9 @@ jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1032,6 +1086,9 @@ jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1126,6 +1183,9 @@ jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1142,6 +1202,9 @@ jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == 
~0U) + init_simd(); + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1155,6 +1218,9 @@ jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1212,7 +1278,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void) GLOBAL(void) jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) + int Al, UJCOEF *values, size_t *zerobits) { jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start, Sl, Al, values, zerobits); @@ -1238,7 +1304,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void) GLOBAL(int) jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) + int Al, UJCOEF *absvalues, size_t *bits) { return jsimd_encode_mcu_AC_refine_prepare_sse2(block, jpeg_natural_order_start, diff --git a/contrib/libs/libjpeg-turbo/simd/i386/jsimdcpu.asm b/contrib/libs/libjpeg-turbo/simd/i386/jsimdcpu.asm index ddcafa9e213..df80f17f5fa 100644 --- a/contrib/libs/libjpeg-turbo/simd/i386/jsimdcpu.asm +++ b/contrib/libs/libjpeg-turbo/simd/i386/jsimdcpu.asm @@ -8,11 +8,7 @@ ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" diff --git a/contrib/libs/libjpeg-turbo/simd/jsimd.h b/contrib/libs/libjpeg-turbo/simd/jsimd.h index 64747c6360c..a28754adb9d 100644 --- a/contrib/libs/libjpeg-turbo/simd/jsimd.h +++ b/contrib/libs/libjpeg-turbo/simd/jsimd.h @@ -2,10 +2,10 @@ * simd/jsimd.h * * Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB - * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander. + * Copyright (C) 2011, 2014-2016, 2018, 2020, 2022, D. R. Commander. * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. * Copyright (C) 2014, Linaro Limited. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois. * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. * Copyright (C) 2020, Arm Limited. * @@ -1243,16 +1243,16 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl /* Progressive Huffman encoding */ EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2 (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *values, size_t *zerobits); + UJCOEF *values, size_t *zerobits); EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *values, size_t *zerobits); + UJCOEF *values, size_t *zerobits); EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2 (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *absvalues, size_t *bits); + UJCOEF *absvalues, size_t *bits); EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *absvalues, size_t *bits); + UJCOEF *absvalues, size_t *bits); diff --git a/contrib/libs/libjpeg-turbo/simd/nasm/jsimdcfg.inc.h 
b/contrib/libs/libjpeg-turbo/simd/nasm/jsimdcfg.inc.h index bf2a45ad50c..ed3f9c2a693 100644 --- a/contrib/libs/libjpeg-turbo/simd/nasm/jsimdcfg.inc.h +++ b/contrib/libs/libjpeg-turbo/simd/nasm/jsimdcfg.inc.h @@ -12,9 +12,9 @@ #define JPEG_INTERNALS -#include "../jpeglib.h" +#include "../src/jpeglib.h" #include "../jconfig.h" -#include "../jmorecfg.h" +#include "../src/jmorecfg.h" #include "jsimd.h" ; diff --git a/contrib/libs/libjpeg-turbo/simd/nasm/jsimdext.inc b/contrib/libs/libjpeg-turbo/simd/nasm/jsimdext.inc index d8a50ed8e23..674dfb6464c 100644 --- a/contrib/libs/libjpeg-turbo/simd/nasm/jsimdext.inc +++ b/contrib/libs/libjpeg-turbo/simd/nasm/jsimdext.inc @@ -2,9 +2,10 @@ ; jsimdext.inc - common declarations ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander. +; Copyright (C) 2010, 2016, 2018-2019, 2024, D. R. Commander. ; Copyright (C) 2018, Matthieu Darbois. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02 ; @@ -75,6 +76,14 @@ ; mark stack as non-executable section .note.GNU-stack noalloc noexec nowrite progbits +%ifdef __CET__ +%ifdef __x86_64__ +section .note.gnu.property note alloc noexec align=8 + dd 0x00000004, 0x00000010, 0x00000005, 0x00554e47 + dd 0xc0000002, 0x00000004, 0x00000003, 0x00000000 +%endif +%endif + ; -- segment definition -- ; %ifdef _x86_64_ @@ -271,7 +280,7 @@ const_base: %define GOTOFF(got, sym) (got) + (sym) - const_base -%imacro get_GOT 1 +%imacro GET_GOT 1 ; NOTE: this macro destroys ecx resister. 
call %%geteip add ecx, byte (%%ref - $) @@ -303,7 +312,7 @@ const_base: %define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff -%imacro get_GOT 1 +%imacro GET_GOT 1 extern GOT_SYMBOL call %%geteip add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc @@ -316,13 +325,13 @@ const_base: %endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- -%imacro pushpic 1.nolist +%imacro PUSHPIC 1.nolist push %1 %endmacro -%imacro poppic 1.nolist +%imacro POPPIC 1.nolist pop %1 %endmacro -%imacro movpic 2.nolist +%imacro MOVPIC 2.nolist mov %1, %2 %endmacro @@ -330,13 +339,13 @@ const_base: %define GOTOFF(got, sym) (sym) -%imacro get_GOT 1.nolist +%imacro GET_GOT 1.nolist %endmacro -%imacro pushpic 1.nolist +%imacro PUSHPIC 1.nolist %endmacro -%imacro poppic 1.nolist +%imacro POPPIC 1.nolist %endmacro -%imacro movpic 2.nolist +%imacro MOVPIC 2.nolist %endmacro %endif ; PIC ----------------------------------------- @@ -348,7 +357,7 @@ const_base: %define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16) %define FILLB(b, n) (($$-(b)) & ((n)-1)) -%imacro alignx 1-2.nolist 0xFFFF +%imacro ALIGNX 1-2.nolist 0xFFFF %%bs: \ times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \ db 0x90 ; nop @@ -370,7 +379,7 @@ const_base: ; Align the next data on {2,4,8,16,..}-byte boundary. 
; -%imacro alignz 1.nolist +%imacro ALIGNZ 1.nolist align %1, db 0 ; filling zeros %endmacro @@ -378,7 +387,7 @@ const_base: %ifdef WIN64 -%imacro collect_args 1 +%imacro COLLECT_ARGS 1 sub rsp, SIZEOF_XMMWORD movaps XMMWORD [rsp], xmm6 sub rsp, SIZEOF_XMMWORD @@ -397,17 +406,17 @@ const_base: %endif %if %1 > 4 push r14 - mov r14, [rax+48] + mov r14, [rbp+48] %endif %if %1 > 5 push r15 - mov r15, [rax+56] + mov r15, [rbp+56] %endif push rsi push rdi %endmacro -%imacro uncollect_args 1 +%imacro UNCOLLECT_ARGS 1 pop rdi pop rsi %if %1 > 5 @@ -428,7 +437,7 @@ const_base: add rsp, SIZEOF_XMMWORD %endmacro -%imacro push_xmm 1 +%imacro PUSH_XMM 1 sub rsp, %1 * SIZEOF_XMMWORD movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8 %if %1 > 1 @@ -442,7 +451,7 @@ const_base: %endif %endmacro -%imacro pop_xmm 1 +%imacro POP_XMM 1 movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD] %if %1 > 1 movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD] @@ -458,7 +467,7 @@ const_base: %else -%imacro collect_args 1 +%imacro COLLECT_ARGS 1 push r10 mov r10, rdi %if %1 > 1 @@ -483,7 +492,7 @@ const_base: %endif %endmacro -%imacro uncollect_args 1 +%imacro UNCOLLECT_ARGS 1 %if %1 > 5 pop r15 %endif @@ -502,16 +511,29 @@ const_base: pop r10 %endmacro -%imacro push_xmm 1 +%imacro PUSH_XMM 1 %endmacro -%imacro pop_xmm 1 +%imacro POP_XMM 1 %endmacro %endif %endif +%ifdef __CET__ + +%imacro ENDBR64 0 + dd 0xfa1e0ff3 +%endmacro + +%else + +%imacro ENDBR64 0 +%endmacro + +%endif + ; -------------------------------------------------------------------------- ; Defines picked up from the C headers ; diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jccolext-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jccolext-avx2.asm index ffb527db00e..aeeda0a682f 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jccolext-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jccolext-avx2.asm @@ -1,19 +1,16 @@ ; ; jccolext.asm - colorspace conversion (64-bit AVX2) ; -; Copyright (C) 2009, 2016, D. R. Commander. 
+; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jcolsamp.inc" @@ -33,21 +30,22 @@ ; r13d = JDIMENSION output_row ; r14d = int num_rows -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] %define WK_NUM 8 align 32 GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2) EXTN(jsimd_rgb_ycc_convert_avx2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 5 + ; Allocate stack space for wk array. r15 is used to access it. 
+ mov r15, rsp + sub rsp, (SIZEOF_YMMWORD * WK_NUM) + COLLECT_ARGS 5 push rbx mov ecx, r10d @@ -548,9 +546,9 @@ EXTN(jsimd_rgb_ycc_convert_avx2): .return: pop rbx vzeroupper - uncollect_args 5 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 5 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jccolext-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jccolext-sse2.asm index af70ed6010f..f3a1244903b 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jccolext-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jccolext-sse2.asm @@ -1,18 +1,15 @@ ; ; jccolext.asm - colorspace conversion (64-bit SSE2) ; -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -32,21 +29,22 @@ ; r13d = JDIMENSION output_row ; r14d = int num_rows -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 8 align 32 GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2) EXTN(jsimd_rgb_ycc_convert_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 5 + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp + sub rsp, (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 5 push rbx mov ecx, r10d @@ -473,9 +471,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2): .return: pop rbx - uncollect_args 5 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 5 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jccolor-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jccolor-avx2.asm index 16b78298dc4..e2628917336 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jccolor-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jccolor-avx2.asm @@ -1,18 +1,14 @@ ; ; jccolor.asm - colorspace conversion (64-bit AVX2) ; -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" @@ -33,7 +29,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_rgb_ycc_convert_avx2) EXTN(jconst_rgb_ycc_convert_avx2): @@ -46,7 +42,7 @@ PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \ (CENTERJSAMPLE << SCALEBITS) PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jccolor-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jccolor-sse2.asm index e2955c21340..cc9edb4cebc 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jccolor-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jccolor-sse2.asm @@ -1,17 +1,13 @@ ; ; jccolor.asm - colorspace conversion (64-bit SSE2) ; -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -32,7 +28,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_rgb_ycc_convert_sse2) EXTN(jconst_rgb_ycc_convert_sse2): @@ -45,7 +41,7 @@ PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \ (CENTERJSAMPLE << SCALEBITS) PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jcgray-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jcgray-avx2.asm index 591255bb112..267ec5142a4 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jcgray-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jcgray-avx2.asm @@ -1,18 +1,14 @@ ; ; jcgray.asm - grayscale colorspace conversion (64-bit AVX2) ; -; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2011, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -29,7 +25,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_rgb_gray_convert_avx2) EXTN(jconst_rgb_gray_convert_avx2): @@ -38,7 +34,7 @@ PW_F0299_F0337 times 8 dw F_0_299, F_0_337 PW_F0114_F0250 times 8 dw F_0_114, F_0_250 PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jcgray-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jcgray-sse2.asm index e389904f2f8..4b94d7b8a28 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jcgray-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jcgray-sse2.asm @@ -1,17 +1,13 @@ ; ; jcgray.asm - grayscale colorspace conversion (64-bit SSE2) ; -; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2011, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -28,7 +24,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_rgb_gray_convert_sse2) EXTN(jconst_rgb_gray_convert_sse2): @@ -37,7 +33,7 @@ PW_F0299_F0337 times 4 dw F_0_299, F_0_337 PW_F0114_F0250 times 4 dw F_0_114, F_0_250 PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jcgryext-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jcgryext-avx2.asm index ddcc2c0a2fe..77e85f768f9 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jcgryext-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jcgryext-avx2.asm @@ -1,19 +1,16 @@ ; ; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2) ; -; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2011, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -33,21 +30,22 @@ ; r13d = JDIMENSION output_row ; r14d = int num_rows -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] %define WK_NUM 2 align 32 GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2) EXTN(jsimd_rgb_gray_convert_avx2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 5 + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp + sub rsp, byte (SIZEOF_YMMWORD * WK_NUM) + COLLECT_ARGS 5 push rbx mov ecx, r10d @@ -427,9 +425,9 @@ EXTN(jsimd_rgb_gray_convert_avx2): .return: pop rbx vzeroupper - uncollect_args 5 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 5 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jcgryext-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jcgryext-sse2.asm index f1d399a63b8..3e8087c39bc 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jcgryext-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jcgryext-sse2.asm @@ -1,18 +1,15 @@ ; ; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2) ; -; Copyright (C) 2011, 2016, D. R. Commander. +; Copyright (C) 2011, 2016, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jcolsamp.inc" @@ -32,21 +29,22 @@ ; r13d = JDIMENSION output_row ; r14d = int num_rows -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 align 32 GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2) EXTN(jsimd_rgb_gray_convert_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 5 + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp + sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 5 push rbx mov ecx, r10d @@ -352,9 +350,9 @@ EXTN(jsimd_rgb_gray_convert_sse2): .return: pop rbx - uncollect_args 5 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 5 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jchuff-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jchuff-sse2.asm index 9ea6df946ef..b18b7f5d651 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jchuff-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jchuff-sse2.asm @@ -1,19 +1,16 @@ ; ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2) ; -; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander. +; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023-2024, D. R. Commander. ; Copyright (C) 2015, Matthieu Darbois. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains an SSE2 implementation for Huffman coding of one block. ; The following code is based on jchuff.c; see jchuff.c for more details. @@ -38,7 +35,7 @@ endstruc ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_huff_encode_one_block) EXTN(jconst_huff_encode_one_block): @@ -48,7 +45,7 @@ jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007 dd 0x00ff, 0x01ff, 0x03ff, 0x07ff dd 0x0fff, 0x1fff, 0x3fff, 0x7fff - alignz 32 + ALIGNZ 32 times 1 << 14 db 15 times 1 << 13 db 14 @@ -66,7 +63,8 @@ times 1 << 2 db 3 times 1 << 1 db 2 times 1 << 0 db 1 times 1 db 0 -jpeg_nbits_table: +GLOBAL_DATA(jpeg_nbits_table) +EXTN(jpeg_nbits_table): times 1 db 0 times 1 << 0 db 1 times 1 << 1 db 2 @@ -85,10 +83,10 @@ times 1 << 13 db 14 times 1 << 14 db 15 times 1 << 15 db 16 - alignz 32 + ALIGNZ 32 %define NBITS(x) nbits_base + x -%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table) +%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - EXTN(jpeg_nbits_table)) ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -208,15 +206,15 @@ times 1 << 15 db 16 ; rax - buffer ; rbx - temp ; rcx - nbits -; rdx - block --> free_bits +; rdx - code ; rsi - nbits_base ; rdi - t -; rbp - code ; r8 - dctbl --> code_temp ; r9 - actbl ; r10 - state ; r11 - index ; r12 - put_buffer +; r15 - block --> free_bits %define buffer rax %ifdef WIN64 @@ -231,12 +229,11 @@ times 1 << 15 db 16 %define 
nbitsq rcx %define nbits ecx %define nbitsb cl -%define block rdx +%define codeq rdx +%define code edx %define nbits_base rsi %define t rdi %define td edi -%define codeq rbp -%define code ebp %define dctbl r8 %define actbl r9 %define state r10 @@ -244,6 +241,7 @@ times 1 << 15 db 16 %define indexd r11d %define put_buffer r12 %define put_bufferd r12d +%define block r15 ; Step 1: Re-arrange input data according to jpeg_natural_order ; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10 @@ -259,6 +257,9 @@ times 1 << 15 db 16 GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2) EXTN(jsimd_huff_encode_one_block_sse2): + ENDBR64 + push rbp + mov rbp, rsp %ifdef WIN64 @@ -266,15 +267,15 @@ EXTN(jsimd_huff_encode_one_block_sse2): ; rdx = JOCTET *buffer ; r8 = JCOEFPTR block ; r9 = int last_dc_val -; [rax+48] = c_derived_tbl *dctbl -; [rax+56] = c_derived_tbl *actbl +; [rbp+48] = c_derived_tbl *dctbl +; [rbp+56] = c_derived_tbl *actbl ;X: X = code stream mov buffer, rdx + push r15 mov block, r8 movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 push rbx - push rbp movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 push rsi push rdi @@ -284,12 +285,10 @@ EXTN(jsimd_huff_encode_one_block_sse2): movsx code, word [block] ;Z: code = block[0]; pxor xmm4, xmm4 ;A: w4[i] = 0; sub code, r9d ;Z: code -= last_dc_val; - mov dctbl, POINTER [rsp+6*8+4*8] - mov actbl, POINTER [rsp+6*8+5*8] + mov dctbl, POINTER [rbp+48] + mov actbl, POINTER [rbp+56] punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 - lea nbits_base, [rel jpeg_nbits_table] - add rsp, -DCTSIZE2 * SIZEOF_WORD - mov t, rsp + lea nbits_base, [rel EXTN(jpeg_nbits_table)] %else @@ -301,23 +300,27 @@ EXTN(jsimd_huff_encode_one_block_sse2): ; r9 = c_derived_tbl *actbl ;X: X = code stream + push r15 + mov block, rdx movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 push rbx - push rbp movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 push r12 mov state, rdi mov buffer, rsi 
movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15 movsx codeq, word [block] ;Z: code = block[0]; - lea nbits_base, [rel jpeg_nbits_table] + lea nbits_base, [rel EXTN(jpeg_nbits_table)] pxor xmm4, xmm4 ;A: w4[i] = 0; sub codeq, rcx ;Z: code -= last_dc_val; punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 - lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_ %endif + ; Allocate stack space for t array, and realign stack. + add rsp, -DCTSIZE2 * SIZEOF_WORD - 8 + mov t, rsp + pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11 pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11 punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15 @@ -443,9 +446,9 @@ EXTN(jsimd_huff_encode_one_block_sse2): pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29 ; (Row 4, offset 1) %undef block -%define free_bitsq rdx -%define free_bitsd edx -%define free_bitsb dl +%define free_bitsq r15 +%define free_bitsd r15d +%define free_bitsb r15b pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? 
-1 : 0); shl tempq, 48 ;Z: temp <<= 48; pxor xmm2, xmm2 ;E: w2[i] = 0; @@ -534,12 +537,8 @@ EXTN(jsimd_huff_encode_one_block_sse2): test index, index jnz .BLOOP ; } while (index != 0); .ELOOP: ; } /* index != 0 */ - sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]); -%ifdef WIN64 + sub td, esp ; t -= &t_[0]; cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62) -%else - cmp td, -2 * SIZEOF_WORD ; if (t != -2) -%endif je .EFN ; { movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0] ; nbits = actbl->ehufsi[0]; @@ -556,18 +555,17 @@ EXTN(jsimd_huff_encode_one_block_sse2): ; state->cur.put_buffer.simd = put_buffer; mov byte [state + working_state.cur.free_bits], free_bitsb ; state->cur.free_bits = free_bits; -%ifdef WIN64 - sub rsp, -DCTSIZE2 * SIZEOF_WORD + sub rsp, -DCTSIZE2 * SIZEOF_WORD - 8 pop r12 +%ifdef WIN64 pop rdi pop rsi - pop rbp pop rbx %else - pop r12 - pop rbp pop rbx %endif + pop r15 + pop rbp ret ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jcphuff-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jcphuff-sse2.asm index 01b5c0235fa..c9ac59f2f1c 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jcphuff-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jcphuff-sse2.asm @@ -3,16 +3,14 @@ ; (64-bit SSE2) ; ; Copyright (C) 2016, 2018, Matthieu Darbois +; Copyright (C) 2023, Aliaksiej Kandracienka. +; Copyright (C) 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
; ; This file contains an SSE2 implementation of data preparation for progressive ; Huffman encoding. See jcphuff.c for more details. @@ -281,16 +279,13 @@ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2) EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [rbp - 16] - collect_args 6 - - movdqa XMMWORD [rbp - 16], ZERO + sub rsp, SIZEOF_XMMWORD + movdqa XMMWORD [rsp], ZERO + COLLECT_ARGS 6 movd AL, r13d pxor ZERO, ZERO @@ -384,10 +379,9 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): REDUCE0 - movdqa ZERO, XMMWORD [rbp - 16] - uncollect_args 6 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 6 + movdqa ZERO, XMMWORD [rsp] + mov rsp, rbp pop rbp ret @@ -449,16 +443,13 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2) EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [rbp - 16] - collect_args 6 - - movdqa XMMWORD [rbp - 16], ZERO + sub rsp, SIZEOF_XMMWORD + movdqa XMMWORD [rsp], ZERO + COLLECT_ARGS 6 xor SIGN, SIGN xor EOB, EOB @@ -606,10 +597,9 @@ EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2): REDUCE0 mov eax, EOB - movdqa ZERO, XMMWORD [rbp - 16] - uncollect_args 6 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 6 + movdqa ZERO, XMMWORD [rsp] + mov rsp, rbp pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jcsample-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jcsample-avx2.asm index b32527aebea..53afc7d77fa 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jcsample-avx2.asm +++ 
b/contrib/libs/libjpeg-turbo/simd/x86_64/jcsample-avx2.asm @@ -2,7 +2,7 @@ ; jcsample.asm - downsampling (64-bit AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2018, Matthias Räncker. ; @@ -10,11 +10,7 @@ ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" @@ -44,10 +40,10 @@ GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2) EXTN(jsimd_h2v1_downsample_avx2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 6 + COLLECT_ARGS 6 mov ecx, r13d shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) @@ -178,7 +174,7 @@ EXTN(jsimd_h2v1_downsample_avx2): .return: vzeroupper - uncollect_args 6 + UNCOLLECT_ARGS 6 pop rbp ret @@ -206,10 +202,10 @@ EXTN(jsimd_h2v1_downsample_avx2): GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2) EXTN(jsimd_h2v2_downsample_avx2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 6 + COLLECT_ARGS 6 mov ecx, r13d shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) @@ -358,7 +354,7 @@ EXTN(jsimd_h2v2_downsample_avx2): .return: vzeroupper - uncollect_args 6 + UNCOLLECT_ARGS 6 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jcsample-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jcsample-sse2.asm index 2fcfe4567ab..d7ffa930e82 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jcsample-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jcsample-sse2.asm @@ -2,18 +2,14 @@ ; 
jcsample.asm - downsampling (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" @@ -43,10 +39,10 @@ GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2) EXTN(jsimd_h2v1_downsample_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 6 + COLLECT_ARGS 6 mov ecx, r13d shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) @@ -160,7 +156,7 @@ EXTN(jsimd_h2v1_downsample_sse2): jg near .rowloop .return: - uncollect_args 6 + UNCOLLECT_ARGS 6 pop rbp ret @@ -188,10 +184,10 @@ EXTN(jsimd_h2v1_downsample_sse2): GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2) EXTN(jsimd_h2v2_downsample_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 6 + COLLECT_ARGS 6 mov ecx, r13d shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) @@ -321,7 +317,7 @@ EXTN(jsimd_h2v2_downsample_sse2): jg near .rowloop .return: - uncollect_args 6 + UNCOLLECT_ARGS 6 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolext-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolext-avx2.asm index 2370fda6424..7b8a084398d 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolext-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolext-avx2.asm @@ -2,19 +2,16 @@ ; jdcolext.asm - colorspace conversion (64-bit AVX2) ; ; Copyright 2009, 2012 
Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jcolsamp.inc" @@ -34,21 +31,22 @@ ; r13 = JSAMPARRAY output_buf ; r14d = int num_rows -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] %define WK_NUM 2 align 32 GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2) EXTN(jsimd_ycc_rgb_convert_avx2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 5 + ; Allocate stack space for wk array. r15 is used to access it. 
+ mov r15, rsp + sub rsp, byte (WK_NUM * SIZEOF_YMMWORD) + COLLECT_ARGS 5 push rbx mov ecx, r10d ; num_cols @@ -485,9 +483,9 @@ EXTN(jsimd_ycc_rgb_convert_avx2): .return: pop rbx vzeroupper - uncollect_args 5 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 5 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolext-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolext-sse2.asm index e07c8d75188..261f74da5d2 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolext-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolext-sse2.asm @@ -2,18 +2,15 @@ ; jdcolext.asm - colorspace conversion (64-bit SSE2) ; ; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -33,21 +30,22 @@ ; r13 = JSAMPARRAY output_buf ; r14d = int num_rows -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 align 32 GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2) EXTN(jsimd_ycc_rgb_convert_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 5 + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp + sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 5 push rbx mov ecx, r10d ; num_cols @@ -428,9 +426,9 @@ EXTN(jsimd_ycc_rgb_convert_sse2): .return: pop rbx - uncollect_args 5 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 5 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolor-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolor-avx2.asm index 43de9db04dc..bd5aa00b95c 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolor-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolor-avx2.asm @@ -2,18 +2,14 @@ ; jdcolor.asm - colorspace conversion (64-bit AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). 
-; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" @@ -32,7 +28,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_ycc_rgb_convert_avx2) EXTN(jconst_ycc_rgb_convert_avx2): @@ -43,7 +39,7 @@ PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 PW_ONE times 16 dw 1 PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolor-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolor-sse2.asm index b3f1fec07eb..40343fe7895 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolor-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jdcolor-sse2.asm @@ -2,17 +2,13 @@ ; jdcolor.asm - colorspace conversion (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -31,7 +27,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_ycc_rgb_convert_sse2) EXTN(jconst_ycc_rgb_convert_sse2): @@ -42,7 +38,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jdmerge-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jdmerge-avx2.asm index 9515a17013d..6a5f1daba56 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jdmerge-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jdmerge-avx2.asm @@ -2,18 +2,14 @@ ; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -32,7 +28,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_merged_upsample_avx2) EXTN(jconst_merged_upsample_avx2): @@ -43,7 +39,7 @@ PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285 PW_ONE times 16 dw 1 PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jdmerge-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jdmerge-sse2.asm index aedccc20f6c..8c269b83d85 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jdmerge-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jdmerge-sse2.asm @@ -2,17 +2,13 @@ ; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" @@ -31,7 +27,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_merged_upsample_sse2) EXTN(jconst_merged_upsample_sse2): @@ -42,7 +38,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 PW_ONE times 8 dw 1 PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jdmrgext-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jdmrgext-avx2.asm index 8b264b4f039..01826fb6abb 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jdmrgext-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jdmrgext-avx2.asm @@ -2,19 +2,16 @@ ; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2) ; ; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jcolsamp.inc" @@ -34,21 +31,22 @@ ; r12d = JDIMENSION in_row_group_ctr ; r13 = JSAMPARRAY output_buf -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] %define WK_NUM 3 align 32 GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2) EXTN(jsimd_h2v1_merged_upsample_avx2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 4 + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp + sub rsp, SIZEOF_YMMWORD * WK_NUM + COLLECT_ARGS 4 push rbx mov ecx, r10d ; col @@ -479,9 +477,9 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): .return: pop rbx vzeroupper - uncollect_args 4 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 4 + lea rsp, [rbp-8] + pop r15 pop rbp ret @@ -505,10 +503,10 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2) EXTN(jsimd_h2v2_merged_upsample_avx2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 4 + COLLECT_ARGS 4 push rbx mov eax, r10d @@ -587,7 +585,7 @@ EXTN(jsimd_h2v2_merged_upsample_avx2): add rsp, SIZEOF_JSAMPARRAY*4 pop rbx - uncollect_args 4 + UNCOLLECT_ARGS 4 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jdmrgext-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jdmrgext-sse2.asm index eb3ab9dbd94..abd22e21a73 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jdmrgext-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jdmrgext-sse2.asm @@ -2,18 +2,15 @@ ; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2) ; ; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander. 
; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jcolsamp.inc" @@ -33,21 +30,22 @@ ; r12d = JDIMENSION in_row_group_ctr ; r13 = JSAMPARRAY output_buf -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 3 align 32 GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2) EXTN(jsimd_h2v1_merged_upsample_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 4 + ; Allocate stack space for wk array. r15 is used to access it. 
+ mov r15, rsp + sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 4 push rbx mov ecx, r10d ; col @@ -421,9 +419,9 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): .return: pop rbx - uncollect_args 4 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 4 + lea rsp, [rbp-8] + pop r15 pop rbp ret @@ -447,10 +445,10 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2) EXTN(jsimd_h2v2_merged_upsample_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 4 + COLLECT_ARGS 4 push rbx mov eax, r10d @@ -529,7 +527,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2): add rsp, SIZEOF_JSAMPARRAY*4 pop rbx - uncollect_args 4 + UNCOLLECT_ARGS 4 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jdsample-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jdsample-avx2.asm index 1e4979f933e..6ae4cf812a7 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jdsample-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jdsample-avx2.asm @@ -2,26 +2,23 @@ ; jdsample.asm - upsampling (64-bit AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fancy_upsample_avx2) EXTN(jconst_fancy_upsample_avx2): @@ -32,7 +29,7 @@ PW_THREE times 16 dw 3 PW_SEVEN times 16 dw 7 PW_EIGHT times 16 dw 8 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -61,11 +58,11 @@ PW_EIGHT times 16 dw 8 GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2) EXTN(jsimd_h2v1_fancy_upsample_avx2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - push_xmm 3 - collect_args 4 + PUSH_XMM 3 + COLLECT_ARGS 4 mov eax, r11d ; colctr test rax, rax @@ -186,8 +183,8 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2): .return: vzeroupper - uncollect_args 4 - pop_xmm 3 + UNCOLLECT_ARGS 4 + POP_XMM 3 pop rbp ret @@ -208,22 +205,23 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2): ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] %define WK_NUM 4 align 32 GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2) EXTN(jsimd_h2v2_fancy_upsample_avx2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - push_xmm 3 - collect_args 4 + mov rbp, rsp + push r15 + and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits + ; Allocate stack space for wk array. r15 is used to access it. 
+ mov r15, rsp + sub rsp, (SIZEOF_YMMWORD * WK_NUM) + PUSH_XMM 3 + COLLECT_ARGS 4 push rbx mov eax, r11d ; colctr @@ -498,10 +496,10 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): .return: pop rbx vzeroupper - uncollect_args 4 - pop_xmm 3 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 4 + POP_XMM 3 + lea rsp, [rbp-8] + pop r15 pop rbp ret @@ -524,10 +522,10 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2) EXTN(jsimd_h2v1_upsample_avx2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 4 + COLLECT_ARGS 4 mov edx, r11d add rdx, byte (SIZEOF_YMMWORD-1) @@ -590,7 +588,7 @@ EXTN(jsimd_h2v1_upsample_avx2): .return: vzeroupper - uncollect_args 4 + UNCOLLECT_ARGS 4 pop rbp ret @@ -613,10 +611,10 @@ EXTN(jsimd_h2v1_upsample_avx2): GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2) EXTN(jsimd_h2v2_upsample_avx2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 4 + COLLECT_ARGS 4 push rbx mov edx, r11d @@ -687,7 +685,7 @@ EXTN(jsimd_h2v2_upsample_avx2): .return: pop rbx vzeroupper - uncollect_args 4 + UNCOLLECT_ARGS 4 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jdsample-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jdsample-sse2.asm index 38dbceec269..54c560fc28e 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jdsample-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jdsample-sse2.asm @@ -2,25 +2,22 @@ ; jdsample.asm - upsampling (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fancy_upsample_sse2) EXTN(jconst_fancy_upsample_sse2): @@ -31,7 +28,7 @@ PW_THREE times 8 dw 3 PW_SEVEN times 8 dw 7 PW_EIGHT times 8 dw 8 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -60,10 +57,10 @@ PW_EIGHT times 8 dw 8 GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2) EXTN(jsimd_h2v1_fancy_upsample_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 4 + COLLECT_ARGS 4 mov eax, r11d ; colctr test rax, rax @@ -174,7 +171,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): jg near .rowloop .return: - uncollect_args 4 + UNCOLLECT_ARGS 4 pop rbp ret @@ -195,21 +192,22 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 4 align 32 GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2) EXTN(jsimd_h2v2_fancy_upsample_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 4 + ; Allocate stack space for wk array. r15 is used to access it. 
+ mov r15, rsp + sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 4 push rbx mov eax, r11d ; colctr @@ -472,9 +470,9 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): .return: pop rbx - uncollect_args 4 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 4 + lea rsp, [rbp-8] + pop r15 pop rbp ret @@ -497,10 +495,10 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2) EXTN(jsimd_h2v1_upsample_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 4 + COLLECT_ARGS 4 mov edx, r11d add rdx, byte (2*SIZEOF_XMMWORD)-1 @@ -561,7 +559,7 @@ EXTN(jsimd_h2v1_upsample_sse2): jg short .rowloop .return: - uncollect_args 4 + UNCOLLECT_ARGS 4 pop rbp ret @@ -584,10 +582,10 @@ EXTN(jsimd_h2v1_upsample_sse2): GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2) EXTN(jsimd_h2v2_upsample_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 4 + COLLECT_ARGS 4 push rbx mov edx, r11d @@ -656,7 +654,7 @@ EXTN(jsimd_h2v2_upsample_sse2): .return: pop rbx - uncollect_args 4 + UNCOLLECT_ARGS 4 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctflt-sse.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctflt-sse.asm index ef2796649bc..58a1f5570d3 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctflt-sse.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctflt-sse.asm @@ -2,17 +2,14 @@ ; jfdctflt.asm - floating-point FDCT (64-bit SSE) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a floating-point implementation of the forward DCT ; (Discrete Cosine Transform). The following code is based directly on @@ -34,7 +31,7 @@ ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_float_sse) EXTN(jconst_fdct_float_sse): @@ -44,7 +41,7 @@ PD_0_707 times 4 dd 0.707106781186547524400844 PD_0_541 times 4 dd 0.541196100146196984399723 PD_1_306 times 4 dd 1.306562964876376527856643 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -58,21 +55,22 @@ PD_1_306 times 4 dd 1.306562964876376527856643 ; r10 = FAST_FLOAT *data -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 align 32 GLOBAL_FUNCTION(jsimd_fdct_float_sse) EXTN(jsimd_fdct_float_sse): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 1 + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp + sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 1 ; ---- Pass 1: process rows. 
@@ -344,9 +342,9 @@ EXTN(jsimd_fdct_float_sse): dec rcx jnz near .columnloop - uncollect_args 1 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 1 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctfst-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctfst-sse2.asm index 2e1bfe6e8c2..3b92d4edaae 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctfst-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctfst-sse2.asm @@ -2,17 +2,14 @@ ; jfdctfst.asm - fast integer FDCT (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a fast, not so accurate integer implementation of ; the forward DCT (Discrete Cosine Transform). 
The following code is @@ -49,7 +46,7 @@ F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_ifast_sse2) EXTN(jconst_fdct_ifast_sse2): @@ -59,7 +56,7 @@ PW_F0382 times 8 dw F_0_382 << CONST_SHIFT PW_F0541 times 8 dw F_0_541 << CONST_SHIFT PW_F1306 times 8 dw F_1_306 << CONST_SHIFT - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -73,21 +70,22 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT ; r10 = DCTELEM *data -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 align 32 GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2) EXTN(jsimd_fdct_ifast_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 1 + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp + sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 1 ; ---- Pass 1: process rows. 
@@ -378,9 +376,9 @@ EXTN(jsimd_fdct_ifast_sse2): movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 - uncollect_args 1 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 1 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctint-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctint-avx2.asm index e56258b48aa..0c4528612cd 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctint-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctint-avx2.asm @@ -2,17 +2,13 @@ ; jfdctint.asm - accurate integer FDCT (64-bit AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a slower but more accurate integer implementation of the ; forward DCT (Discrete Cosine Transform). 
The following code is based @@ -65,7 +61,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; %1-%4: Input/output registers ; %5-%8: Temp registers -%macro dotranspose 8 +%macro DOTRANSPOSE 8 ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) @@ -108,7 +104,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; %5-%8: Temp registers ; %9: Pass (1 or 2) -%macro dodct 9 +%macro DODCT 9 vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7 vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0 vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2 @@ -223,7 +219,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_islow_avx2) EXTN(jconst_fdct_islow_avx2): @@ -242,7 +238,7 @@ PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1) PW_1_NEG1 times 8 dw 1 times 8 dw -1 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -260,10 +256,10 @@ PW_1_NEG1 times 8 dw 1 GLOBAL_FUNCTION(jsimd_fdct_islow_avx2) EXTN(jsimd_fdct_islow_avx2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 1 + COLLECT_ARGS 1 ; ---- Pass 1: process rows. @@ -285,9 +281,9 @@ EXTN(jsimd_fdct_islow_avx2): ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) - dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 - dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1 + DODCT ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1 ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5 ; ---- Pass 2: process columns. 
@@ -295,9 +291,9 @@ EXTN(jsimd_fdct_islow_avx2): vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7 vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5 - dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 - dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2 + DODCT ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2 ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5 vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1 @@ -311,7 +307,7 @@ EXTN(jsimd_fdct_islow_avx2): vmovdqu YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7 vzeroupper - uncollect_args 1 + UNCOLLECT_ARGS 1 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctint-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctint-sse2.asm index ec1f383ccb7..3a6be020cd0 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctint-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jfdctint-sse2.asm @@ -2,17 +2,14 @@ ; jfdctint.asm - accurate integer FDCT (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, 2020, D. R. Commander. +; Copyright (C) 2009, 2016, 2020, 2024, D. R. Commander. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a slower but more accurate integer implementation of the ; forward DCT (Discrete Cosine Transform). 
The following code is based @@ -63,7 +60,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_fdct_islow_sse2) EXTN(jconst_fdct_islow_sse2): @@ -80,7 +77,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -94,21 +91,22 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) ; r10 = DCTELEM *data -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 6 align 32 GLOBAL_FUNCTION(jsimd_fdct_islow_sse2) EXTN(jsimd_fdct_islow_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 1 + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp + sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 1 ; ---- Pass 1: process rows. 
@@ -608,9 +606,9 @@ EXTN(jsimd_fdct_islow_sse2): movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 - uncollect_args 1 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 1 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jidctflt-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jidctflt-sse2.asm index 60bf9618961..14437340229 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jidctflt-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jidctflt-sse2.asm @@ -2,18 +2,15 @@ ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a floating-point implementation of the inverse DCT ; (Discrete Cosine Transform). 
The following code is based directly on @@ -24,18 +21,18 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) +%macro UNPCKLPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) shufps %1, %2, 0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) +%macro UNPCKHPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) shufps %1, %2, 0xEE %endmacro ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_float_sse2) EXTN(jconst_idct_float_sse2): @@ -47,7 +44,7 @@ PD_M2_613 times 4 dd -2.613125929752753055713286 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -65,8 +62,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13d = JDIMENSION output_col -%define original_rbp rbp + 0 -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 %define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT @@ -76,14 +72,15 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE GLOBAL_FUNCTION(jsimd_idct_float_sse2) EXTN(jsimd_idct_float_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp lea rsp, [workspace] - collect_args 4 + COLLECT_ARGS 4 push rbx ; ---- Pass 1: process columns from input, store into work array. 
@@ -280,11 +277,11 @@ EXTN(jsimd_idct_float_sse2): unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) movaps xmm3, xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) - unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) + UNPCKLPS2 xmm6, xmm7 ; xmm6=(00 10 20 30) + UNPCKHPS2 xmm3, xmm7 ; xmm3=(01 11 21 31) movaps xmm0, xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) - unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) + UNPCKLPS2 xmm1, xmm2 ; xmm1=(02 12 22 32) + UNPCKHPS2 xmm0, xmm2 ; xmm0=(03 13 23 33) movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) @@ -295,11 +292,11 @@ EXTN(jsimd_idct_float_sse2): movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 movaps xmm6, xmm5 ; transpose coefficients(phase 2) - unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) - unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) + UNPCKLPS2 xmm5, xmm7 ; xmm5=(40 50 60 70) + UNPCKHPS2 xmm6, xmm7 ; xmm6=(41 51 61 71) movaps xmm3, xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) - unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) + UNPCKLPS2 xmm4, xmm2 ; xmm4=(42 52 62 72) + UNPCKHPS2 xmm3, xmm2 ; xmm3=(43 53 63 73) movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 @@ -322,7 +319,6 @@ EXTN(jsimd_idct_float_sse2): ; ---- Pass 2: process rows from work array, store into output array. 
- mov rax, [original_rbp] lea rsi, [workspace] ; FAST_FLOAT *wsptr mov rdi, r12 ; (JSAMPROW *) mov eax, r13d @@ -471,9 +467,9 @@ EXTN(jsimd_idct_float_sse2): jnz near .rowloop pop rbx - uncollect_args 4 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 4 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jidctfst-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jidctfst-sse2.asm index cb97fdfbb24..cffabb8378e 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jidctfst-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jidctfst-sse2.asm @@ -2,18 +2,15 @@ ; jidctfst.asm - fast integer IDCT (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a fast, not so accurate integer implementation of ; the inverse DCT (Discrete Cosine Transform). 
The following code is @@ -57,7 +54,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_ifast_sse2) EXTN(jconst_idct_ifast_sse2): @@ -68,7 +65,7 @@ PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT PW_F1082 times 8 dw F_1_082 << CONST_SHIFT PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -86,8 +83,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13d = JDIMENSION output_col -%define original_rbp rbp + 0 -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 @@ -95,14 +91,15 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE GLOBAL_FUNCTION(jsimd_idct_ifast_sse2) EXTN(jsimd_idct_ifast_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 4 + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp + sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 4 ; ---- Pass 1: process columns from input. @@ -320,7 +317,6 @@ EXTN(jsimd_idct_ifast_sse2): ; ---- Pass 2: process rows from work array, store into output array. 
- mov rax, [original_rbp] mov rdi, r12 ; (JSAMPROW *) mov eax, r13d @@ -479,9 +475,9 @@ EXTN(jsimd_idct_ifast_sse2): movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 - uncollect_args 4 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 4 + lea rsp, [rbp-8] + pop r15 pop rbp ret ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jidctint-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jidctint-avx2.asm index ca7e317f6e1..be3b46888e5 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jidctint-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jidctint-avx2.asm @@ -2,18 +2,14 @@ ; jidctint.asm - accurate integer IDCT (64-bit AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. +; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a slower but more accurate integer implementation of the ; inverse DCT (Discrete Cosine Transform). 
The following code is based @@ -66,7 +62,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; %1-%4: Input/output registers ; %5-%8: Temp registers -%macro dotranspose 8 +%macro DOTRANSPOSE 8 ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71) ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) @@ -119,7 +115,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; %5-%12: Temp registers ; %9: Pass (1 or 2) -%macro dodct 13 +%macro DODCT 13 ; -- Even part ; (Original) @@ -241,7 +237,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_islow_avx2) EXTN(jconst_idct_islow_avx2): @@ -260,7 +256,7 @@ PB_CENTERJSAMP times 32 db CENTERJSAMPLE PW_1_NEG1 times 8 dw 1 times 8 dw -1 - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -282,11 +278,11 @@ PW_1_NEG1 times 8 dw 1 GLOBAL_FUNCTION(jsimd_idct_islow_avx2) EXTN(jsimd_idct_islow_avx2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp mov rbp, rsp ; rbp = aligned rbp - push_xmm 4 - collect_args 4 + PUSH_XMM 4 + COLLECT_ARGS 4 ; ---- Pass 1: process columns. 
@@ -343,10 +339,10 @@ EXTN(jsimd_idct_islow_avx2): vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6 vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5 - dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1 + DODCT ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1 ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6 - dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 + DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7 .column_end: @@ -363,10 +359,10 @@ EXTN(jsimd_idct_islow_avx2): vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm3=in7_5 vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1 - dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2 + DODCT ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2 ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6 - dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 + DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7 vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45 @@ -408,8 +404,8 @@ EXTN(jsimd_idct_islow_avx2): movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 - uncollect_args 4 - pop_xmm 4 + UNCOLLECT_ARGS 4 + POP_XMM 4 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jidctint-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jidctint-sse2.asm index 7aa869bc0b5..b186871ff2a 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jidctint-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jidctint-sse2.asm @@ -2,18 +2,15 @@ ; jidctint.asm - accurate integer IDCT (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, 2020, D. R. Commander. +; Copyright (C) 2009, 2016, 2020, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. 
; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains a slower but more accurate integer implementation of the ; inverse DCT (Discrete Cosine Transform). The following code is based @@ -64,7 +61,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_islow_sse2) EXTN(jconst_idct_islow_sse2): @@ -81,7 +78,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -99,8 +96,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13d = JDIMENSION output_col -%define original_rbp rbp + 0 -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 12 @@ -108,14 +104,15 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE GLOBAL_FUNCTION(jsimd_idct_islow_sse2) EXTN(jsimd_idct_islow_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 4 + ; Allocate stack space for wk array. r15 is used to access it. 
+ mov r15, rsp + sub rsp, (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 4 ; ---- Pass 1: process columns from input. @@ -512,7 +509,6 @@ EXTN(jsimd_idct_islow_sse2): ; ---- Pass 2: process rows from work array, store into output array. - mov rax, [original_rbp] mov rdi, r12 ; (JSAMPROW *) mov eax, r13d @@ -836,9 +832,9 @@ EXTN(jsimd_idct_islow_sse2): movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 - uncollect_args 4 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 4 + lea rsp, [rbp-8] + pop r15 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jidctred-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jidctred-sse2.asm index 4ece9d891cb..6fb7095612e 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jidctred-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jidctred-sse2.asm @@ -2,18 +2,15 @@ ; jidctred.asm - reduced-size IDCT (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. ; ; This file contains inverse-DCT routines that produce reduced-size ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. 
@@ -70,7 +67,7 @@ F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785) ; -------------------------------------------------------------------------- SECTION SEG_CONST - alignz 32 + ALIGNZ 32 GLOBAL_DATA(jconst_idct_red_sse2) EXTN(jconst_idct_red_sse2): @@ -88,7 +85,7 @@ PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1) PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1) PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 32 + ALIGNZ 32 ; -------------------------------------------------------------------------- SECTION SEG_TEXT @@ -107,8 +104,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13d = JDIMENSION output_col -%define original_rbp rbp + 0 -%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD +%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 @@ -116,14 +112,15 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE GLOBAL_FUNCTION(jsimd_idct_4x4_sse2) EXTN(jsimd_idct_4x4_sse2): + ENDBR64 push rbp - mov rax, rsp ; rax = original rbp - sub rsp, byte 4 + mov rbp, rsp + push r15 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp], rax - mov rbp, rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args 4 + ; Allocate stack space for wk array. r15 is used to access it. + mov r15, rsp + sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) + COLLECT_ARGS 4 ; ---- Pass 1: process columns from input. @@ -309,7 +306,6 @@ EXTN(jsimd_idct_4x4_sse2): ; ---- Pass 2: process rows, store into output array. 
- mov rax, [original_rbp] mov rdi, r12 ; (JSAMPROW *) mov eax, r13d @@ -389,9 +385,9 @@ EXTN(jsimd_idct_4x4_sse2): movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 - uncollect_args 4 - mov rsp, rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp + UNCOLLECT_ARGS 4 + lea rsp, [rbp-8] + pop r15 pop rbp ret @@ -414,10 +410,10 @@ EXTN(jsimd_idct_4x4_sse2): GLOBAL_FUNCTION(jsimd_idct_2x2_sse2) EXTN(jsimd_idct_2x2_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 4 + COLLECT_ARGS 4 push rbx ; ---- Pass 1: process columns from input. @@ -565,7 +561,7 @@ EXTN(jsimd_idct_2x2_sse2): mov word [rsi+rax*SIZEOF_JSAMPLE], cx pop rbx - uncollect_args 4 + UNCOLLECT_ARGS 4 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jquantf-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jquantf-sse2.asm index ab2e3954f63..64763338f2d 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jquantf-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jquantf-sse2.asm @@ -2,18 +2,14 @@ ; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" %include "jdct.inc" @@ -37,10 +33,10 @@ GLOBAL_FUNCTION(jsimd_convsamp_float_sse2) EXTN(jsimd_convsamp_float_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 3 + COLLECT_ARGS 3 push rbx pcmpeqw xmm7, xmm7 @@ -89,7 +85,7 @@ EXTN(jsimd_convsamp_float_sse2): jnz short .convloop pop rbx - uncollect_args 3 + UNCOLLECT_ARGS 3 pop rbp ret @@ -110,10 +106,10 @@ EXTN(jsimd_convsamp_float_sse2): GLOBAL_FUNCTION(jsimd_quantize_float_sse2) EXTN(jsimd_quantize_float_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 3 + COLLECT_ARGS 3 mov rsi, r12 mov rdx, r11 @@ -146,7 +142,7 @@ EXTN(jsimd_quantize_float_sse2): dec rax jnz short .quantloop - uncollect_args 3 + UNCOLLECT_ARGS 3 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jquanti-avx2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jquanti-avx2.asm index 70fe81139cc..7e126e88a88 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jquanti-avx2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jquanti-avx2.asm @@ -2,7 +2,7 @@ ; jquanti.asm - sample data conversion and quantization (64-bit AVX2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, 2018, D. R. Commander. +; Copyright (C) 2009, 2016, 2018, 2024, D. R. Commander. ; Copyright (C) 2016, Matthieu Darbois. ; Copyright (C) 2018, Matthias Räncker. ; @@ -10,11 +10,7 @@ ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. 
%include "jsimdext.inc" %include "jdct.inc" @@ -38,10 +34,10 @@ GLOBAL_FUNCTION(jsimd_convsamp_avx2) EXTN(jsimd_convsamp_avx2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 3 + COLLECT_ARGS 3 mov eax, r11d @@ -84,7 +80,7 @@ EXTN(jsimd_convsamp_avx2): vmovdqu YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3 vzeroupper - uncollect_args 3 + UNCOLLECT_ARGS 3 pop rbp ret @@ -93,8 +89,8 @@ EXTN(jsimd_convsamp_avx2): ; Quantize/descale the coefficients, and store into coef_block ; ; This implementation is based on an algorithm described in -; "How to optimize for the Pentium family of microprocessors" -; (http://www.agner.org/assem/). +; "Optimizing subroutines in assembly language: +; An optimization guide for x86 platforms" (https://agner.org/optimize). ; ; GLOBAL(void) ; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors, @@ -116,10 +112,10 @@ EXTN(jsimd_convsamp_avx2): GLOBAL_FUNCTION(jsimd_quantize_avx2) EXTN(jsimd_quantize_avx2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 3 + COLLECT_ARGS 3 vmovdqu ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)] vmovdqu ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)] @@ -154,7 +150,7 @@ EXTN(jsimd_quantize_avx2): vmovdqu [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3 vzeroupper - uncollect_args 3 + UNCOLLECT_ARGS 3 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jquanti-sse2.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jquanti-sse2.asm index 3ee442027a5..284b9fea71e 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jquanti-sse2.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jquanti-sse2.asm @@ -2,18 +2,14 @@ ; jquanti.asm - sample data conversion and quantization (64-bit SSE2) ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB -; Copyright (C) 2009, 2016, D. R. Commander. +; Copyright (C) 2009, 2016, 2024, D. R. Commander. ; Copyright (C) 2018, Matthias Räncker. ; ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" %include "jdct.inc" @@ -37,10 +33,10 @@ GLOBAL_FUNCTION(jsimd_convsamp_sse2) EXTN(jsimd_convsamp_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 3 + COLLECT_ARGS 3 push rbx pxor xmm6, xmm6 ; xmm6=(all 0's) @@ -84,7 +80,7 @@ EXTN(jsimd_convsamp_sse2): jnz short .convloop pop rbx - uncollect_args 3 + UNCOLLECT_ARGS 3 pop rbp ret @@ -93,8 +89,8 @@ EXTN(jsimd_convsamp_sse2): ; Quantize/descale the coefficients, and store into coef_block ; ; This implementation is based on an algorithm described in -; "How to optimize for the Pentium family of microprocessors" -; (http://www.agner.org/assem/). +; "Optimizing subroutines in assembly language: +; An optimization guide for x86 platforms" (https://agner.org/optimize). 
; ; GLOBAL(void) ; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors, @@ -116,10 +112,10 @@ EXTN(jsimd_convsamp_sse2): GLOBAL_FUNCTION(jsimd_quantize_sse2) EXTN(jsimd_quantize_sse2): + ENDBR64 push rbp - mov rax, rsp mov rbp, rsp - collect_args 3 + COLLECT_ARGS 3 mov rsi, r12 mov rdx, r11 @@ -179,7 +175,7 @@ EXTN(jsimd_quantize_sse2): dec rax jnz near .quantloop - uncollect_args 3 + UNCOLLECT_ARGS 3 pop rbp ret diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jsimd.c b/contrib/libs/libjpeg-turbo/simd/x86_64/jsimd.c index d51962f3987..9f4e098fddc 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jsimd.c +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jsimd.c @@ -2,8 +2,8 @@ * jsimd_x86_64.c * * Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB - * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022, D. R. Commander. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022-2024, D. R. Commander. + * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois. * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -15,13 +15,12 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" +#include "../../src/jinclude.h" +#include "../../src/jpeglib.h" +#include "../../src/jsimd.h" +#include "../../src/jdct.h" +#include "../../src/jsimddct.h" #include "../jsimd.h" -#include "jconfigint.h" /* * In the PIC cases, we have no guarantee that constants will keep @@ -32,13 +31,11 @@ #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ #define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */ -static unsigned int simd_support = (unsigned int)(~0); -static unsigned int simd_huffman = 1; +static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0); +static THREAD_LOCAL unsigned int simd_huffman = 1; /* * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. */ LOCAL(void) init_simd(void) @@ -116,7 +113,9 @@ jsimd_can_ycc_rgb(void) { init_simd(); -#ifndef WITH_SANITIZER +#ifdef WITH_SANITIZER + return 0; +#endif /* The code is optimised for these values only */ if (BITS_IN_JSAMPLE != 8) return 0; @@ -131,7 +130,6 @@ jsimd_can_ycc_rgb(void) if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) return 1; -#endif return 0; } @@ -150,6 +148,9 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->in_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_extrgb_ycc_convert_avx2; @@ -199,6 +200,9 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + if (simd_support == ~0U) + init_simd(); + switch 
(cinfo->in_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_extrgb_gray_convert_avx2; @@ -248,6 +252,9 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_ycc_extrgb_convert_avx2; @@ -338,6 +345,9 @@ GLOBAL(void) jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, compptr->v_samp_factor, @@ -354,6 +364,9 @@ GLOBAL(void) jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, compptr->v_samp_factor, @@ -408,6 +421,9 @@ GLOBAL(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, input_data, output_data_ptr); @@ -420,6 +436,9 @@ GLOBAL(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, input_data, output_data_ptr); @@ -474,6 +493,9 @@ GLOBAL(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if 
(simd_support & JSIMD_AVX2) jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor, compptr->downsampled_width, input_data, @@ -488,6 +510,9 @@ GLOBAL(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor, compptr->downsampled_width, input_data, @@ -547,6 +572,9 @@ jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2; @@ -595,6 +623,9 @@ jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2; @@ -684,6 +715,9 @@ GLOBAL(void) jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_convsamp_avx2(sample_data, start_col, workspace); else @@ -753,6 +787,9 @@ jsimd_can_fdct_float(void) GLOBAL(void) jsimd_fdct_islow(DCTELEM *data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_fdct_islow_avx2(data); else @@ -814,6 +851,9 @@ jsimd_can_quantize_float(void) GLOBAL(void) jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_quantize_avx2(coef_block, divisors, workspace); else @@ -944,6 +984,9 @@ jsimd_can_idct_float(void) { init_simd(); 
+#ifdef WITH_SANITIZER + return 0; +#endif if (DCTSIZE != 8) return 0; if (sizeof(JCOEF) != 2) @@ -968,6 +1011,9 @@ jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf, output_col); @@ -999,7 +1045,9 @@ jsimd_can_huff_encode_one_block(void) { init_simd(); -#ifndef WITH_SANITIZER +#ifdef WITH_SANITIZER + return 0; +#endif if (DCTSIZE != 8) return 0; if (sizeof(JCOEF) != 2) @@ -1008,7 +1056,6 @@ jsimd_can_huff_encode_one_block(void) if ((simd_support & JSIMD_SSE2) && simd_huffman && IS_ALIGNED_SSE(jconst_huff_encode_one_block)) return 1; -#endif return 0; } @@ -1027,14 +1074,15 @@ jsimd_can_encode_mcu_AC_first_prepare(void) { init_simd(); -#ifndef WITH_SANITIZER +#ifdef WITH_SANITIZER + return 0; +#endif if (DCTSIZE != 8) return 0; if (sizeof(JCOEF) != 2) return 0; if (simd_support & JSIMD_SSE2) return 1; -#endif return 0; } @@ -1042,7 +1090,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void) GLOBAL(void) jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) + int Al, UJCOEF *values, size_t *zerobits) { jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start, Sl, Al, values, zerobits); @@ -1053,14 +1101,12 @@ jsimd_can_encode_mcu_AC_refine_prepare(void) { init_simd(); -#ifndef WITH_SANITIZER if (DCTSIZE != 8) return 0; if (sizeof(JCOEF) != 2) return 0; if (simd_support & JSIMD_SSE2) return 1; -#endif return 0; } @@ -1068,7 +1114,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void) GLOBAL(int) jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) + int Al, UJCOEF *absvalues, size_t *bits) { return jsimd_encode_mcu_AC_refine_prepare_sse2(block, 
jpeg_natural_order_start, diff --git a/contrib/libs/libjpeg-turbo/simd/x86_64/jsimdcpu.asm b/contrib/libs/libjpeg-turbo/simd/x86_64/jsimdcpu.asm index 705f813d7da..b72f3b0b398 100644 --- a/contrib/libs/libjpeg-turbo/simd/x86_64/jsimdcpu.asm +++ b/contrib/libs/libjpeg-turbo/simd/x86_64/jsimdcpu.asm @@ -3,17 +3,14 @@ ; ; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB ; Copyright (C) 2016, D. R. Commander. +; Copyright (C) 2023, Aliaksiej Kandracienka. ; ; Based on ; x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc ; -; This file should be assembled with NASM (Netwide Assembler), -; can *not* be assembled with Microsoft's MASM or any compatible -; assembler (including Borland's Turbo Assembler). -; NASM is available from http://nasm.sourceforge.net/ or -; http://sourceforge.net/project/showfiles.php?group_id=6208 +; This file should be assembled with NASM (Netwide Assembler) or Yasm. %include "jsimdext.inc" @@ -31,6 +28,8 @@ GLOBAL_FUNCTION(jpeg_simd_cpu_support) EXTN(jpeg_simd_cpu_support): + push rbp + mov rbp, rsp push rbx push rdi @@ -79,6 +78,7 @@ EXTN(jpeg_simd_cpu_support): pop rdi pop rbx + pop rbp ret ; For some reason, the OS X linker does not honor the request to align the |