author | robot-piglet <robot-piglet@yandex-team.com> | 2024-02-06 15:03:31 +0300
---|---|---
committer | Alexander Smirnov <alex@ydb.tech> | 2024-02-09 19:18:18 +0300 |
commit | 303fba2f20dfd94603064b607671b787de12624e (patch) |
tree | 54c22fad0bcd67bf52f78822a3ee7714fd9dbf40 /contrib/python/numpy |
parent | c7854274198c4168e713732ceb13e7075fce89b0 (diff) |
download | ydb-303fba2f20dfd94603064b607671b787de12624e.tar.gz |
Intermediate changes
Diffstat (limited to 'contrib/python/numpy')
38 files changed, 1375 insertions, 1062 deletions
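One behavioural fix carried by the 1.26.3 → 1.26.4 hunks below is in `numpy/array_api/linalg.py`: `cholesky(x, upper=True)` now returns the conjugate transpose of the lower factor for complex64/complex128 instead of the plain transpose. A minimal plain-NumPy sketch of the distinction (illustrative only; the names here are not taken from the patch):

```python
import numpy as np

# For a complex Hermitian positive-definite A = L @ L.conj().T, the upper
# factor must be the *conjugate* transpose of L; a plain transpose satisfies
# the factorization only for real inputs.
rng = np.random.default_rng(0)
M = rng.normal(size=(3, 3)) + 1j * rng.normal(size=(3, 3))
A = M @ M.conj().T + 3 * np.eye(3)      # Hermitian positive definite

L = np.linalg.cholesky(A)               # lower factor: A == L @ L.conj().T
U_old = L.T                             # previous behaviour: plain transpose
U_new = L.conj().T                      # patched behaviour: conjugate transpose

print(np.allclose(U_old.conj().T @ U_old, A))   # False for genuinely complex A
print(np.allclose(U_new.conj().T @ U_new, A))   # True
```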
diff --git a/contrib/python/numpy/include/numpy/core/feature_detection_misc.h b/contrib/python/numpy/include/numpy/core/feature_detection_misc.h new file mode 100644 index 0000000000..0e6447fbd1 --- /dev/null +++ b/contrib/python/numpy/include/numpy/core/feature_detection_misc.h @@ -0,0 +1,5 @@ +#ifdef USE_PYTHON3 +#include <contrib/python/numpy/py3/numpy/core/feature_detection_misc.h> +#else +#error #include <contrib/python/numpy/py2/numpy/core/feature_detection_misc.h> +#endif diff --git a/contrib/python/numpy/py3/.dist-info/METADATA b/contrib/python/numpy/py3/.dist-info/METADATA index 5e515025ec..8246dc4ed3 100644 --- a/contrib/python/numpy/py3/.dist-info/METADATA +++ b/contrib/python/numpy/py3/.dist-info/METADATA @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: numpy -Version: 1.26.3 +Version: 1.26.4 Summary: Fundamental package for array computing in Python Home-page: https://numpy.org Author: Travis E. Oliphant et al. @@ -70,11 +70,6 @@ License: Copyright (c) 2005-2023, NumPy Developers. License: Apache 2.0 For license text, see vendored-meson/meson/COPYING - Name: meson-python - Files: vendored-meson/meson-python/* - License: MIT - For license text, see vendored-meson/meson-python/LICENSE - Name: spin Files: .spin/cmds.py License: BSD-3 diff --git a/contrib/python/numpy/py3/LICENSES_bundled.txt b/contrib/python/numpy/py3/LICENSES_bundled.txt index 26faf7ff30..aae0e774fa 100644 --- a/contrib/python/numpy/py3/LICENSES_bundled.txt +++ b/contrib/python/numpy/py3/LICENSES_bundled.txt @@ -30,11 +30,6 @@ Files: vendored-meson/meson/* License: Apache 2.0 For license text, see vendored-meson/meson/COPYING -Name: meson-python -Files: vendored-meson/meson-python/* -License: MIT - For license text, see vendored-meson/meson-python/LICENSE - Name: spin Files: .spin/cmds.py License: BSD-3 diff --git a/contrib/python/numpy/py3/numpy/__config__.py.in b/contrib/python/numpy/py3/numpy/__config__.py.in index 6c6c21cb85..f3b32c28c1 100644 --- a/contrib/python/numpy/py3/numpy/__config__.py.in +++ b/contrib/python/numpy/py3/numpy/__config__.py.in @@ -32,21 +32,27 @@ CONFIG = _cleanup( "Compilers": { "c": { "name": "@C_COMP@", - "linker": "@C_COMP_LINKER_ID@", + "linker": r"@C_COMP_LINKER_ID@", "version": "@C_COMP_VERSION@", - "commands": "@C_COMP_CMD_ARRAY@", + "commands": r"@C_COMP_CMD_ARRAY@", + "args": r"@C_COMP_ARGS@", + "linker args": r"@C_COMP_LINK_ARGS@", }, "cython": { "name": "@CYTHON_COMP@", - "linker": "@CYTHON_COMP_LINKER_ID@", + "linker": r"@CYTHON_COMP_LINKER_ID@", "version": "@CYTHON_COMP_VERSION@", - "commands": "@CYTHON_COMP_CMD_ARRAY@", + "commands": r"@CYTHON_COMP_CMD_ARRAY@", + "args": r"@CYTHON_COMP_ARGS@", + "linker args": r"@CYTHON_COMP_LINK_ARGS@", }, "c++": { "name": "@CPP_COMP@", - "linker": "@CPP_COMP_LINKER_ID@", + "linker": r"@CPP_COMP_LINKER_ID@", "version": "@CPP_COMP_VERSION@", - "commands": "@CPP_COMP_CMD_ARRAY@", + "commands": r"@CPP_COMP_CMD_ARRAY@", + "args": r"@CPP_COMP_ARGS@", + "linker args": r"@CPP_COMP_LINK_ARGS@", }, }, "Machine Information": { @@ -72,7 +78,7 @@ CONFIG = _cleanup( "detection method": "@BLAS_TYPE_NAME@", "include directory": r"@BLAS_INCLUDEDIR@", "lib directory": r"@BLAS_LIBDIR@", - "openblas configuration": "@BLAS_OPENBLAS_CONFIG@", + "openblas configuration": r"@BLAS_OPENBLAS_CONFIG@", "pc file directory": r"@BLAS_PCFILEDIR@", }, "lapack": { @@ -82,7 +88,7 @@ CONFIG = _cleanup( "detection method": "@LAPACK_TYPE_NAME@", "include directory": r"@LAPACK_INCLUDEDIR@", "lib directory": r"@LAPACK_LIBDIR@", - "openblas configuration": 
"@LAPACK_OPENBLAS_CONFIG@", + "openblas configuration": r"@LAPACK_OPENBLAS_CONFIG@", "pc file directory": r"@LAPACK_PCFILEDIR@", }, }, diff --git a/contrib/python/numpy/py3/numpy/array_api/__init__.py b/contrib/python/numpy/py3/numpy/array_api/__init__.py index 77f227882e..edc3205fd5 100644 --- a/contrib/python/numpy/py3/numpy/array_api/__init__.py +++ b/contrib/python/numpy/py3/numpy/array_api/__init__.py @@ -127,7 +127,7 @@ __all__ = ["__array_api_version__"] from ._constants import e, inf, nan, pi, newaxis -__all__ += ["e", "inf", "nan", "pi"] +__all__ += ["e", "inf", "nan", "pi", "newaxis"] from ._creation_functions import ( asarray, diff --git a/contrib/python/numpy/py3/numpy/array_api/linalg.py b/contrib/python/numpy/py3/numpy/array_api/linalg.py index 09af9dfc3a..c18360f6e6 100644 --- a/contrib/python/numpy/py3/numpy/array_api/linalg.py +++ b/contrib/python/numpy/py3/numpy/array_api/linalg.py @@ -9,6 +9,7 @@ from ._dtypes import ( complex128 ) from ._manipulation_functions import reshape +from ._elementwise_functions import conj from ._array_object import Array from ..core.numeric import normalize_axis_tuple @@ -53,7 +54,10 @@ def cholesky(x: Array, /, *, upper: bool = False) -> Array: raise TypeError('Only floating-point dtypes are allowed in cholesky') L = np.linalg.cholesky(x._array) if upper: - return Array._new(L).mT + U = Array._new(L).mT + if U.dtype in [complex64, complex128]: + U = conj(U) + return U return Array._new(L) # Note: cross is the numpy top-level namespace, not np.linalg diff --git a/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py b/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py index 2cdaba52d9..d9d7862b28 100644 --- a/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py +++ b/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py @@ -304,15 +304,6 @@ def find_functions(filename, tag='API'): fo.close() return functions -def should_rebuild(targets, source_files): - from distutils.dep_util import newer_group - for t in targets: - if not os.path.exists(t): - return True - sources = API_FILES + list(source_files) + [__file__] - if newer_group(sources, targets[0], missing='newer'): - return True - return False def write_file(filename, data): """ diff --git a/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py b/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py index ae38c4efc2..640bae9e5f 100644 --- a/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py +++ b/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py @@ -148,12 +148,7 @@ def generate_api(output_dir, force=False): targets = (h_file, c_file) sources = numpy_api.multiarray_api - - if (not force and not genapi.should_rebuild(targets, [numpy_api.__file__, __file__])): - return targets - else: - do_generate_api(targets, sources) - + do_generate_api(targets, sources) return targets def do_generate_api(targets, sources): diff --git a/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py b/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py index e03299a52c..3734cbd6a0 100644 --- a/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py +++ b/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py @@ -125,12 +125,7 @@ def generate_api(output_dir, force=False): targets = (h_file, c_file) sources = ['ufunc_api_order.txt'] - - if (not force and not genapi.should_rebuild(targets, sources + [__file__])): - return 
targets - else: - do_generate_api(targets, sources) - + do_generate_api(targets, sources) return targets def do_generate_api(targets, sources): diff --git a/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h b/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h index bc14d16d04..d8bbfbd8b2 100644 --- a/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h +++ b/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h @@ -1,6 +1,9 @@ +#define _GNU_SOURCE #include <stdio.h> #include <fcntl.h> +#if 0 /* Only for setup_common.py, not the C compiler */ off_t ftello(FILE *stream); int fseeko(FILE *stream, off_t offset, int whence); int fallocate(int, int, off_t, off_t); +#endif diff --git a/contrib/python/numpy/py3/numpy/core/generate_numpy_api.py b/contrib/python/numpy/py3/numpy/core/generate_numpy_api.py new file mode 100644 index 0000000000..640bae9e5f --- /dev/null +++ b/contrib/python/numpy/py3/numpy/core/generate_numpy_api.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +import os +import argparse + +import genapi +from genapi import \ + TypeApi, GlobalVarApi, FunctionApi, BoolValuesApi + +import numpy_api + +# use annotated api when running under cpychecker +h_template = r""" +#if defined(_MULTIARRAYMODULE) || defined(WITH_CPYCHECKER_STEALS_REFERENCE_TO_ARG_ATTRIBUTE) + +typedef struct { + PyObject_HEAD + npy_bool obval; +} PyBoolScalarObject; + +extern NPY_NO_EXPORT PyTypeObject PyArrayMapIter_Type; +extern NPY_NO_EXPORT PyTypeObject PyArrayNeighborhoodIter_Type; +extern NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2]; + +%s + +#else + +#if defined(PY_ARRAY_UNIQUE_SYMBOL) +#define PyArray_API PY_ARRAY_UNIQUE_SYMBOL +#endif + +#if defined(NO_IMPORT) || defined(NO_IMPORT_ARRAY) +extern void **PyArray_API; +#else +#if defined(PY_ARRAY_UNIQUE_SYMBOL) +void **PyArray_API; +#else +static void **PyArray_API=NULL; +#endif +#endif + +%s + +#if !defined(NO_IMPORT_ARRAY) && !defined(NO_IMPORT) +static int +_import_array(void) +{ + int st; + PyObject *numpy = PyImport_ImportModule("numpy.core._multiarray_umath"); + PyObject *c_api = NULL; + + if (numpy == NULL) { + return -1; + } + c_api = PyObject_GetAttrString(numpy, "_ARRAY_API"); + Py_DECREF(numpy); + if (c_api == NULL) { + return -1; + } + + if (!PyCapsule_CheckExact(c_api)) { + PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is not PyCapsule object"); + Py_DECREF(c_api); + return -1; + } + PyArray_API = (void **)PyCapsule_GetPointer(c_api, NULL); + Py_DECREF(c_api); + if (PyArray_API == NULL) { + PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is NULL pointer"); + return -1; + } + + /* Perform runtime check of C API version */ + if (NPY_VERSION != PyArray_GetNDArrayCVersion()) { + PyErr_Format(PyExc_RuntimeError, "module compiled against "\ + "ABI version 0x%%x but this version of numpy is 0x%%x", \ + (int) NPY_VERSION, (int) PyArray_GetNDArrayCVersion()); + return -1; + } + if (NPY_FEATURE_VERSION > PyArray_GetNDArrayCFeatureVersion()) { + PyErr_Format(PyExc_RuntimeError, "module compiled against "\ + "API version 0x%%x but this version of numpy is 0x%%x . 
"\ + "Check the section C-API incompatibility at the "\ + "Troubleshooting ImportError section at "\ + "https://numpy.org/devdocs/user/troubleshooting-importerror.html"\ + "#c-api-incompatibility "\ + "for indications on how to solve this problem .", \ + (int) NPY_FEATURE_VERSION, (int) PyArray_GetNDArrayCFeatureVersion()); + return -1; + } + + /* + * Perform runtime check of endianness and check it matches the one set by + * the headers (npy_endian.h) as a safeguard + */ + st = PyArray_GetEndianness(); + if (st == NPY_CPU_UNKNOWN_ENDIAN) { + PyErr_SetString(PyExc_RuntimeError, + "FATAL: module compiled as unknown endian"); + return -1; + } +#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN + if (st != NPY_CPU_BIG) { + PyErr_SetString(PyExc_RuntimeError, + "FATAL: module compiled as big endian, but " + "detected different endianness at runtime"); + return -1; + } +#elif NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN + if (st != NPY_CPU_LITTLE) { + PyErr_SetString(PyExc_RuntimeError, + "FATAL: module compiled as little endian, but " + "detected different endianness at runtime"); + return -1; + } +#endif + + return 0; +} + +#define import_array() {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return NULL; } } + +#define import_array1(ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return ret; } } + +#define import_array2(msg, ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, msg); return ret; } } + +#endif + +#endif +""" + + +c_template = r""" +/* These pointers will be stored in the C-object for use in other + extension modules +*/ + +void *PyArray_API[] = { +%s +}; +""" + +def generate_api(output_dir, force=False): + basename = 'multiarray_api' + + h_file = os.path.join(output_dir, '__%s.h' % basename) + c_file = os.path.join(output_dir, '__%s.c' % basename) + targets = (h_file, c_file) + + sources = numpy_api.multiarray_api + do_generate_api(targets, sources) + return targets + +def do_generate_api(targets, sources): + header_file = targets[0] + c_file = targets[1] + + global_vars = sources[0] + scalar_bool_values = sources[1] + types_api = sources[2] + multiarray_funcs = sources[3] + + multiarray_api = sources[:] + + module_list = [] + extension_list = [] + init_list = [] + + # Check multiarray api indexes + multiarray_api_index = genapi.merge_api_dicts(multiarray_api) + genapi.check_api_dict(multiarray_api_index) + + numpyapi_list = genapi.get_api_functions('NUMPY_API', + multiarray_funcs) + + # Create dict name -> *Api instance + api_name = 'PyArray_API' + multiarray_api_dict = {} + for f in numpyapi_list: + name = f.name + index = multiarray_funcs[name][0] + annotations = multiarray_funcs[name][1:] + multiarray_api_dict[f.name] = FunctionApi(f.name, index, annotations, + f.return_type, + f.args, api_name) + + for name, val in global_vars.items(): + index, type = val + multiarray_api_dict[name] = GlobalVarApi(name, index, type, api_name) + + for name, val in scalar_bool_values.items(): + index = val[0] + multiarray_api_dict[name] = BoolValuesApi(name, index, api_name) + + for name, val in types_api.items(): + index = val[0] + internal_type = None if len(val) == 1 else val[1] + multiarray_api_dict[name] = TypeApi( + name, index, 'PyTypeObject', api_name, internal_type) + + if len(multiarray_api_dict) != len(multiarray_api_index): + keys_dict = set(multiarray_api_dict.keys()) + keys_index = set(multiarray_api_index.keys()) + raise 
AssertionError( + "Multiarray API size mismatch - " + "index has extra keys {}, dict has extra keys {}" + .format(keys_index - keys_dict, keys_dict - keys_index) + ) + + extension_list = [] + for name, index in genapi.order_dict(multiarray_api_index): + api_item = multiarray_api_dict[name] + extension_list.append(api_item.define_from_array_api_string()) + init_list.append(api_item.array_api_define()) + module_list.append(api_item.internal_define()) + + # Write to header + s = h_template % ('\n'.join(module_list), '\n'.join(extension_list)) + genapi.write_file(header_file, s) + + # Write to c-code + s = c_template % ',\n'.join(init_list) + genapi.write_file(c_file, s) + + return targets + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-o", + "--outdir", + type=str, + help="Path to the output directory" + ) + parser.add_argument( + "-i", + "--ignore", + type=str, + help="An ignored input - may be useful to add a " + "dependency between custom targets" + ) + args = parser.parse_args() + + outdir_abs = os.path.join(os.getcwd(), args.outdir) + + generate_api(outdir_abs) + + +if __name__ == "__main__": + main() diff --git a/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c b/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c index 64a85f6fb2..bd149f8b43 100644 --- a/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c +++ b/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c @@ -656,7 +656,7 @@ npy__cpu_init_features(void) /***************** ARM ******************/ -#elif defined(__arm__) || defined(__aarch64__) +#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64) static inline void npy__cpu_init_features_arm8(void) @@ -781,7 +781,7 @@ npy__cpu_init_features(void) return; #endif // We have nothing else todo -#if defined(NPY_HAVE_ASIMD) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8) +#if defined(NPY_HAVE_ASIMD) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8) || defined(_M_ARM64) #if defined(NPY_HAVE_FPHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) npy__cpu_have[NPY_CPU_FEATURE_FPHP] = 1; #endif diff --git a/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c b/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c index 60c1a1b9b0..8ec0aeefb7 100644 --- a/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c +++ b/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c @@ -23,8 +23,9 @@ #include "array_coercion.h" #include "refcount.h" -int -fallocate(int fd, int mode, off_t offset, off_t len); +#if defined(HAVE_FALLOCATE) && defined(__linux__) +#include <fcntl.h> +#endif /* * allocate nbytes of diskspace for file fp diff --git a/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c b/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c index 15257804bc..a38f90e76c 100644 --- a/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c +++ b/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c @@ -59,6 +59,9 @@ */ #if defined HAVE_BACKTRACE && defined HAVE_DLFCN_H && ! 
defined PYPY_VERSION + +#include <feature_detection_misc.h> + /* 1 prints elided operations, 2 prints stacktraces */ #define NPY_ELIDE_DEBUG 0 #define NPY_MAX_STACKSIZE 10 diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c index 25fae7f711..0d80a96966 100644 --- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c +++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c @@ -46,8 +46,16 @@ * q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign); ********************************************************************************/ +#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) + // Due to integer 128-bit multiplication emulation, SIMD 64-bit division + // may not perform well on both neon and up to VSX3 compared to scalar + // division. + #define SIMD_DISABLE_DIV64_OPT +#endif + #if NPY_SIMD -#line 45 +#line 52 +#if 8 < 64 || (8 == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) static inline void simd_divide_by_scalar_contig_s8(char **args, npy_intp len) { @@ -107,8 +115,10 @@ simd_divide_by_scalar_contig_s8(char **args, npy_intp len) } npyv_cleanup(); } +#endif -#line 45 +#line 52 +#if 16 < 64 || (16 == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) static inline void simd_divide_by_scalar_contig_s16(char **args, npy_intp len) { @@ -168,8 +178,10 @@ simd_divide_by_scalar_contig_s16(char **args, npy_intp len) } npyv_cleanup(); } +#endif -#line 45 +#line 52 +#if 32 < 64 || (32 == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) static inline void simd_divide_by_scalar_contig_s32(char **args, npy_intp len) { @@ -229,8 +241,10 @@ simd_divide_by_scalar_contig_s32(char **args, npy_intp len) } npyv_cleanup(); } +#endif -#line 45 +#line 52 +#if 64 < 64 || (64 == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) static inline void simd_divide_by_scalar_contig_s64(char **args, npy_intp len) { @@ -290,9 +304,11 @@ simd_divide_by_scalar_contig_s64(char **args, npy_intp len) } npyv_cleanup(); } +#endif -#line 111 +#line 120 +#if 8 < 64 || (8 == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) static inline void simd_divide_by_scalar_contig_u8(char **args, npy_intp len) { @@ -314,8 +330,10 @@ simd_divide_by_scalar_contig_u8(char **args, npy_intp len) } npyv_cleanup(); } +#endif -#line 111 +#line 120 +#if 16 < 64 || (16 == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) static inline void simd_divide_by_scalar_contig_u16(char **args, npy_intp len) { @@ -337,8 +355,10 @@ simd_divide_by_scalar_contig_u16(char **args, npy_intp len) } npyv_cleanup(); } +#endif -#line 111 +#line 120 +#if 32 < 64 || (32 == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) static inline void simd_divide_by_scalar_contig_u32(char **args, npy_intp len) { @@ -360,8 +380,10 @@ simd_divide_by_scalar_contig_u32(char **args, npy_intp len) } npyv_cleanup(); } +#endif -#line 111 +#line 120 +#if 64 < 64 || (64 == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) static inline void simd_divide_by_scalar_contig_u64(char **args, npy_intp len) { @@ -383,11 +405,12 @@ simd_divide_by_scalar_contig_u64(char **args, npy_intp len) } npyv_cleanup(); } +#endif #if defined(NPY_HAVE_VSX4) -#line 140 +#line 151 /* * Computes division of 2 8-bit signed/unsigned integer vectors * @@ -452,7 +475,7 @@ vsx4_div_u16(npyv_u16 a, npyv_u16 b) #define vsx4_div_u32 vec_div #define vsx4_div_u64 vec_div -#line 140 +#line 151 /* * Computes division of 2 8-bit signed/unsigned integer vectors * @@ -518,7 +541,7 @@ vsx4_div_s16(npyv_s16 a, npyv_s16 b) 
#define vsx4_div_s64 vec_div -#line 210 +#line 221 static inline void vsx4_simd_divide_contig_u8(char **args, npy_intp len) { @@ -552,7 +575,7 @@ vsx4_simd_divide_contig_u8(char **args, npy_intp len) npyv_cleanup(); } -#line 210 +#line 221 static inline void vsx4_simd_divide_contig_u16(char **args, npy_intp len) { @@ -586,7 +609,7 @@ vsx4_simd_divide_contig_u16(char **args, npy_intp len) npyv_cleanup(); } -#line 210 +#line 221 static inline void vsx4_simd_divide_contig_u32(char **args, npy_intp len) { @@ -620,7 +643,7 @@ vsx4_simd_divide_contig_u32(char **args, npy_intp len) npyv_cleanup(); } -#line 210 +#line 221 static inline void vsx4_simd_divide_contig_u64(char **args, npy_intp len) { @@ -655,7 +678,7 @@ vsx4_simd_divide_contig_u64(char **args, npy_intp len) } -#line 249 +#line 260 static inline void vsx4_simd_divide_contig_s8(char **args, npy_intp len) { @@ -724,7 +747,7 @@ vsx4_simd_divide_contig_s8(char **args, npy_intp len) npyv_cleanup(); } -#line 249 +#line 260 static inline void vsx4_simd_divide_contig_s16(char **args, npy_intp len) { @@ -793,7 +816,7 @@ vsx4_simd_divide_contig_s16(char **args, npy_intp len) npyv_cleanup(); } -#line 249 +#line 260 static inline void vsx4_simd_divide_contig_s32(char **args, npy_intp len) { @@ -862,7 +885,7 @@ vsx4_simd_divide_contig_s32(char **args, npy_intp len) npyv_cleanup(); } -#line 249 +#line 260 static inline void vsx4_simd_divide_contig_s64(char **args, npy_intp len) { @@ -938,28 +961,27 @@ vsx4_simd_divide_contig_s64(char **args, npy_intp len) ** Defining ufunc inner functions ********************************************************************************/ -#line 329 +#line 340 #undef TO_SIMD_SFX #if 0 -#line 334 +#line 345 #elif NPY_BITSOF_BYTE == 8 #define TO_SIMD_SFX(X) X##_s8 -#line 334 +#line 345 #elif NPY_BITSOF_BYTE == 16 #define TO_SIMD_SFX(X) X##_s16 -#line 334 +#line 345 #elif NPY_BITSOF_BYTE == 32 #define TO_SIMD_SFX(X) X##_s32 -#line 334 +#line 345 #elif NPY_BITSOF_BYTE == 64 #define TO_SIMD_SFX(X) X##_s64 #endif - -#if NPY_BITSOF_BYTE == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) +#if NPY_BITSOF_BYTE == 64 && defined(SIMD_DISABLE_DIV64_OPT) #undef TO_SIMD_SFX #endif @@ -1042,28 +1064,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_divide_indexed) } -#line 329 +#line 340 #undef TO_SIMD_SFX #if 0 -#line 334 +#line 345 #elif NPY_BITSOF_SHORT == 8 #define TO_SIMD_SFX(X) X##_s8 -#line 334 +#line 345 #elif NPY_BITSOF_SHORT == 16 #define TO_SIMD_SFX(X) X##_s16 -#line 334 +#line 345 #elif NPY_BITSOF_SHORT == 32 #define TO_SIMD_SFX(X) X##_s32 -#line 334 +#line 345 #elif NPY_BITSOF_SHORT == 64 #define TO_SIMD_SFX(X) X##_s64 #endif - -#if NPY_BITSOF_SHORT == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) +#if NPY_BITSOF_SHORT == 64 && defined(SIMD_DISABLE_DIV64_OPT) #undef TO_SIMD_SFX #endif @@ -1146,28 +1167,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_divide_indexed) } -#line 329 +#line 340 #undef TO_SIMD_SFX #if 0 -#line 334 +#line 345 #elif NPY_BITSOF_INT == 8 #define TO_SIMD_SFX(X) X##_s8 -#line 334 +#line 345 #elif NPY_BITSOF_INT == 16 #define TO_SIMD_SFX(X) X##_s16 -#line 334 +#line 345 #elif NPY_BITSOF_INT == 32 #define TO_SIMD_SFX(X) X##_s32 -#line 334 +#line 345 #elif NPY_BITSOF_INT == 64 #define TO_SIMD_SFX(X) X##_s64 #endif - -#if NPY_BITSOF_INT == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) +#if NPY_BITSOF_INT == 64 && defined(SIMD_DISABLE_DIV64_OPT) #undef TO_SIMD_SFX #endif @@ -1250,28 +1270,27 
@@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_divide_indexed) } -#line 329 +#line 340 #undef TO_SIMD_SFX #if 0 -#line 334 +#line 345 #elif NPY_BITSOF_LONG == 8 #define TO_SIMD_SFX(X) X##_s8 -#line 334 +#line 345 #elif NPY_BITSOF_LONG == 16 #define TO_SIMD_SFX(X) X##_s16 -#line 334 +#line 345 #elif NPY_BITSOF_LONG == 32 #define TO_SIMD_SFX(X) X##_s32 -#line 334 +#line 345 #elif NPY_BITSOF_LONG == 64 #define TO_SIMD_SFX(X) X##_s64 #endif - -#if NPY_BITSOF_LONG == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) +#if NPY_BITSOF_LONG == 64 && defined(SIMD_DISABLE_DIV64_OPT) #undef TO_SIMD_SFX #endif @@ -1354,28 +1373,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_divide_indexed) } -#line 329 +#line 340 #undef TO_SIMD_SFX #if 0 -#line 334 +#line 345 #elif NPY_BITSOF_LONGLONG == 8 #define TO_SIMD_SFX(X) X##_s8 -#line 334 +#line 345 #elif NPY_BITSOF_LONGLONG == 16 #define TO_SIMD_SFX(X) X##_s16 -#line 334 +#line 345 #elif NPY_BITSOF_LONGLONG == 32 #define TO_SIMD_SFX(X) X##_s32 -#line 334 +#line 345 #elif NPY_BITSOF_LONGLONG == 64 #define TO_SIMD_SFX(X) X##_s64 #endif - -#if NPY_BITSOF_LONGLONG == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) +#if NPY_BITSOF_LONGLONG == 64 && defined(SIMD_DISABLE_DIV64_OPT) #undef TO_SIMD_SFX #endif @@ -1459,22 +1477,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_divide_indexed) -#line 429 +#line 439 #undef TO_SIMD_SFX #if 0 -#line 434 +#line 444 #elif NPY_BITSOF_BYTE == 8 #define TO_SIMD_SFX(X) X##_u8 -#line 434 +#line 444 #elif NPY_BITSOF_BYTE == 16 #define TO_SIMD_SFX(X) X##_u16 -#line 434 +#line 444 #elif NPY_BITSOF_BYTE == 32 #define TO_SIMD_SFX(X) X##_u32 -#line 434 +#line 444 #elif NPY_BITSOF_BYTE == 64 #define TO_SIMD_SFX(X) X##_u64 @@ -1560,22 +1578,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_divide_indexed) } -#line 429 +#line 439 #undef TO_SIMD_SFX #if 0 -#line 434 +#line 444 #elif NPY_BITSOF_SHORT == 8 #define TO_SIMD_SFX(X) X##_u8 -#line 434 +#line 444 #elif NPY_BITSOF_SHORT == 16 #define TO_SIMD_SFX(X) X##_u16 -#line 434 +#line 444 #elif NPY_BITSOF_SHORT == 32 #define TO_SIMD_SFX(X) X##_u32 -#line 434 +#line 444 #elif NPY_BITSOF_SHORT == 64 #define TO_SIMD_SFX(X) X##_u64 @@ -1661,22 +1679,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_divide_indexed) } -#line 429 +#line 439 #undef TO_SIMD_SFX #if 0 -#line 434 +#line 444 #elif NPY_BITSOF_INT == 8 #define TO_SIMD_SFX(X) X##_u8 -#line 434 +#line 444 #elif NPY_BITSOF_INT == 16 #define TO_SIMD_SFX(X) X##_u16 -#line 434 +#line 444 #elif NPY_BITSOF_INT == 32 #define TO_SIMD_SFX(X) X##_u32 -#line 434 +#line 444 #elif NPY_BITSOF_INT == 64 #define TO_SIMD_SFX(X) X##_u64 @@ -1762,22 +1780,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_divide_indexed) } -#line 429 +#line 439 #undef TO_SIMD_SFX #if 0 -#line 434 +#line 444 #elif NPY_BITSOF_LONG == 8 #define TO_SIMD_SFX(X) X##_u8 -#line 434 +#line 444 #elif NPY_BITSOF_LONG == 16 #define TO_SIMD_SFX(X) X##_u16 -#line 434 +#line 444 #elif NPY_BITSOF_LONG == 32 #define TO_SIMD_SFX(X) X##_u32 -#line 434 +#line 444 #elif NPY_BITSOF_LONG == 64 #define TO_SIMD_SFX(X) X##_u64 @@ -1863,22 +1881,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_divide_indexed) } -#line 429 +#line 439 #undef TO_SIMD_SFX #if 0 -#line 434 +#line 444 #elif NPY_BITSOF_LONGLONG == 8 #define TO_SIMD_SFX(X) X##_u8 -#line 434 +#line 444 #elif NPY_BITSOF_LONGLONG == 16 #define TO_SIMD_SFX(X) X##_u16 -#line 434 +#line 444 #elif NPY_BITSOF_LONGLONG == 32 #define TO_SIMD_SFX(X) X##_u32 -#line 
434 +#line 444 #elif NPY_BITSOF_LONGLONG == 64 #define TO_SIMD_SFX(X) X##_u64 diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index e07bb79808..d056046e05 100644 --- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -36,12 +36,20 @@ * q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign); ********************************************************************************/ +#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) + // Due to integer 128-bit multiplication emulation, SIMD 64-bit division + // may not perform well on both neon and up to VSX3 compared to scalar + // division. + #define SIMD_DISABLE_DIV64_OPT +#endif + #if NPY_SIMD /**begin repeat * Signed types * #sfx = s8, s16, s32, s64# * #len = 8, 16, 32, 64# */ +#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) static inline void simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { @@ -101,6 +109,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) } npyv_cleanup(); } +#endif /**end repeat**/ /**begin repeat @@ -108,6 +117,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) * #sfx = u8, u16, u32, u64# * #len = 8, 16, 32, 64# */ +#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) static inline void simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { @@ -129,6 +139,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) } npyv_cleanup(); } +#endif /**end repeat**/ #if defined(NPY_HAVE_VSX4) @@ -335,8 +346,7 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) #define TO_SIMD_SFX(X) X##_s@len@ /**end repeat1**/ #endif - -#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) +#if NPY_BITSOF_@TYPE@ == 64 && defined(SIMD_DISABLE_DIV64_OPT) #undef TO_SIMD_SFX #endif diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c index 5e9827a14c..8f446c3a8d 100644 --- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c +++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c @@ -134,18 +134,6 @@ fma_blend(__m256 x, __m256 y, __m256 ymask) } NPY_FINLINE __m256 -fma_invert_mask_ps(__m256 ymask) -{ - return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0)); -} - -NPY_FINLINE __m256i -fma_invert_mask_pd(__m256i ymask) -{ - return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF)); -} - -NPY_FINLINE __m256 fma_get_exponent(__m256 x) { /* @@ -321,18 +309,6 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask) return _mm512_mask_mov_ps(x, ymask, y); } -NPY_FINLINE __mmask16 -avx512_invert_mask_ps(__mmask16 ymask) -{ - return _mm512_knot(ymask); -} - -NPY_FINLINE __mmask8 -avx512_invert_mask_pd(__mmask8 ymask) -{ - return _mm512_knot(ymask); -} - NPY_FINLINE __m512 avx512_get_exponent(__m512 x) { @@ -384,7 +360,7 @@ avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3, /******************************************************************************** ** Defining the SIMD kernels ********************************************************************************/ -#line 396 +#line 372 #ifdef SIMD_AVX2_FMA3 /* * Vectorized Cody-Waite range reduction technique @@ 
-683,7 +659,7 @@ simd_log_FLOAT(npy_float * op, } #endif // SIMD_AVX2_FMA3 -#line 396 +#line 372 #ifdef SIMD_AVX512F /* * Vectorized Cody-Waite range reduction technique @@ -984,7 +960,7 @@ simd_log_FLOAT(npy_float * op, #if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) -#line 700 +#line 676 static void simd_exp_f64(const npyv_lanetype_f64 *src, npy_intp ssrc, npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len) @@ -1015,7 +991,7 @@ simd_exp_f64(const npyv_lanetype_f64 *src, npy_intp ssrc, npyv_cleanup(); } -#line 700 +#line 676 static void simd_log_f64(const npyv_lanetype_f64 *src, npy_intp ssrc, npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len) @@ -1298,49 +1274,49 @@ AVX512F_log_DOUBLE(npy_double * op, __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]); /* Load lookup table data */ - #line 985 + #line 961 __m512d mLUT_TOP_0 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*0])); __m512d mLUT_TAIL_0 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*0])); -#line 985 +#line 961 __m512d mLUT_TOP_1 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*1])); __m512d mLUT_TAIL_1 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*1])); -#line 985 +#line 961 __m512d mLUT_TOP_2 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*2])); __m512d mLUT_TAIL_2 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*2])); -#line 985 +#line 961 __m512d mLUT_TOP_3 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*3])); __m512d mLUT_TAIL_3 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*3])); -#line 985 +#line 961 __m512d mLUT_TOP_4 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*4])); __m512d mLUT_TAIL_4 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*4])); -#line 985 +#line 961 __m512d mLUT_TOP_5 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*5])); __m512d mLUT_TAIL_5 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*5])); -#line 985 +#line 961 __m512d mLUT_TOP_6 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*6])); __m512d mLUT_TAIL_6 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*6])); -#line 985 +#line 961 __m512d mLUT_TOP_7 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*7])); __m512d mLUT_TAIL_7 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*7])); @@ -1487,7 +1463,7 @@ AVX512F_log_DOUBLE(npy_double * op, #endif // NPY_CAN_LINK_SVML #ifdef SIMD_AVX512_SKX -#line 1149 +#line 1125 static inline void AVX512_SKX_ldexp_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps) { @@ -1634,7 +1610,7 @@ AVX512_SKX_frexp_FLOAT(char **args, npy_intp const *dimensions, npy_intp const * } } -#line 1149 +#line 1125 static inline void AVX512_SKX_ldexp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps) { @@ -1787,7 +1763,7 @@ AVX512_SKX_frexp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const /******************************************************************************** ** Defining ufunc inner functions ********************************************************************************/ -#line 1305 +#line 1281 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_exp) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -1816,7 +1792,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_exp) #endif } -#line 1305 +#line 1281 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -1846,7 +1822,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log) } -#line 1338 +#line 1314 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_exp) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -1879,7 +1855,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_exp) } -#line 1338 +#line 1314 
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -1913,7 +1889,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log) -#line 1378 +#line 1354 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_frexp) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1945,7 +1921,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_ldexp) } } -#line 1378 +#line 1354 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_frexp) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src index 1fac3c150c..85dac9c20d 100644 --- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src +++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src @@ -124,18 +124,6 @@ fma_blend(__m256 x, __m256 y, __m256 ymask) } NPY_FINLINE __m256 -fma_invert_mask_ps(__m256 ymask) -{ - return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0)); -} - -NPY_FINLINE __m256i -fma_invert_mask_pd(__m256i ymask) -{ - return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF)); -} - -NPY_FINLINE __m256 fma_get_exponent(__m256 x) { /* @@ -311,18 +299,6 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask) return _mm512_mask_mov_ps(x, ymask, y); } -NPY_FINLINE __mmask16 -avx512_invert_mask_ps(__mmask16 ymask) -{ - return _mm512_knot(ymask); -} - -NPY_FINLINE __mmask8 -avx512_invert_mask_pd(__mmask8 ymask) -{ - return _mm512_knot(ymask); -} - NPY_FINLINE __m512 avx512_get_exponent(__m512 x) { diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c index ad8c1ef397..97a78b0e12 100644 --- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c +++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c @@ -320,7 +320,8 @@ simd_binary_ccc_max_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2, } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_max_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1, const npyv_lanetype_s8 *ip2, npy_intp sip2, @@ -483,7 +484,8 @@ simd_binary_ccc_min_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2, } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_min_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1, const npyv_lanetype_s8 *ip2, npy_intp sip2, @@ -646,7 +648,8 @@ simd_binary_ccc_maxp_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2 } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_maxp_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1, const npyv_lanetype_s8 *ip2, npy_intp sip2, @@ -809,7 +812,8 @@ simd_binary_ccc_minp_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2 } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm 
static inline void simd_binary_minp_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1, const npyv_lanetype_s8 *ip2, npy_intp sip2, @@ -974,7 +978,8 @@ simd_binary_ccc_max_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2, } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_max_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1, const npyv_lanetype_u8 *ip2, npy_intp sip2, @@ -1137,7 +1142,8 @@ simd_binary_ccc_min_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2, } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_min_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1, const npyv_lanetype_u8 *ip2, npy_intp sip2, @@ -1300,7 +1306,8 @@ simd_binary_ccc_maxp_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2 } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_maxp_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1, const npyv_lanetype_u8 *ip2, npy_intp sip2, @@ -1463,7 +1470,8 @@ simd_binary_ccc_minp_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2 } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_minp_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1, const npyv_lanetype_u8 *ip2, npy_intp sip2, @@ -1628,7 +1636,8 @@ simd_binary_ccc_max_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_max_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1, const npyv_lanetype_s16 *ip2, npy_intp sip2, @@ -1791,7 +1800,8 @@ simd_binary_ccc_min_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_min_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1, const npyv_lanetype_s16 *ip2, npy_intp sip2, @@ -1954,7 +1964,8 @@ simd_binary_ccc_maxp_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_maxp_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1, const npyv_lanetype_s16 *ip2, npy_intp sip2, @@ -2117,7 +2128,8 @@ simd_binary_ccc_minp_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_minp_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1, const npyv_lanetype_s16 *ip2, npy_intp sip2, @@ -2282,7 +2294,8 @@ simd_binary_ccc_max_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than 
non-contiguous vector load/store on Arm static inline void simd_binary_max_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1, const npyv_lanetype_u16 *ip2, npy_intp sip2, @@ -2445,7 +2458,8 @@ simd_binary_ccc_min_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_min_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1, const npyv_lanetype_u16 *ip2, npy_intp sip2, @@ -2608,7 +2622,8 @@ simd_binary_ccc_maxp_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_maxp_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1, const npyv_lanetype_u16 *ip2, npy_intp sip2, @@ -2771,7 +2786,8 @@ simd_binary_ccc_minp_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_minp_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1, const npyv_lanetype_u16 *ip2, npy_intp sip2, @@ -2936,7 +2952,8 @@ simd_binary_ccc_max_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_max_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1, const npyv_lanetype_s32 *ip2, npy_intp sip2, @@ -3099,7 +3116,8 @@ simd_binary_ccc_min_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_min_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1, const npyv_lanetype_s32 *ip2, npy_intp sip2, @@ -3262,7 +3280,8 @@ simd_binary_ccc_maxp_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_maxp_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1, const npyv_lanetype_s32 *ip2, npy_intp sip2, @@ -3425,7 +3444,8 @@ simd_binary_ccc_minp_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_minp_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1, const npyv_lanetype_s32 *ip2, npy_intp sip2, @@ -3590,7 +3610,8 @@ simd_binary_ccc_max_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_max_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1, const npyv_lanetype_u32 *ip2, npy_intp sip2, @@ -3753,7 +3774,8 @@ simd_binary_ccc_min_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && 
!defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_min_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1, const npyv_lanetype_u32 *ip2, npy_intp sip2, @@ -3916,7 +3938,8 @@ simd_binary_ccc_maxp_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_maxp_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1, const npyv_lanetype_u32 *ip2, npy_intp sip2, @@ -4079,7 +4102,8 @@ simd_binary_ccc_minp_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_minp_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1, const npyv_lanetype_u32 *ip2, npy_intp sip2, @@ -4244,7 +4268,8 @@ simd_binary_ccc_max_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_max_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1, const npyv_lanetype_s64 *ip2, npy_intp sip2, @@ -4407,7 +4432,8 @@ simd_binary_ccc_min_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_min_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1, const npyv_lanetype_s64 *ip2, npy_intp sip2, @@ -4570,7 +4596,8 @@ simd_binary_ccc_maxp_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_maxp_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1, const npyv_lanetype_s64 *ip2, npy_intp sip2, @@ -4733,7 +4760,8 @@ simd_binary_ccc_minp_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_minp_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1, const npyv_lanetype_s64 *ip2, npy_intp sip2, @@ -4898,7 +4926,8 @@ simd_binary_ccc_max_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_max_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1, const npyv_lanetype_u64 *ip2, npy_intp sip2, @@ -5061,7 +5090,8 @@ simd_binary_ccc_min_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *i } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_min_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1, const npyv_lanetype_u64 *ip2, npy_intp sip2, @@ -5224,7 +5254,8 @@ simd_binary_ccc_maxp_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 * } } // non-contiguous 
for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_maxp_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1, const npyv_lanetype_u64 *ip2, npy_intp sip2, @@ -5387,7 +5418,8 @@ simd_binary_ccc_minp_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 * } } // non-contiguous for float 32/64-bit memory access -#if 0 +#if 0 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_minp_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1, const npyv_lanetype_u64 *ip2, npy_intp sip2, @@ -5552,7 +5584,8 @@ simd_binary_ccc_max_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *i } } // non-contiguous for float 32/64-bit memory access -#if 1 +#if 1 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_max_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1, const npyv_lanetype_f32 *ip2, npy_intp sip2, @@ -5715,7 +5748,8 @@ simd_binary_ccc_min_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *i } } // non-contiguous for float 32/64-bit memory access -#if 1 +#if 1 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_min_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1, const npyv_lanetype_f32 *ip2, npy_intp sip2, @@ -5878,7 +5912,8 @@ simd_binary_ccc_maxp_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 * } } // non-contiguous for float 32/64-bit memory access -#if 1 +#if 1 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_maxp_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1, const npyv_lanetype_f32 *ip2, npy_intp sip2, @@ -6041,7 +6076,8 @@ simd_binary_ccc_minp_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 * } } // non-contiguous for float 32/64-bit memory access -#if 1 +#if 1 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_minp_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1, const npyv_lanetype_f32 *ip2, npy_intp sip2, @@ -6206,7 +6242,8 @@ simd_binary_ccc_max_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *i } } // non-contiguous for float 32/64-bit memory access -#if 1 +#if 1 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_max_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1, const npyv_lanetype_f64 *ip2, npy_intp sip2, @@ -6369,7 +6406,8 @@ simd_binary_ccc_min_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *i } } // non-contiguous for float 32/64-bit memory access -#if 1 +#if 1 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_min_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1, const npyv_lanetype_f64 *ip2, npy_intp sip2, @@ -6532,7 +6570,8 @@ simd_binary_ccc_maxp_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 * } } // non-contiguous for float 32/64-bit memory access -#if 1 +#if 1 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_maxp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1, const npyv_lanetype_f64 *ip2, npy_intp sip2, @@ -6695,7 +6734,8 @@ simd_binary_ccc_minp_f64(const npyv_lanetype_f64 
*ip1, const npyv_lanetype_f64 * } } // non-contiguous for float 32/64-bit memory access -#if 1 +#if 1 && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1, const npyv_lanetype_f64 *ip2, npy_intp sip2, @@ -6744,10 +6784,10 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1, /******************************************************************************* ** Defining ufunc inner functions ******************************************************************************/ -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_BYTE == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -6763,7 +6803,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1, #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_BYTE == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -6779,7 +6819,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1, #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_BYTE == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -6795,7 +6835,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1, #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_BYTE == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -6813,7 +6853,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1, #endif -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_max_i @@ -6921,22 +6961,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1)); npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2)); *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1)); npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2)); *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1)); npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2)); *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1)); npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2)); *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -6988,7 +7028,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_min_i @@ -7096,22 +7136,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1)); npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2)); *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1)); npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2)); *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1)); npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2)); *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1)); npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2)); *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -7163,7 +7203,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_maxp_i @@ -7271,22 +7311,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1)); npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2)); *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1)); npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2)); *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1)); npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2)); *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1)); npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2)); *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -7338,7 +7378,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_minp_i @@ -7446,22 +7486,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1)); npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2)); *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1)); npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2)); *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1)); npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2)); *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1)); npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2)); *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -7514,10 +7554,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_SHORT == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -7533,7 +7573,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_SHORT == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -7549,7 +7589,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_SHORT == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -7565,7 +7605,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_SHORT == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -7583,7 +7623,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_max_i @@ -7691,22 +7731,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1)); npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2)); *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1)); npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2)); *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1)); npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2)); *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1)); npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2)); *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -7758,7 +7798,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_min_i @@ -7866,22 +7906,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1)); npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2)); *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1)); npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2)); *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1)); npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2)); *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1)); npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2)); *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -7933,7 +7973,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_maxp_i @@ -8041,22 +8081,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1)); npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2)); *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1)); npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2)); *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1)); npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2)); *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1)); npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2)); *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -8108,7 +8148,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_minp_i @@ -8216,22 +8256,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1)); npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2)); *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1)); npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2)); *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1)); npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2)); *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1)); npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2)); *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -8284,10 +8324,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_INT == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -8303,7 +8343,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_INT == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -8319,7 +8359,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_INT == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -8335,7 +8375,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_INT == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -8353,7 +8393,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_max_i @@ -8461,22 +8501,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1)); npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2)); *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1)); npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2)); *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1)); npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2)); *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1)); npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2)); *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -8528,7 +8568,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_min_i @@ -8636,22 +8676,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1)); npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2)); *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1)); npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2)); *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1)); npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2)); *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1)); npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2)); *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -8703,7 +8743,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_maxp_i @@ -8811,22 +8851,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1)); npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2)); *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1)); npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2)); *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1)); npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2)); *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1)); npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2)); *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -8878,7 +8918,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_minp_i @@ -8986,22 +9026,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1)); npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2)); *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1)); npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2)); *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1)); npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2)); *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1)); npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2)); *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -9054,10 +9094,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONG == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -9073,7 +9113,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONG == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -9089,7 +9129,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONG == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -9105,7 +9145,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONG == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -9123,7 +9163,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_max_i @@ -9231,22 +9271,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1)); npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2)); *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1)); npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2)); *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1)); npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2)); *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1)); npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2)); *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -9298,7 +9338,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_min_i @@ -9406,22 +9446,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1)); npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2)); *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1)); npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2)); *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1)); npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2)); *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1)); npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2)); *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -9473,7 +9513,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_maxp_i @@ -9581,22 +9621,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1)); npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2)); *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1)); npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2)); *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1)); npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2)); *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1)); npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2)); *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -9648,7 +9688,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_minp_i @@ -9756,22 +9796,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1)); npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2)); *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1)); npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2)); *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1)); npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2)); *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1)); npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2)); *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -9824,10 +9864,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -9843,7 +9883,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -9859,7 +9899,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -9875,7 +9915,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -9893,7 +9933,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_max_i @@ -10001,22 +10041,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1)); npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2)); *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1)); npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2)); *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1)); npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2)); *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1)); npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2)); *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -10068,7 +10108,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_min_i @@ -10176,22 +10216,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1)); npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2)); *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1)); npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2)); *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1)); npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2)); *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1)); npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2)); *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -10243,7 +10283,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_maxp_i @@ -10351,22 +10391,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1)); npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2)); *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1)); npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2)); *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1)); npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2)); *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1)); npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2)); *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -10418,7 +10458,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_minp_i @@ -10526,22 +10566,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1)); npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2)); *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1)); npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2)); *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1)); npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2)); *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1)); npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2)); *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -10594,10 +10634,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_BYTE == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -10613,7 +10653,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_BYTE == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -10629,7 +10669,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_BYTE == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -10645,7 +10685,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_BYTE == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -10663,7 +10703,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_max_i @@ -10771,22 +10811,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1)); npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2)); *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1)); npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2)); *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1)); npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2)); *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1)); npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2)); *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -10838,7 +10878,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_min_i @@ -10946,22 +10986,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1)); npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2)); *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1)); npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2)); *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1)); npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2)); *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1)); npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2)); *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -11013,7 +11053,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_maxp_i @@ -11121,22 +11161,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1)); npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2)); *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1)); npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2)); *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1)); npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2)); *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1)); npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2)); *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -11188,7 +11228,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_minp_i @@ -11296,22 +11336,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1)); npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2)); *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1)); npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2)); *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1)); npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2)); *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1)); npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2)); *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -11364,10 +11404,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_SHORT == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -11383,7 +11423,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_SHORT == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -11399,7 +11439,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_SHORT == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -11415,7 +11455,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_SHORT == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -11433,7 +11473,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_max_i @@ -11541,22 +11581,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1)); npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2)); *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1)); npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2)); *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1)); npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2)); *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1)); npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2)); *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -11608,7 +11648,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_min_i @@ -11716,22 +11756,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1)); npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2)); *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1)); npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2)); *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1)); npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2)); *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1)); npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2)); *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -11783,7 +11823,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_maxp_i @@ -11891,22 +11931,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1)); npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2)); *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1)); npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2)); *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1)); npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2)); *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1)); npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2)); *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -11958,7 +11998,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_minp_i @@ -12066,22 +12106,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1)); npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2)); *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1)); npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2)); *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1)); npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2)); *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1)); npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2)); *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -12134,10 +12174,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_INT == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -12153,7 +12193,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_INT == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -12169,7 +12209,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_INT == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -12185,7 +12225,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_INT == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -12203,7 +12243,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_max_i @@ -12311,22 +12351,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1)); npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2)); *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1)); npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2)); *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1)); npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2)); *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1)); npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2)); *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -12378,7 +12418,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_min_i @@ -12486,22 +12526,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1)); npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2)); *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1)); npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2)); *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1)); npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2)); *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1)); npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2)); *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -12553,7 +12593,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_maxp_i @@ -12661,22 +12701,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1)); npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2)); *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1)); npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2)); *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1)); npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2)); *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1)); npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2)); *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -12728,7 +12768,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_minp_i @@ -12836,22 +12876,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1)); npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2)); *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1)); npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2)); *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1)); npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2)); *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1)); npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2)); *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -12904,10 +12944,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONG == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -12923,7 +12963,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONG == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -12939,7 +12979,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONG == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -12955,7 +12995,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONG == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -12973,7 +13013,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_max_i @@ -13081,22 +13121,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1)); npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2)); *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1)); npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2)); *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1)); npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2)); *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1)); npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2)); *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -13148,7 +13188,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_min_i @@ -13256,22 +13296,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1)); npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2)); *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1)); npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2)); *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1)); npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2)); *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1)); npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2)); *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -13323,7 +13363,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_maxp_i @@ -13431,22 +13471,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1)); npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2)); *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1)); npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2)); *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1)); npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2)); *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1)); npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2)); *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -13498,7 +13538,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_minp_i @@ -13606,22 +13646,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1)); npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2)); *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1)); npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2)); *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1)); npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2)); *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1)); npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2)); *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -13674,10 +13714,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -13693,7 +13733,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -13709,7 +13749,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -13725,7 +13765,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -13743,7 +13783,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_max_i @@ -13851,22 +13891,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1)); npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2)); *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1)); npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2)); *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1)); npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2)); *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1)); npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2)); *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -13918,7 +13958,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (0 && 0) #define SCALAR_OP scalar_min_i @@ -14026,22 +14066,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1)); npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2)); *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1)); npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2)); *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1)); npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2)); *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1)); npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2)); *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -14093,7 +14133,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_maxp_i @@ -14201,22 +14241,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1)); npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2)); *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1)); npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2)); *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1)); npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2)); *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1)); npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2)); *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -14268,7 +14308,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (0 && 1) #define SCALAR_OP scalar_minp_i @@ -14376,22 +14416,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1)); npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2)); *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1)); npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2)); *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1)); npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2)); *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1)); npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2)); *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -14444,10 +14484,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_FLOAT == 8 #if 1 #define TO_SIMD_SFX(X) X##_f8 @@ -14463,7 +14503,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_FLOAT == 16 #if 1 #define TO_SIMD_SFX(X) X##_f16 @@ -14479,7 +14519,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_FLOAT == 32 #if 1 #define TO_SIMD_SFX(X) X##_f32 @@ -14495,7 +14535,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_FLOAT == 64 #if 1 #define TO_SIMD_SFX(X) X##_f64 @@ -14513,7 +14553,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (1 && 0) #define SCALAR_OP scalar_max_f @@ -14621,22 +14661,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1)); npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2)); *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1)); npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2)); *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1)); npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2)); *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1)); npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2)); *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -14688,7 +14728,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (1 && 0) #define SCALAR_OP scalar_min_f @@ -14796,22 +14836,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1)); npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2)); *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1)); npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2)); *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1)); npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2)); *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1)); npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2)); *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -14863,7 +14903,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (1 && 1) #define SCALAR_OP scalar_maxp_f @@ -14971,22 +15011,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1)); npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2)); *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1)); npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2)); *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1)); npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2)); *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1)); npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2)); *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -15038,7 +15078,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (1 && 1) #define SCALAR_OP scalar_minp_f @@ -15146,22 +15186,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1)); npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2)); *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1)); npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2)); *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1)); npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2)); *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1)); npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2)); *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -15214,10 +15254,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8 #if 1 #define TO_SIMD_SFX(X) X##_f8 @@ -15233,7 +15273,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16 #if 1 #define TO_SIMD_SFX(X) X##_f16 @@ -15249,7 +15289,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32 #if 1 #define TO_SIMD_SFX(X) X##_f32 @@ -15265,7 +15305,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64 #if 1 #define TO_SIMD_SFX(X) X##_f64 @@ -15283,7 +15323,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (1 && 0) #define SCALAR_OP scalar_max_d @@ -15391,22 +15431,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1)); npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2)); *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1)); npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2)); *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1)); npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2)); *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1)); npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2)); *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -15458,7 +15498,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (1 && 0) #define SCALAR_OP scalar_min_d @@ -15566,22 +15606,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1)); npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2)); *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1)); npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2)); *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1)); npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2)); *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1)); npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2)); *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -15633,7 +15673,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (1 && 1) #define SCALAR_OP scalar_maxp_d @@ -15741,22 +15781,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1)); npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2)); *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1)); npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2)); *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1)); npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2)); *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1)); npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2)); *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -15808,7 +15848,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (1 && 1) #define SCALAR_OP scalar_minp_d @@ -15916,22 +15956,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1)); npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2)); *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1)); npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2)); *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1)); npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2)); *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1)); npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2)); *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -15984,10 +16024,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 293 +#line 294 #undef TO_SIMD_SFX #if 0 -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 8 #if 1 #define TO_SIMD_SFX(X) X##_f8 @@ -16003,7 +16043,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed) #define TO_SIMD_SFX(X) X##_s8 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 16 #if 1 #define TO_SIMD_SFX(X) X##_f16 @@ -16019,7 +16059,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed) #define TO_SIMD_SFX(X) X##_s16 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 32 #if 1 #define TO_SIMD_SFX(X) X##_f32 @@ -16035,7 +16075,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed) #define TO_SIMD_SFX(X) X##_s32 #endif -#line 298 +#line 299 #elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 64 #if 1 #define TO_SIMD_SFX(X) X##_f64 @@ -16053,7 +16093,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed) #endif -#line 320 +#line 321 #if !0 || (1 && 0) #define SCALAR_OP scalar_max_l @@ -16161,22 +16201,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_maximum) * result of iteration 1. */ - #line 430 + #line 431 npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1)); npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2)); *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1)); npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2)); *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1)); npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2)); *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1)); npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2)); *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -16228,7 +16268,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_maximum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !0 || (1 && 0) #define SCALAR_OP scalar_min_l @@ -16336,22 +16376,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_minimum) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1)); npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2)); *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1)); npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2)); *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1)); npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2)); *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1)); npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2)); *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -16403,7 +16443,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_minimum_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (1 && 1) #define SCALAR_OP scalar_maxp_l @@ -16511,22 +16551,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmax) * result of iteration 1. */ - #line 430 + #line 431 npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1)); npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2)); *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1)); npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2)); *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1)); npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2)); *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1)); npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2)); *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); @@ -16578,7 +16618,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmax_indexed) #endif // !fp_only || (is_fp && fp_only) -#line 320 +#line 321 #if !1 || (1 && 1) #define SCALAR_OP scalar_minp_l @@ -16686,22 +16726,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmin) * result of iteration 1. 
*/ - #line 430 + #line 431 npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1)); npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2)); *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); -#line 430 +#line 431 npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1)); npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2)); *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1); -#line 430 +#line 431 npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1)); npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2)); *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2); -#line 430 +#line 431 npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1)); npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2)); *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3); diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src index 236e2e2eb7..319072c01f 100644 --- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -225,7 +225,8 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety } } // non-contiguous for float 32/64-bit memory access -#if @is_fp@ +#if @is_fp@ && !defined(NPY_HAVE_NEON) +// unroll scalars faster than non-contiguous vector load/store on Arm static inline void simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1, const npyv_lanetype_@sfx@ *ip2, npy_intp sip2, diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c index 9d9bc64a16..30ce938d66 100644 --- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c +++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c @@ -26,8 +26,8 @@ * when there's no native FUSED support instead of fallback to libc */ #if NPY_SIMD_FMA3 // native support -#line 23 -#if NPY_SIMD_F64 +#line 24 +#if NPY_SIMD_F64 && 0 /* * Vectorized Cody-Waite range reduction technique * Performs the reduction step x* = x - y*C in three steps: @@ -46,8 +46,8 @@ simd_range_reduction_f64(npyv_f64 x, npyv_f64 y, npyv_f64 c1, npyv_f64 c2, npyv_ } #endif -#line 23 -#if NPY_SIMD_F32 +#line 24 +#if NPY_SIMD_F32 && 1 /* * Vectorized Cody-Waite range reduction technique * Performs the reduction step x* = x - y*C in three steps: @@ -66,9 +66,11 @@ simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_ } #endif - -#if NPY_SIMD_F64 -#line 47 +/* Disable SIMD code and revert to libm: see + * https://mail.python.org/archives/list/numpy-discussion@python.org/thread/C6EYZZSR4EWGVKHAZXLE7IBILRMNVK7L/ + * for detailed discussion on this*/ +#if 0 // NPY_SIMD_F64 +#line 50 #if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN) NPY_FINLINE npyv_f64 #else @@ -90,7 +92,7 @@ simd_cos_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits) return npyv_loada_f64(out_copy); } -#line 47 +#line 50 #if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN) NPY_FINLINE npyv_f64 #else @@ -208,7 +210,7 @@ simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign) return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd)); } -#line 167 +#line 170 NPY_FINLINE void simd_cos_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len) { 
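/*
 * A hedged illustration (not part of the patch): the hunks above change the
 * double-precision branch of this file from `#if NPY_SIMD_F64` to
 * `#if 0 // NPY_SIMD_F64`, so the vectorized Cody-Waite sin/cos path is
 * compiled out for f64 and, as the added comment says, double-precision
 * sin/cos revert to libm; the f32 branch (`&& 1`) keeps its SIMD path.
 * The standalone sketch below only shows the shape of such a scalar,
 * stride-aware fallback; it is not NumPy's actual inner loop, and the
 * function and parameter names are illustrative.
 */
#include <math.h>
#include <stddef.h>

static void
scalar_cos_fallback(const char *src, ptrdiff_t ssrc,
                    char *dst, ptrdiff_t sdst, ptrdiff_t len)
{
    /* strides are in bytes, like the steps handed to a ufunc inner loop */
    for (ptrdiff_t i = 0; i < len; ++i, src += ssrc, dst += sdst) {
        const double x = *(const double *)src;
        *(double *)dst = cos(x);   /* libm now performs the range reduction */
    }
}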
@@ -254,7 +256,7 @@ simd_cos_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_i npyv_cleanup(); } -#line 167 +#line 170 NPY_FINLINE void simd_sin_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len) { @@ -473,7 +475,7 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, #endif // NPY_SIMD_FP32 #endif // NYP_SIMD_FMA3 -#line 388 +#line 391 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_cos) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -507,7 +509,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_cos) #endif } -#line 388 +#line 391 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sin) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -542,7 +544,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sin) } -#line 426 +#line 429 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -572,7 +574,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin) #endif } -#line 426 +#line 429 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_cos) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src index f07cb70f39..31de906098 100644 --- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src +++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src @@ -19,8 +19,9 @@ /**begin repeat * #check = F64, F32# * #sfx = f64, f32# + * #enable = 0, 1# */ -#if NPY_SIMD_@check@ +#if NPY_SIMD_@check@ && @enable@ /* * Vectorized Cody-Waite range reduction technique * Performs the reduction step x* = x - y*C in three steps: @@ -39,8 +40,10 @@ simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@ } #endif /**end repeat**/ - -#if NPY_SIMD_F64 +/* Disable SIMD code and revert to libm: see + * https://mail.python.org/archives/list/numpy-discussion@python.org/thread/C6EYZZSR4EWGVKHAZXLE7IBILRMNVK7L/ + * for detailed discussion on this*/ +#if 0 // NPY_SIMD_F64 /**begin repeat * #op = cos, sin# */ diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c index 3ea2747d9e..b2d3b0976a 100644 --- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c +++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c @@ -604,6 +604,8 @@ simd_unary_nc_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 static NPY_INLINE void simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride, npyv_lanetype_s8 *op, npy_intp ostride, @@ -614,112 +616,112 @@ simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride, // unrolled vector loop for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - #line 211 + #line 213 #if UNROLL > 0 npyv_s8 v_0 = npyv_loadn_s8(ip + 0 * vstep * istride, istride); npyv_s8 r_0 = npyv_negative_s8(v_0); npyv_storen_s8(op + 0 * vstep * ostride, ostride, r_0); #endif -#line 211 +#line 213 #if UNROLL > 1 npyv_s8 v_1 = npyv_loadn_s8(ip + 1 * vstep * istride, istride); 
npyv_s8 r_1 = npyv_negative_s8(v_1); npyv_storen_s8(op + 1 * vstep * ostride, ostride, r_1); #endif -#line 211 +#line 213 #if UNROLL > 2 npyv_s8 v_2 = npyv_loadn_s8(ip + 2 * vstep * istride, istride); npyv_s8 r_2 = npyv_negative_s8(v_2); npyv_storen_s8(op + 2 * vstep * ostride, ostride, r_2); #endif -#line 211 +#line 213 #if UNROLL > 3 npyv_s8 v_3 = npyv_loadn_s8(ip + 3 * vstep * istride, istride); npyv_s8 r_3 = npyv_negative_s8(v_3); npyv_storen_s8(op + 3 * vstep * ostride, ostride, r_3); #endif -#line 211 +#line 213 #if UNROLL > 4 npyv_s8 v_4 = npyv_loadn_s8(ip + 4 * vstep * istride, istride); npyv_s8 r_4 = npyv_negative_s8(v_4); npyv_storen_s8(op + 4 * vstep * ostride, ostride, r_4); #endif -#line 211 +#line 213 #if UNROLL > 5 npyv_s8 v_5 = npyv_loadn_s8(ip + 5 * vstep * istride, istride); npyv_s8 r_5 = npyv_negative_s8(v_5); npyv_storen_s8(op + 5 * vstep * ostride, ostride, r_5); #endif -#line 211 +#line 213 #if UNROLL > 6 npyv_s8 v_6 = npyv_loadn_s8(ip + 6 * vstep * istride, istride); npyv_s8 r_6 = npyv_negative_s8(v_6); npyv_storen_s8(op + 6 * vstep * ostride, ostride, r_6); #endif -#line 211 +#line 213 #if UNROLL > 7 npyv_s8 v_7 = npyv_loadn_s8(ip + 7 * vstep * istride, istride); npyv_s8 r_7 = npyv_negative_s8(v_7); npyv_storen_s8(op + 7 * vstep * ostride, ostride, r_7); #endif -#line 211 +#line 213 #if UNROLL > 8 npyv_s8 v_8 = npyv_loadn_s8(ip + 8 * vstep * istride, istride); npyv_s8 r_8 = npyv_negative_s8(v_8); npyv_storen_s8(op + 8 * vstep * ostride, ostride, r_8); #endif -#line 211 +#line 213 #if UNROLL > 9 npyv_s8 v_9 = npyv_loadn_s8(ip + 9 * vstep * istride, istride); npyv_s8 r_9 = npyv_negative_s8(v_9); npyv_storen_s8(op + 9 * vstep * ostride, ostride, r_9); #endif -#line 211 +#line 213 #if UNROLL > 10 npyv_s8 v_10 = npyv_loadn_s8(ip + 10 * vstep * istride, istride); npyv_s8 r_10 = npyv_negative_s8(v_10); npyv_storen_s8(op + 10 * vstep * ostride, ostride, r_10); #endif -#line 211 +#line 213 #if UNROLL > 11 npyv_s8 v_11 = npyv_loadn_s8(ip + 11 * vstep * istride, istride); npyv_s8 r_11 = npyv_negative_s8(v_11); npyv_storen_s8(op + 11 * vstep * ostride, ostride, r_11); #endif -#line 211 +#line 213 #if UNROLL > 12 npyv_s8 v_12 = npyv_loadn_s8(ip + 12 * vstep * istride, istride); npyv_s8 r_12 = npyv_negative_s8(v_12); npyv_storen_s8(op + 12 * vstep * ostride, ostride, r_12); #endif -#line 211 +#line 213 #if UNROLL > 13 npyv_s8 v_13 = npyv_loadn_s8(ip + 13 * vstep * istride, istride); npyv_s8 r_13 = npyv_negative_s8(v_13); npyv_storen_s8(op + 13 * vstep * ostride, ostride, r_13); #endif -#line 211 +#line 213 #if UNROLL > 14 npyv_s8 v_14 = npyv_loadn_s8(ip + 14 * vstep * istride, istride); npyv_s8 r_14 = npyv_negative_s8(v_14); npyv_storen_s8(op + 14 * vstep * ostride, ostride, r_14); #endif -#line 211 +#line 213 #if UNROLL > 15 npyv_s8 v_15 = npyv_loadn_s8(ip + 15 * vstep * istride, istride); npyv_s8 r_15 = npyv_negative_s8(v_15); @@ -738,6 +740,7 @@ simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride, *op = scalar_negative(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // 0 #undef UNROLL #endif // NPY_SIMD @@ -1167,6 +1170,8 @@ simd_unary_nc_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 static NPY_INLINE void simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride, npyv_lanetype_u8 *op, npy_intp ostride, @@ -1177,112 +1182,112 @@ simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride, // unrolled 
vector loop for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - #line 211 + #line 213 #if UNROLL > 0 npyv_u8 v_0 = npyv_loadn_u8(ip + 0 * vstep * istride, istride); npyv_u8 r_0 = npyv_negative_u8(v_0); npyv_storen_u8(op + 0 * vstep * ostride, ostride, r_0); #endif -#line 211 +#line 213 #if UNROLL > 1 npyv_u8 v_1 = npyv_loadn_u8(ip + 1 * vstep * istride, istride); npyv_u8 r_1 = npyv_negative_u8(v_1); npyv_storen_u8(op + 1 * vstep * ostride, ostride, r_1); #endif -#line 211 +#line 213 #if UNROLL > 2 npyv_u8 v_2 = npyv_loadn_u8(ip + 2 * vstep * istride, istride); npyv_u8 r_2 = npyv_negative_u8(v_2); npyv_storen_u8(op + 2 * vstep * ostride, ostride, r_2); #endif -#line 211 +#line 213 #if UNROLL > 3 npyv_u8 v_3 = npyv_loadn_u8(ip + 3 * vstep * istride, istride); npyv_u8 r_3 = npyv_negative_u8(v_3); npyv_storen_u8(op + 3 * vstep * ostride, ostride, r_3); #endif -#line 211 +#line 213 #if UNROLL > 4 npyv_u8 v_4 = npyv_loadn_u8(ip + 4 * vstep * istride, istride); npyv_u8 r_4 = npyv_negative_u8(v_4); npyv_storen_u8(op + 4 * vstep * ostride, ostride, r_4); #endif -#line 211 +#line 213 #if UNROLL > 5 npyv_u8 v_5 = npyv_loadn_u8(ip + 5 * vstep * istride, istride); npyv_u8 r_5 = npyv_negative_u8(v_5); npyv_storen_u8(op + 5 * vstep * ostride, ostride, r_5); #endif -#line 211 +#line 213 #if UNROLL > 6 npyv_u8 v_6 = npyv_loadn_u8(ip + 6 * vstep * istride, istride); npyv_u8 r_6 = npyv_negative_u8(v_6); npyv_storen_u8(op + 6 * vstep * ostride, ostride, r_6); #endif -#line 211 +#line 213 #if UNROLL > 7 npyv_u8 v_7 = npyv_loadn_u8(ip + 7 * vstep * istride, istride); npyv_u8 r_7 = npyv_negative_u8(v_7); npyv_storen_u8(op + 7 * vstep * ostride, ostride, r_7); #endif -#line 211 +#line 213 #if UNROLL > 8 npyv_u8 v_8 = npyv_loadn_u8(ip + 8 * vstep * istride, istride); npyv_u8 r_8 = npyv_negative_u8(v_8); npyv_storen_u8(op + 8 * vstep * ostride, ostride, r_8); #endif -#line 211 +#line 213 #if UNROLL > 9 npyv_u8 v_9 = npyv_loadn_u8(ip + 9 * vstep * istride, istride); npyv_u8 r_9 = npyv_negative_u8(v_9); npyv_storen_u8(op + 9 * vstep * ostride, ostride, r_9); #endif -#line 211 +#line 213 #if UNROLL > 10 npyv_u8 v_10 = npyv_loadn_u8(ip + 10 * vstep * istride, istride); npyv_u8 r_10 = npyv_negative_u8(v_10); npyv_storen_u8(op + 10 * vstep * ostride, ostride, r_10); #endif -#line 211 +#line 213 #if UNROLL > 11 npyv_u8 v_11 = npyv_loadn_u8(ip + 11 * vstep * istride, istride); npyv_u8 r_11 = npyv_negative_u8(v_11); npyv_storen_u8(op + 11 * vstep * ostride, ostride, r_11); #endif -#line 211 +#line 213 #if UNROLL > 12 npyv_u8 v_12 = npyv_loadn_u8(ip + 12 * vstep * istride, istride); npyv_u8 r_12 = npyv_negative_u8(v_12); npyv_storen_u8(op + 12 * vstep * ostride, ostride, r_12); #endif -#line 211 +#line 213 #if UNROLL > 13 npyv_u8 v_13 = npyv_loadn_u8(ip + 13 * vstep * istride, istride); npyv_u8 r_13 = npyv_negative_u8(v_13); npyv_storen_u8(op + 13 * vstep * ostride, ostride, r_13); #endif -#line 211 +#line 213 #if UNROLL > 14 npyv_u8 v_14 = npyv_loadn_u8(ip + 14 * vstep * istride, istride); npyv_u8 r_14 = npyv_negative_u8(v_14); npyv_storen_u8(op + 14 * vstep * ostride, ostride, r_14); #endif -#line 211 +#line 213 #if UNROLL > 15 npyv_u8 v_15 = npyv_loadn_u8(ip + 15 * vstep * istride, istride); npyv_u8 r_15 = npyv_negative_u8(v_15); @@ -1301,6 +1306,7 @@ simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride, *op = scalar_negative(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // 0 #undef UNROLL #endif // NPY_SIMD @@ -1730,6 +1736,8 @@ simd_unary_nc_negative_s16(const npyv_lanetype_s16 
*ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 static NPY_INLINE void simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride, npyv_lanetype_s16 *op, npy_intp ostride, @@ -1740,112 +1748,112 @@ simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride, // unrolled vector loop for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - #line 211 + #line 213 #if UNROLL > 0 npyv_s16 v_0 = npyv_loadn_s16(ip + 0 * vstep * istride, istride); npyv_s16 r_0 = npyv_negative_s16(v_0); npyv_storen_s16(op + 0 * vstep * ostride, ostride, r_0); #endif -#line 211 +#line 213 #if UNROLL > 1 npyv_s16 v_1 = npyv_loadn_s16(ip + 1 * vstep * istride, istride); npyv_s16 r_1 = npyv_negative_s16(v_1); npyv_storen_s16(op + 1 * vstep * ostride, ostride, r_1); #endif -#line 211 +#line 213 #if UNROLL > 2 npyv_s16 v_2 = npyv_loadn_s16(ip + 2 * vstep * istride, istride); npyv_s16 r_2 = npyv_negative_s16(v_2); npyv_storen_s16(op + 2 * vstep * ostride, ostride, r_2); #endif -#line 211 +#line 213 #if UNROLL > 3 npyv_s16 v_3 = npyv_loadn_s16(ip + 3 * vstep * istride, istride); npyv_s16 r_3 = npyv_negative_s16(v_3); npyv_storen_s16(op + 3 * vstep * ostride, ostride, r_3); #endif -#line 211 +#line 213 #if UNROLL > 4 npyv_s16 v_4 = npyv_loadn_s16(ip + 4 * vstep * istride, istride); npyv_s16 r_4 = npyv_negative_s16(v_4); npyv_storen_s16(op + 4 * vstep * ostride, ostride, r_4); #endif -#line 211 +#line 213 #if UNROLL > 5 npyv_s16 v_5 = npyv_loadn_s16(ip + 5 * vstep * istride, istride); npyv_s16 r_5 = npyv_negative_s16(v_5); npyv_storen_s16(op + 5 * vstep * ostride, ostride, r_5); #endif -#line 211 +#line 213 #if UNROLL > 6 npyv_s16 v_6 = npyv_loadn_s16(ip + 6 * vstep * istride, istride); npyv_s16 r_6 = npyv_negative_s16(v_6); npyv_storen_s16(op + 6 * vstep * ostride, ostride, r_6); #endif -#line 211 +#line 213 #if UNROLL > 7 npyv_s16 v_7 = npyv_loadn_s16(ip + 7 * vstep * istride, istride); npyv_s16 r_7 = npyv_negative_s16(v_7); npyv_storen_s16(op + 7 * vstep * ostride, ostride, r_7); #endif -#line 211 +#line 213 #if UNROLL > 8 npyv_s16 v_8 = npyv_loadn_s16(ip + 8 * vstep * istride, istride); npyv_s16 r_8 = npyv_negative_s16(v_8); npyv_storen_s16(op + 8 * vstep * ostride, ostride, r_8); #endif -#line 211 +#line 213 #if UNROLL > 9 npyv_s16 v_9 = npyv_loadn_s16(ip + 9 * vstep * istride, istride); npyv_s16 r_9 = npyv_negative_s16(v_9); npyv_storen_s16(op + 9 * vstep * ostride, ostride, r_9); #endif -#line 211 +#line 213 #if UNROLL > 10 npyv_s16 v_10 = npyv_loadn_s16(ip + 10 * vstep * istride, istride); npyv_s16 r_10 = npyv_negative_s16(v_10); npyv_storen_s16(op + 10 * vstep * ostride, ostride, r_10); #endif -#line 211 +#line 213 #if UNROLL > 11 npyv_s16 v_11 = npyv_loadn_s16(ip + 11 * vstep * istride, istride); npyv_s16 r_11 = npyv_negative_s16(v_11); npyv_storen_s16(op + 11 * vstep * ostride, ostride, r_11); #endif -#line 211 +#line 213 #if UNROLL > 12 npyv_s16 v_12 = npyv_loadn_s16(ip + 12 * vstep * istride, istride); npyv_s16 r_12 = npyv_negative_s16(v_12); npyv_storen_s16(op + 12 * vstep * ostride, ostride, r_12); #endif -#line 211 +#line 213 #if UNROLL > 13 npyv_s16 v_13 = npyv_loadn_s16(ip + 13 * vstep * istride, istride); npyv_s16 r_13 = npyv_negative_s16(v_13); npyv_storen_s16(op + 13 * vstep * ostride, ostride, r_13); #endif -#line 211 +#line 213 #if UNROLL > 14 npyv_s16 v_14 = npyv_loadn_s16(ip + 14 * vstep * istride, istride); npyv_s16 r_14 = npyv_negative_s16(v_14); 
npyv_storen_s16(op + 14 * vstep * ostride, ostride, r_14); #endif -#line 211 +#line 213 #if UNROLL > 15 npyv_s16 v_15 = npyv_loadn_s16(ip + 15 * vstep * istride, istride); npyv_s16 r_15 = npyv_negative_s16(v_15); @@ -1864,6 +1872,7 @@ simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride, *op = scalar_negative(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // 0 #undef UNROLL #endif // NPY_SIMD @@ -2293,6 +2302,8 @@ simd_unary_nc_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 static NPY_INLINE void simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride, npyv_lanetype_u16 *op, npy_intp ostride, @@ -2303,112 +2314,112 @@ simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride, // unrolled vector loop for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - #line 211 + #line 213 #if UNROLL > 0 npyv_u16 v_0 = npyv_loadn_u16(ip + 0 * vstep * istride, istride); npyv_u16 r_0 = npyv_negative_u16(v_0); npyv_storen_u16(op + 0 * vstep * ostride, ostride, r_0); #endif -#line 211 +#line 213 #if UNROLL > 1 npyv_u16 v_1 = npyv_loadn_u16(ip + 1 * vstep * istride, istride); npyv_u16 r_1 = npyv_negative_u16(v_1); npyv_storen_u16(op + 1 * vstep * ostride, ostride, r_1); #endif -#line 211 +#line 213 #if UNROLL > 2 npyv_u16 v_2 = npyv_loadn_u16(ip + 2 * vstep * istride, istride); npyv_u16 r_2 = npyv_negative_u16(v_2); npyv_storen_u16(op + 2 * vstep * ostride, ostride, r_2); #endif -#line 211 +#line 213 #if UNROLL > 3 npyv_u16 v_3 = npyv_loadn_u16(ip + 3 * vstep * istride, istride); npyv_u16 r_3 = npyv_negative_u16(v_3); npyv_storen_u16(op + 3 * vstep * ostride, ostride, r_3); #endif -#line 211 +#line 213 #if UNROLL > 4 npyv_u16 v_4 = npyv_loadn_u16(ip + 4 * vstep * istride, istride); npyv_u16 r_4 = npyv_negative_u16(v_4); npyv_storen_u16(op + 4 * vstep * ostride, ostride, r_4); #endif -#line 211 +#line 213 #if UNROLL > 5 npyv_u16 v_5 = npyv_loadn_u16(ip + 5 * vstep * istride, istride); npyv_u16 r_5 = npyv_negative_u16(v_5); npyv_storen_u16(op + 5 * vstep * ostride, ostride, r_5); #endif -#line 211 +#line 213 #if UNROLL > 6 npyv_u16 v_6 = npyv_loadn_u16(ip + 6 * vstep * istride, istride); npyv_u16 r_6 = npyv_negative_u16(v_6); npyv_storen_u16(op + 6 * vstep * ostride, ostride, r_6); #endif -#line 211 +#line 213 #if UNROLL > 7 npyv_u16 v_7 = npyv_loadn_u16(ip + 7 * vstep * istride, istride); npyv_u16 r_7 = npyv_negative_u16(v_7); npyv_storen_u16(op + 7 * vstep * ostride, ostride, r_7); #endif -#line 211 +#line 213 #if UNROLL > 8 npyv_u16 v_8 = npyv_loadn_u16(ip + 8 * vstep * istride, istride); npyv_u16 r_8 = npyv_negative_u16(v_8); npyv_storen_u16(op + 8 * vstep * ostride, ostride, r_8); #endif -#line 211 +#line 213 #if UNROLL > 9 npyv_u16 v_9 = npyv_loadn_u16(ip + 9 * vstep * istride, istride); npyv_u16 r_9 = npyv_negative_u16(v_9); npyv_storen_u16(op + 9 * vstep * ostride, ostride, r_9); #endif -#line 211 +#line 213 #if UNROLL > 10 npyv_u16 v_10 = npyv_loadn_u16(ip + 10 * vstep * istride, istride); npyv_u16 r_10 = npyv_negative_u16(v_10); npyv_storen_u16(op + 10 * vstep * ostride, ostride, r_10); #endif -#line 211 +#line 213 #if UNROLL > 11 npyv_u16 v_11 = npyv_loadn_u16(ip + 11 * vstep * istride, istride); npyv_u16 r_11 = npyv_negative_u16(v_11); npyv_storen_u16(op + 11 * vstep * ostride, ostride, r_11); #endif -#line 211 +#line 213 #if UNROLL > 12 npyv_u16 v_12 = npyv_loadn_u16(ip + 12 * vstep * 
istride, istride); npyv_u16 r_12 = npyv_negative_u16(v_12); npyv_storen_u16(op + 12 * vstep * ostride, ostride, r_12); #endif -#line 211 +#line 213 #if UNROLL > 13 npyv_u16 v_13 = npyv_loadn_u16(ip + 13 * vstep * istride, istride); npyv_u16 r_13 = npyv_negative_u16(v_13); npyv_storen_u16(op + 13 * vstep * ostride, ostride, r_13); #endif -#line 211 +#line 213 #if UNROLL > 14 npyv_u16 v_14 = npyv_loadn_u16(ip + 14 * vstep * istride, istride); npyv_u16 r_14 = npyv_negative_u16(v_14); npyv_storen_u16(op + 14 * vstep * ostride, ostride, r_14); #endif -#line 211 +#line 213 #if UNROLL > 15 npyv_u16 v_15 = npyv_loadn_u16(ip + 15 * vstep * istride, istride); npyv_u16 r_15 = npyv_negative_u16(v_15); @@ -2427,6 +2438,7 @@ simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride, *op = scalar_negative(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // 0 #undef UNROLL #endif // NPY_SIMD @@ -2856,6 +2868,8 @@ simd_unary_nc_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 static NPY_INLINE void simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride, npyv_lanetype_s32 *op, npy_intp ostride, @@ -2866,112 +2880,112 @@ simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride, // unrolled vector loop for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - #line 211 + #line 213 #if UNROLL > 0 npyv_s32 v_0 = npyv_loadn_s32(ip + 0 * vstep * istride, istride); npyv_s32 r_0 = npyv_negative_s32(v_0); npyv_storen_s32(op + 0 * vstep * ostride, ostride, r_0); #endif -#line 211 +#line 213 #if UNROLL > 1 npyv_s32 v_1 = npyv_loadn_s32(ip + 1 * vstep * istride, istride); npyv_s32 r_1 = npyv_negative_s32(v_1); npyv_storen_s32(op + 1 * vstep * ostride, ostride, r_1); #endif -#line 211 +#line 213 #if UNROLL > 2 npyv_s32 v_2 = npyv_loadn_s32(ip + 2 * vstep * istride, istride); npyv_s32 r_2 = npyv_negative_s32(v_2); npyv_storen_s32(op + 2 * vstep * ostride, ostride, r_2); #endif -#line 211 +#line 213 #if UNROLL > 3 npyv_s32 v_3 = npyv_loadn_s32(ip + 3 * vstep * istride, istride); npyv_s32 r_3 = npyv_negative_s32(v_3); npyv_storen_s32(op + 3 * vstep * ostride, ostride, r_3); #endif -#line 211 +#line 213 #if UNROLL > 4 npyv_s32 v_4 = npyv_loadn_s32(ip + 4 * vstep * istride, istride); npyv_s32 r_4 = npyv_negative_s32(v_4); npyv_storen_s32(op + 4 * vstep * ostride, ostride, r_4); #endif -#line 211 +#line 213 #if UNROLL > 5 npyv_s32 v_5 = npyv_loadn_s32(ip + 5 * vstep * istride, istride); npyv_s32 r_5 = npyv_negative_s32(v_5); npyv_storen_s32(op + 5 * vstep * ostride, ostride, r_5); #endif -#line 211 +#line 213 #if UNROLL > 6 npyv_s32 v_6 = npyv_loadn_s32(ip + 6 * vstep * istride, istride); npyv_s32 r_6 = npyv_negative_s32(v_6); npyv_storen_s32(op + 6 * vstep * ostride, ostride, r_6); #endif -#line 211 +#line 213 #if UNROLL > 7 npyv_s32 v_7 = npyv_loadn_s32(ip + 7 * vstep * istride, istride); npyv_s32 r_7 = npyv_negative_s32(v_7); npyv_storen_s32(op + 7 * vstep * ostride, ostride, r_7); #endif -#line 211 +#line 213 #if UNROLL > 8 npyv_s32 v_8 = npyv_loadn_s32(ip + 8 * vstep * istride, istride); npyv_s32 r_8 = npyv_negative_s32(v_8); npyv_storen_s32(op + 8 * vstep * ostride, ostride, r_8); #endif -#line 211 +#line 213 #if UNROLL > 9 npyv_s32 v_9 = npyv_loadn_s32(ip + 9 * vstep * istride, istride); npyv_s32 r_9 = npyv_negative_s32(v_9); npyv_storen_s32(op + 9 * vstep * ostride, ostride, r_9); #endif -#line 211 +#line 213 #if 
UNROLL > 10 npyv_s32 v_10 = npyv_loadn_s32(ip + 10 * vstep * istride, istride); npyv_s32 r_10 = npyv_negative_s32(v_10); npyv_storen_s32(op + 10 * vstep * ostride, ostride, r_10); #endif -#line 211 +#line 213 #if UNROLL > 11 npyv_s32 v_11 = npyv_loadn_s32(ip + 11 * vstep * istride, istride); npyv_s32 r_11 = npyv_negative_s32(v_11); npyv_storen_s32(op + 11 * vstep * ostride, ostride, r_11); #endif -#line 211 +#line 213 #if UNROLL > 12 npyv_s32 v_12 = npyv_loadn_s32(ip + 12 * vstep * istride, istride); npyv_s32 r_12 = npyv_negative_s32(v_12); npyv_storen_s32(op + 12 * vstep * ostride, ostride, r_12); #endif -#line 211 +#line 213 #if UNROLL > 13 npyv_s32 v_13 = npyv_loadn_s32(ip + 13 * vstep * istride, istride); npyv_s32 r_13 = npyv_negative_s32(v_13); npyv_storen_s32(op + 13 * vstep * ostride, ostride, r_13); #endif -#line 211 +#line 213 #if UNROLL > 14 npyv_s32 v_14 = npyv_loadn_s32(ip + 14 * vstep * istride, istride); npyv_s32 r_14 = npyv_negative_s32(v_14); npyv_storen_s32(op + 14 * vstep * ostride, ostride, r_14); #endif -#line 211 +#line 213 #if UNROLL > 15 npyv_s32 v_15 = npyv_loadn_s32(ip + 15 * vstep * istride, istride); npyv_s32 r_15 = npyv_negative_s32(v_15); @@ -2990,6 +3004,7 @@ simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride, *op = scalar_negative(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // 1 #undef UNROLL #endif // NPY_SIMD @@ -3419,6 +3434,8 @@ simd_unary_nc_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 static NPY_INLINE void simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride, npyv_lanetype_u32 *op, npy_intp ostride, @@ -3429,112 +3446,112 @@ simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride, // unrolled vector loop for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - #line 211 + #line 213 #if UNROLL > 0 npyv_u32 v_0 = npyv_loadn_u32(ip + 0 * vstep * istride, istride); npyv_u32 r_0 = npyv_negative_u32(v_0); npyv_storen_u32(op + 0 * vstep * ostride, ostride, r_0); #endif -#line 211 +#line 213 #if UNROLL > 1 npyv_u32 v_1 = npyv_loadn_u32(ip + 1 * vstep * istride, istride); npyv_u32 r_1 = npyv_negative_u32(v_1); npyv_storen_u32(op + 1 * vstep * ostride, ostride, r_1); #endif -#line 211 +#line 213 #if UNROLL > 2 npyv_u32 v_2 = npyv_loadn_u32(ip + 2 * vstep * istride, istride); npyv_u32 r_2 = npyv_negative_u32(v_2); npyv_storen_u32(op + 2 * vstep * ostride, ostride, r_2); #endif -#line 211 +#line 213 #if UNROLL > 3 npyv_u32 v_3 = npyv_loadn_u32(ip + 3 * vstep * istride, istride); npyv_u32 r_3 = npyv_negative_u32(v_3); npyv_storen_u32(op + 3 * vstep * ostride, ostride, r_3); #endif -#line 211 +#line 213 #if UNROLL > 4 npyv_u32 v_4 = npyv_loadn_u32(ip + 4 * vstep * istride, istride); npyv_u32 r_4 = npyv_negative_u32(v_4); npyv_storen_u32(op + 4 * vstep * ostride, ostride, r_4); #endif -#line 211 +#line 213 #if UNROLL > 5 npyv_u32 v_5 = npyv_loadn_u32(ip + 5 * vstep * istride, istride); npyv_u32 r_5 = npyv_negative_u32(v_5); npyv_storen_u32(op + 5 * vstep * ostride, ostride, r_5); #endif -#line 211 +#line 213 #if UNROLL > 6 npyv_u32 v_6 = npyv_loadn_u32(ip + 6 * vstep * istride, istride); npyv_u32 r_6 = npyv_negative_u32(v_6); npyv_storen_u32(op + 6 * vstep * ostride, ostride, r_6); #endif -#line 211 +#line 213 #if UNROLL > 7 npyv_u32 v_7 = npyv_loadn_u32(ip + 7 * vstep * istride, istride); npyv_u32 r_7 = npyv_negative_u32(v_7); npyv_storen_u32(op + 
7 * vstep * ostride, ostride, r_7); #endif -#line 211 +#line 213 #if UNROLL > 8 npyv_u32 v_8 = npyv_loadn_u32(ip + 8 * vstep * istride, istride); npyv_u32 r_8 = npyv_negative_u32(v_8); npyv_storen_u32(op + 8 * vstep * ostride, ostride, r_8); #endif -#line 211 +#line 213 #if UNROLL > 9 npyv_u32 v_9 = npyv_loadn_u32(ip + 9 * vstep * istride, istride); npyv_u32 r_9 = npyv_negative_u32(v_9); npyv_storen_u32(op + 9 * vstep * ostride, ostride, r_9); #endif -#line 211 +#line 213 #if UNROLL > 10 npyv_u32 v_10 = npyv_loadn_u32(ip + 10 * vstep * istride, istride); npyv_u32 r_10 = npyv_negative_u32(v_10); npyv_storen_u32(op + 10 * vstep * ostride, ostride, r_10); #endif -#line 211 +#line 213 #if UNROLL > 11 npyv_u32 v_11 = npyv_loadn_u32(ip + 11 * vstep * istride, istride); npyv_u32 r_11 = npyv_negative_u32(v_11); npyv_storen_u32(op + 11 * vstep * ostride, ostride, r_11); #endif -#line 211 +#line 213 #if UNROLL > 12 npyv_u32 v_12 = npyv_loadn_u32(ip + 12 * vstep * istride, istride); npyv_u32 r_12 = npyv_negative_u32(v_12); npyv_storen_u32(op + 12 * vstep * ostride, ostride, r_12); #endif -#line 211 +#line 213 #if UNROLL > 13 npyv_u32 v_13 = npyv_loadn_u32(ip + 13 * vstep * istride, istride); npyv_u32 r_13 = npyv_negative_u32(v_13); npyv_storen_u32(op + 13 * vstep * ostride, ostride, r_13); #endif -#line 211 +#line 213 #if UNROLL > 14 npyv_u32 v_14 = npyv_loadn_u32(ip + 14 * vstep * istride, istride); npyv_u32 r_14 = npyv_negative_u32(v_14); npyv_storen_u32(op + 14 * vstep * ostride, ostride, r_14); #endif -#line 211 +#line 213 #if UNROLL > 15 npyv_u32 v_15 = npyv_loadn_u32(ip + 15 * vstep * istride, istride); npyv_u32 r_15 = npyv_negative_u32(v_15); @@ -3553,6 +3570,7 @@ simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride, *op = scalar_negative(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // 1 #undef UNROLL #endif // NPY_SIMD @@ -3982,6 +4000,8 @@ simd_unary_nc_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 static NPY_INLINE void simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride, npyv_lanetype_s64 *op, npy_intp ostride, @@ -3992,112 +4012,112 @@ simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride, // unrolled vector loop for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - #line 211 + #line 213 #if UNROLL > 0 npyv_s64 v_0 = npyv_loadn_s64(ip + 0 * vstep * istride, istride); npyv_s64 r_0 = npyv_negative_s64(v_0); npyv_storen_s64(op + 0 * vstep * ostride, ostride, r_0); #endif -#line 211 +#line 213 #if UNROLL > 1 npyv_s64 v_1 = npyv_loadn_s64(ip + 1 * vstep * istride, istride); npyv_s64 r_1 = npyv_negative_s64(v_1); npyv_storen_s64(op + 1 * vstep * ostride, ostride, r_1); #endif -#line 211 +#line 213 #if UNROLL > 2 npyv_s64 v_2 = npyv_loadn_s64(ip + 2 * vstep * istride, istride); npyv_s64 r_2 = npyv_negative_s64(v_2); npyv_storen_s64(op + 2 * vstep * ostride, ostride, r_2); #endif -#line 211 +#line 213 #if UNROLL > 3 npyv_s64 v_3 = npyv_loadn_s64(ip + 3 * vstep * istride, istride); npyv_s64 r_3 = npyv_negative_s64(v_3); npyv_storen_s64(op + 3 * vstep * ostride, ostride, r_3); #endif -#line 211 +#line 213 #if UNROLL > 4 npyv_s64 v_4 = npyv_loadn_s64(ip + 4 * vstep * istride, istride); npyv_s64 r_4 = npyv_negative_s64(v_4); npyv_storen_s64(op + 4 * vstep * ostride, ostride, r_4); #endif -#line 211 +#line 213 #if UNROLL > 5 npyv_s64 v_5 = npyv_loadn_s64(ip + 5 * vstep * istride, 
istride); npyv_s64 r_5 = npyv_negative_s64(v_5); npyv_storen_s64(op + 5 * vstep * ostride, ostride, r_5); #endif -#line 211 +#line 213 #if UNROLL > 6 npyv_s64 v_6 = npyv_loadn_s64(ip + 6 * vstep * istride, istride); npyv_s64 r_6 = npyv_negative_s64(v_6); npyv_storen_s64(op + 6 * vstep * ostride, ostride, r_6); #endif -#line 211 +#line 213 #if UNROLL > 7 npyv_s64 v_7 = npyv_loadn_s64(ip + 7 * vstep * istride, istride); npyv_s64 r_7 = npyv_negative_s64(v_7); npyv_storen_s64(op + 7 * vstep * ostride, ostride, r_7); #endif -#line 211 +#line 213 #if UNROLL > 8 npyv_s64 v_8 = npyv_loadn_s64(ip + 8 * vstep * istride, istride); npyv_s64 r_8 = npyv_negative_s64(v_8); npyv_storen_s64(op + 8 * vstep * ostride, ostride, r_8); #endif -#line 211 +#line 213 #if UNROLL > 9 npyv_s64 v_9 = npyv_loadn_s64(ip + 9 * vstep * istride, istride); npyv_s64 r_9 = npyv_negative_s64(v_9); npyv_storen_s64(op + 9 * vstep * ostride, ostride, r_9); #endif -#line 211 +#line 213 #if UNROLL > 10 npyv_s64 v_10 = npyv_loadn_s64(ip + 10 * vstep * istride, istride); npyv_s64 r_10 = npyv_negative_s64(v_10); npyv_storen_s64(op + 10 * vstep * ostride, ostride, r_10); #endif -#line 211 +#line 213 #if UNROLL > 11 npyv_s64 v_11 = npyv_loadn_s64(ip + 11 * vstep * istride, istride); npyv_s64 r_11 = npyv_negative_s64(v_11); npyv_storen_s64(op + 11 * vstep * ostride, ostride, r_11); #endif -#line 211 +#line 213 #if UNROLL > 12 npyv_s64 v_12 = npyv_loadn_s64(ip + 12 * vstep * istride, istride); npyv_s64 r_12 = npyv_negative_s64(v_12); npyv_storen_s64(op + 12 * vstep * ostride, ostride, r_12); #endif -#line 211 +#line 213 #if UNROLL > 13 npyv_s64 v_13 = npyv_loadn_s64(ip + 13 * vstep * istride, istride); npyv_s64 r_13 = npyv_negative_s64(v_13); npyv_storen_s64(op + 13 * vstep * ostride, ostride, r_13); #endif -#line 211 +#line 213 #if UNROLL > 14 npyv_s64 v_14 = npyv_loadn_s64(ip + 14 * vstep * istride, istride); npyv_s64 r_14 = npyv_negative_s64(v_14); npyv_storen_s64(op + 14 * vstep * ostride, ostride, r_14); #endif -#line 211 +#line 213 #if UNROLL > 15 npyv_s64 v_15 = npyv_loadn_s64(ip + 15 * vstep * istride, istride); npyv_s64 r_15 = npyv_negative_s64(v_15); @@ -4116,6 +4136,7 @@ simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride, *op = scalar_negative(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // 1 #undef UNROLL #endif // NPY_SIMD @@ -4545,6 +4566,8 @@ simd_unary_nc_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 static NPY_INLINE void simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride, npyv_lanetype_u64 *op, npy_intp ostride, @@ -4555,112 +4578,112 @@ simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride, // unrolled vector loop for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - #line 211 + #line 213 #if UNROLL > 0 npyv_u64 v_0 = npyv_loadn_u64(ip + 0 * vstep * istride, istride); npyv_u64 r_0 = npyv_negative_u64(v_0); npyv_storen_u64(op + 0 * vstep * ostride, ostride, r_0); #endif -#line 211 +#line 213 #if UNROLL > 1 npyv_u64 v_1 = npyv_loadn_u64(ip + 1 * vstep * istride, istride); npyv_u64 r_1 = npyv_negative_u64(v_1); npyv_storen_u64(op + 1 * vstep * ostride, ostride, r_1); #endif -#line 211 +#line 213 #if UNROLL > 2 npyv_u64 v_2 = npyv_loadn_u64(ip + 2 * vstep * istride, istride); npyv_u64 r_2 = npyv_negative_u64(v_2); npyv_storen_u64(op + 2 * vstep * ostride, ostride, r_2); #endif -#line 211 +#line 213 
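/*
 * A side note on the pattern above (illustrative, not from the patch): the
 * vast majority of hunks in these generated .c files only bump the argument
 * of a `#line` directive (430 -> 431, 211 -> 213, and so on). `#line` tells
 * the compiler to attribute diagnostics to the .c.src template instead of the
 * expanded file, so inserting a comment or a preprocessor guard into the
 * template shifts every directive emitted after that point. For example,
 * naming the template explicitly here for the sake of the illustration:
 */
#line 213 "loops_unary.dispatch.c.src"
static int template_marker(void) { return 0; }  /* a diagnostic on this line
                                                   would be reported against
                                                   line 213 of the template */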
#if UNROLL > 3 npyv_u64 v_3 = npyv_loadn_u64(ip + 3 * vstep * istride, istride); npyv_u64 r_3 = npyv_negative_u64(v_3); npyv_storen_u64(op + 3 * vstep * ostride, ostride, r_3); #endif -#line 211 +#line 213 #if UNROLL > 4 npyv_u64 v_4 = npyv_loadn_u64(ip + 4 * vstep * istride, istride); npyv_u64 r_4 = npyv_negative_u64(v_4); npyv_storen_u64(op + 4 * vstep * ostride, ostride, r_4); #endif -#line 211 +#line 213 #if UNROLL > 5 npyv_u64 v_5 = npyv_loadn_u64(ip + 5 * vstep * istride, istride); npyv_u64 r_5 = npyv_negative_u64(v_5); npyv_storen_u64(op + 5 * vstep * ostride, ostride, r_5); #endif -#line 211 +#line 213 #if UNROLL > 6 npyv_u64 v_6 = npyv_loadn_u64(ip + 6 * vstep * istride, istride); npyv_u64 r_6 = npyv_negative_u64(v_6); npyv_storen_u64(op + 6 * vstep * ostride, ostride, r_6); #endif -#line 211 +#line 213 #if UNROLL > 7 npyv_u64 v_7 = npyv_loadn_u64(ip + 7 * vstep * istride, istride); npyv_u64 r_7 = npyv_negative_u64(v_7); npyv_storen_u64(op + 7 * vstep * ostride, ostride, r_7); #endif -#line 211 +#line 213 #if UNROLL > 8 npyv_u64 v_8 = npyv_loadn_u64(ip + 8 * vstep * istride, istride); npyv_u64 r_8 = npyv_negative_u64(v_8); npyv_storen_u64(op + 8 * vstep * ostride, ostride, r_8); #endif -#line 211 +#line 213 #if UNROLL > 9 npyv_u64 v_9 = npyv_loadn_u64(ip + 9 * vstep * istride, istride); npyv_u64 r_9 = npyv_negative_u64(v_9); npyv_storen_u64(op + 9 * vstep * ostride, ostride, r_9); #endif -#line 211 +#line 213 #if UNROLL > 10 npyv_u64 v_10 = npyv_loadn_u64(ip + 10 * vstep * istride, istride); npyv_u64 r_10 = npyv_negative_u64(v_10); npyv_storen_u64(op + 10 * vstep * ostride, ostride, r_10); #endif -#line 211 +#line 213 #if UNROLL > 11 npyv_u64 v_11 = npyv_loadn_u64(ip + 11 * vstep * istride, istride); npyv_u64 r_11 = npyv_negative_u64(v_11); npyv_storen_u64(op + 11 * vstep * ostride, ostride, r_11); #endif -#line 211 +#line 213 #if UNROLL > 12 npyv_u64 v_12 = npyv_loadn_u64(ip + 12 * vstep * istride, istride); npyv_u64 r_12 = npyv_negative_u64(v_12); npyv_storen_u64(op + 12 * vstep * ostride, ostride, r_12); #endif -#line 211 +#line 213 #if UNROLL > 13 npyv_u64 v_13 = npyv_loadn_u64(ip + 13 * vstep * istride, istride); npyv_u64 r_13 = npyv_negative_u64(v_13); npyv_storen_u64(op + 13 * vstep * ostride, ostride, r_13); #endif -#line 211 +#line 213 #if UNROLL > 14 npyv_u64 v_14 = npyv_loadn_u64(ip + 14 * vstep * istride, istride); npyv_u64 r_14 = npyv_negative_u64(v_14); npyv_storen_u64(op + 14 * vstep * ostride, ostride, r_14); #endif -#line 211 +#line 213 #if UNROLL > 15 npyv_u64 v_15 = npyv_loadn_u64(ip + 15 * vstep * istride, istride); npyv_u64 r_15 = npyv_negative_u64(v_15); @@ -4679,6 +4702,7 @@ simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride, *op = scalar_negative(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // 1 #undef UNROLL #endif // NPY_SIMD @@ -5108,6 +5132,8 @@ simd_unary_nc_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 static NPY_INLINE void simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride, npyv_lanetype_f32 *op, npy_intp ostride, @@ -5118,112 +5144,112 @@ simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride, // unrolled vector loop for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - #line 211 + #line 213 #if UNROLL > 0 npyv_f32 v_0 = npyv_loadn_f32(ip + 0 * vstep * istride, istride); npyv_f32 r_0 = npyv_negative_f32(v_0); 
npyv_storen_f32(op + 0 * vstep * ostride, ostride, r_0); #endif -#line 211 +#line 213 #if UNROLL > 1 npyv_f32 v_1 = npyv_loadn_f32(ip + 1 * vstep * istride, istride); npyv_f32 r_1 = npyv_negative_f32(v_1); npyv_storen_f32(op + 1 * vstep * ostride, ostride, r_1); #endif -#line 211 +#line 213 #if UNROLL > 2 npyv_f32 v_2 = npyv_loadn_f32(ip + 2 * vstep * istride, istride); npyv_f32 r_2 = npyv_negative_f32(v_2); npyv_storen_f32(op + 2 * vstep * ostride, ostride, r_2); #endif -#line 211 +#line 213 #if UNROLL > 3 npyv_f32 v_3 = npyv_loadn_f32(ip + 3 * vstep * istride, istride); npyv_f32 r_3 = npyv_negative_f32(v_3); npyv_storen_f32(op + 3 * vstep * ostride, ostride, r_3); #endif -#line 211 +#line 213 #if UNROLL > 4 npyv_f32 v_4 = npyv_loadn_f32(ip + 4 * vstep * istride, istride); npyv_f32 r_4 = npyv_negative_f32(v_4); npyv_storen_f32(op + 4 * vstep * ostride, ostride, r_4); #endif -#line 211 +#line 213 #if UNROLL > 5 npyv_f32 v_5 = npyv_loadn_f32(ip + 5 * vstep * istride, istride); npyv_f32 r_5 = npyv_negative_f32(v_5); npyv_storen_f32(op + 5 * vstep * ostride, ostride, r_5); #endif -#line 211 +#line 213 #if UNROLL > 6 npyv_f32 v_6 = npyv_loadn_f32(ip + 6 * vstep * istride, istride); npyv_f32 r_6 = npyv_negative_f32(v_6); npyv_storen_f32(op + 6 * vstep * ostride, ostride, r_6); #endif -#line 211 +#line 213 #if UNROLL > 7 npyv_f32 v_7 = npyv_loadn_f32(ip + 7 * vstep * istride, istride); npyv_f32 r_7 = npyv_negative_f32(v_7); npyv_storen_f32(op + 7 * vstep * ostride, ostride, r_7); #endif -#line 211 +#line 213 #if UNROLL > 8 npyv_f32 v_8 = npyv_loadn_f32(ip + 8 * vstep * istride, istride); npyv_f32 r_8 = npyv_negative_f32(v_8); npyv_storen_f32(op + 8 * vstep * ostride, ostride, r_8); #endif -#line 211 +#line 213 #if UNROLL > 9 npyv_f32 v_9 = npyv_loadn_f32(ip + 9 * vstep * istride, istride); npyv_f32 r_9 = npyv_negative_f32(v_9); npyv_storen_f32(op + 9 * vstep * ostride, ostride, r_9); #endif -#line 211 +#line 213 #if UNROLL > 10 npyv_f32 v_10 = npyv_loadn_f32(ip + 10 * vstep * istride, istride); npyv_f32 r_10 = npyv_negative_f32(v_10); npyv_storen_f32(op + 10 * vstep * ostride, ostride, r_10); #endif -#line 211 +#line 213 #if UNROLL > 11 npyv_f32 v_11 = npyv_loadn_f32(ip + 11 * vstep * istride, istride); npyv_f32 r_11 = npyv_negative_f32(v_11); npyv_storen_f32(op + 11 * vstep * ostride, ostride, r_11); #endif -#line 211 +#line 213 #if UNROLL > 12 npyv_f32 v_12 = npyv_loadn_f32(ip + 12 * vstep * istride, istride); npyv_f32 r_12 = npyv_negative_f32(v_12); npyv_storen_f32(op + 12 * vstep * ostride, ostride, r_12); #endif -#line 211 +#line 213 #if UNROLL > 13 npyv_f32 v_13 = npyv_loadn_f32(ip + 13 * vstep * istride, istride); npyv_f32 r_13 = npyv_negative_f32(v_13); npyv_storen_f32(op + 13 * vstep * ostride, ostride, r_13); #endif -#line 211 +#line 213 #if UNROLL > 14 npyv_f32 v_14 = npyv_loadn_f32(ip + 14 * vstep * istride, istride); npyv_f32 r_14 = npyv_negative_f32(v_14); npyv_storen_f32(op + 14 * vstep * ostride, ostride, r_14); #endif -#line 211 +#line 213 #if UNROLL > 15 npyv_f32 v_15 = npyv_loadn_f32(ip + 15 * vstep * istride, istride); npyv_f32 r_15 = npyv_negative_f32(v_15); @@ -5242,6 +5268,7 @@ simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride, *op = scalar_negative(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // 1 #undef UNROLL #endif // NPY_SIMD_F32 @@ -5671,6 +5698,8 @@ simd_unary_nc_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 
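/*
 * A hedged sketch (not from the patch) expanding on the comment just above:
 * this new `#ifndef NPY_HAVE_SSE2` guard, like the `!defined(NPY_HAVE_NEON)`
 * condition added to loops_minmax.dispatch.c.src earlier in this diff,
 * compiles out the strided-vector helper on targets where, per those
 * comments, non-contiguous vector load/store is no faster than plain scalar
 * code, leaving the unrolled scalar path to handle heavily non-contiguous
 * data. The standalone code below only illustrates that unrolled scalar
 * shape; the function name and element-count strides are invented for the
 * example.
 */
#include <stddef.h>

static void
negative_strided_scalar(const double *ip, ptrdiff_t istride,
                        double *op, ptrdiff_t ostride, ptrdiff_t len)
{
    /* unroll by four so independent loads and stores can overlap */
    for (; len >= 4; len -= 4, ip += 4 * istride, op += 4 * ostride) {
        op[0 * ostride] = -ip[0 * istride];
        op[1 * ostride] = -ip[1 * istride];
        op[2 * ostride] = -ip[2 * istride];
        op[3 * ostride] = -ip[3 * istride];
    }
    /* scalar tail for the remaining elements */
    for (; len > 0; --len, ip += istride, op += ostride) {
        *op = -*ip;
    }
}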
static NPY_INLINE void simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride, npyv_lanetype_f64 *op, npy_intp ostride, @@ -5681,112 +5710,112 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride, // unrolled vector loop for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { - #line 211 + #line 213 #if UNROLL > 0 npyv_f64 v_0 = npyv_loadn_f64(ip + 0 * vstep * istride, istride); npyv_f64 r_0 = npyv_negative_f64(v_0); npyv_storen_f64(op + 0 * vstep * ostride, ostride, r_0); #endif -#line 211 +#line 213 #if UNROLL > 1 npyv_f64 v_1 = npyv_loadn_f64(ip + 1 * vstep * istride, istride); npyv_f64 r_1 = npyv_negative_f64(v_1); npyv_storen_f64(op + 1 * vstep * ostride, ostride, r_1); #endif -#line 211 +#line 213 #if UNROLL > 2 npyv_f64 v_2 = npyv_loadn_f64(ip + 2 * vstep * istride, istride); npyv_f64 r_2 = npyv_negative_f64(v_2); npyv_storen_f64(op + 2 * vstep * ostride, ostride, r_2); #endif -#line 211 +#line 213 #if UNROLL > 3 npyv_f64 v_3 = npyv_loadn_f64(ip + 3 * vstep * istride, istride); npyv_f64 r_3 = npyv_negative_f64(v_3); npyv_storen_f64(op + 3 * vstep * ostride, ostride, r_3); #endif -#line 211 +#line 213 #if UNROLL > 4 npyv_f64 v_4 = npyv_loadn_f64(ip + 4 * vstep * istride, istride); npyv_f64 r_4 = npyv_negative_f64(v_4); npyv_storen_f64(op + 4 * vstep * ostride, ostride, r_4); #endif -#line 211 +#line 213 #if UNROLL > 5 npyv_f64 v_5 = npyv_loadn_f64(ip + 5 * vstep * istride, istride); npyv_f64 r_5 = npyv_negative_f64(v_5); npyv_storen_f64(op + 5 * vstep * ostride, ostride, r_5); #endif -#line 211 +#line 213 #if UNROLL > 6 npyv_f64 v_6 = npyv_loadn_f64(ip + 6 * vstep * istride, istride); npyv_f64 r_6 = npyv_negative_f64(v_6); npyv_storen_f64(op + 6 * vstep * ostride, ostride, r_6); #endif -#line 211 +#line 213 #if UNROLL > 7 npyv_f64 v_7 = npyv_loadn_f64(ip + 7 * vstep * istride, istride); npyv_f64 r_7 = npyv_negative_f64(v_7); npyv_storen_f64(op + 7 * vstep * ostride, ostride, r_7); #endif -#line 211 +#line 213 #if UNROLL > 8 npyv_f64 v_8 = npyv_loadn_f64(ip + 8 * vstep * istride, istride); npyv_f64 r_8 = npyv_negative_f64(v_8); npyv_storen_f64(op + 8 * vstep * ostride, ostride, r_8); #endif -#line 211 +#line 213 #if UNROLL > 9 npyv_f64 v_9 = npyv_loadn_f64(ip + 9 * vstep * istride, istride); npyv_f64 r_9 = npyv_negative_f64(v_9); npyv_storen_f64(op + 9 * vstep * ostride, ostride, r_9); #endif -#line 211 +#line 213 #if UNROLL > 10 npyv_f64 v_10 = npyv_loadn_f64(ip + 10 * vstep * istride, istride); npyv_f64 r_10 = npyv_negative_f64(v_10); npyv_storen_f64(op + 10 * vstep * ostride, ostride, r_10); #endif -#line 211 +#line 213 #if UNROLL > 11 npyv_f64 v_11 = npyv_loadn_f64(ip + 11 * vstep * istride, istride); npyv_f64 r_11 = npyv_negative_f64(v_11); npyv_storen_f64(op + 11 * vstep * ostride, ostride, r_11); #endif -#line 211 +#line 213 #if UNROLL > 12 npyv_f64 v_12 = npyv_loadn_f64(ip + 12 * vstep * istride, istride); npyv_f64 r_12 = npyv_negative_f64(v_12); npyv_storen_f64(op + 12 * vstep * ostride, ostride, r_12); #endif -#line 211 +#line 213 #if UNROLL > 13 npyv_f64 v_13 = npyv_loadn_f64(ip + 13 * vstep * istride, istride); npyv_f64 r_13 = npyv_negative_f64(v_13); npyv_storen_f64(op + 13 * vstep * ostride, ostride, r_13); #endif -#line 211 +#line 213 #if UNROLL > 14 npyv_f64 v_14 = npyv_loadn_f64(ip + 14 * vstep * istride, istride); npyv_f64 r_14 = npyv_negative_f64(v_14); npyv_storen_f64(op + 14 * vstep * ostride, ostride, r_14); #endif -#line 211 +#line 213 #if UNROLL > 15 npyv_f64 v_15 = npyv_loadn_f64(ip + 15 * vstep * 
istride, istride); npyv_f64 r_15 = npyv_negative_f64(v_15); @@ -5805,6 +5834,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride, *op = scalar_negative(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // 1 #undef UNROLL #endif // NPY_SIMD_F64 @@ -5814,10 +5844,10 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride, /******************************************************************************** ** Defining ufunc inner functions ********************************************************************************/ -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_BYTE == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -5833,7 +5863,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride, #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_BYTE == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -5849,7 +5879,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride, #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_BYTE == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -5865,7 +5895,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride, #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_BYTE == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -5883,7 +5913,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride, #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -5921,8 +5951,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -5945,97 +5975,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_ubyte in_0 = *((const npy_ubyte *)(ip + 0 * istep)); *((npy_ubyte *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_ubyte in_1 = *((const npy_ubyte *)(ip + 1 * istep)); *((npy_ubyte *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_ubyte in_2 = *((const npy_ubyte *)(ip + 2 * istep)); *((npy_ubyte *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_ubyte in_3 = *((const npy_ubyte *)(ip + 3 * istep)); *((npy_ubyte *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_ubyte in_4 = *((const npy_ubyte *)(ip + 4 * istep)); *((npy_ubyte *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_ubyte in_5 = *((const npy_ubyte *)(ip + 5 * istep)); *((npy_ubyte *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_ubyte in_6 = *((const npy_ubyte *)(ip + 6 * istep)); *((npy_ubyte *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_ubyte in_7 = *((const npy_ubyte *)(ip + 7 * istep)); *((npy_ubyte *)(op + 7 * ostep)) = 
scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_ubyte in_8 = *((const npy_ubyte *)(ip + 8 * istep)); *((npy_ubyte *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_ubyte in_9 = *((const npy_ubyte *)(ip + 9 * istep)); *((npy_ubyte *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_ubyte in_10 = *((const npy_ubyte *)(ip + 10 * istep)); *((npy_ubyte *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_ubyte in_11 = *((const npy_ubyte *)(ip + 11 * istep)); *((npy_ubyte *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_ubyte in_12 = *((const npy_ubyte *)(ip + 12 * istep)); *((npy_ubyte *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_ubyte in_13 = *((const npy_ubyte *)(ip + 13 * istep)); *((npy_ubyte *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_ubyte in_14 = *((const npy_ubyte *)(ip + 14 * istep)); *((npy_ubyte *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_ubyte in_15 = *((const npy_ubyte *)(ip + 15 * istep)); *((npy_ubyte *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -6055,10 +6085,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_SHORT == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -6074,7 +6104,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_SHORT == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -6090,7 +6120,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_SHORT == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -6106,7 +6136,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_SHORT == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -6124,7 +6154,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -6162,8 +6192,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -6186,97 +6216,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_ushort in_0 = *((const npy_ushort *)(ip + 0 * istep)); *((npy_ushort *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_ushort in_1 = *((const npy_ushort *)(ip + 1 * istep)); *((npy_ushort *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_ushort in_2 = *((const npy_ushort *)(ip + 2 * istep)); *((npy_ushort *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_ushort in_3 = *((const npy_ushort *)(ip + 3 * istep)); *((npy_ushort *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL 
> 4 const npy_ushort in_4 = *((const npy_ushort *)(ip + 4 * istep)); *((npy_ushort *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_ushort in_5 = *((const npy_ushort *)(ip + 5 * istep)); *((npy_ushort *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_ushort in_6 = *((const npy_ushort *)(ip + 6 * istep)); *((npy_ushort *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_ushort in_7 = *((const npy_ushort *)(ip + 7 * istep)); *((npy_ushort *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_ushort in_8 = *((const npy_ushort *)(ip + 8 * istep)); *((npy_ushort *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_ushort in_9 = *((const npy_ushort *)(ip + 9 * istep)); *((npy_ushort *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_ushort in_10 = *((const npy_ushort *)(ip + 10 * istep)); *((npy_ushort *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_ushort in_11 = *((const npy_ushort *)(ip + 11 * istep)); *((npy_ushort *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_ushort in_12 = *((const npy_ushort *)(ip + 12 * istep)); *((npy_ushort *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_ushort in_13 = *((const npy_ushort *)(ip + 13 * istep)); *((npy_ushort *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_ushort in_14 = *((const npy_ushort *)(ip + 14 * istep)); *((npy_ushort *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_ushort in_15 = *((const npy_ushort *)(ip + 15 * istep)); *((npy_ushort *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -6296,10 +6326,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_INT == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -6315,7 +6345,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_INT == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -6331,7 +6361,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_INT == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -6347,7 +6377,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_INT == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -6365,7 +6395,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -6403,8 +6433,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -6427,97 +6457,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_uint in_0 = *((const npy_uint *)(ip + 
0 * istep)); *((npy_uint *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_uint in_1 = *((const npy_uint *)(ip + 1 * istep)); *((npy_uint *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_uint in_2 = *((const npy_uint *)(ip + 2 * istep)); *((npy_uint *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_uint in_3 = *((const npy_uint *)(ip + 3 * istep)); *((npy_uint *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_uint in_4 = *((const npy_uint *)(ip + 4 * istep)); *((npy_uint *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_uint in_5 = *((const npy_uint *)(ip + 5 * istep)); *((npy_uint *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_uint in_6 = *((const npy_uint *)(ip + 6 * istep)); *((npy_uint *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_uint in_7 = *((const npy_uint *)(ip + 7 * istep)); *((npy_uint *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_uint in_8 = *((const npy_uint *)(ip + 8 * istep)); *((npy_uint *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_uint in_9 = *((const npy_uint *)(ip + 9 * istep)); *((npy_uint *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_uint in_10 = *((const npy_uint *)(ip + 10 * istep)); *((npy_uint *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_uint in_11 = *((const npy_uint *)(ip + 11 * istep)); *((npy_uint *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_uint in_12 = *((const npy_uint *)(ip + 12 * istep)); *((npy_uint *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_uint in_13 = *((const npy_uint *)(ip + 13 * istep)); *((npy_uint *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_uint in_14 = *((const npy_uint *)(ip + 14 * istep)); *((npy_uint *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_uint in_15 = *((const npy_uint *)(ip + 15 * istep)); *((npy_uint *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -6537,10 +6567,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONG == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -6556,7 +6586,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONG == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -6572,7 +6602,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONG == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -6588,7 +6618,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONG == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -6606,7 +6636,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -6644,8 +6674,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative) ); goto clear; } - // SSE2 does better with unrolled 
scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -6668,97 +6698,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_ulong in_0 = *((const npy_ulong *)(ip + 0 * istep)); *((npy_ulong *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_ulong in_1 = *((const npy_ulong *)(ip + 1 * istep)); *((npy_ulong *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_ulong in_2 = *((const npy_ulong *)(ip + 2 * istep)); *((npy_ulong *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_ulong in_3 = *((const npy_ulong *)(ip + 3 * istep)); *((npy_ulong *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_ulong in_4 = *((const npy_ulong *)(ip + 4 * istep)); *((npy_ulong *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_ulong in_5 = *((const npy_ulong *)(ip + 5 * istep)); *((npy_ulong *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_ulong in_6 = *((const npy_ulong *)(ip + 6 * istep)); *((npy_ulong *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_ulong in_7 = *((const npy_ulong *)(ip + 7 * istep)); *((npy_ulong *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_ulong in_8 = *((const npy_ulong *)(ip + 8 * istep)); *((npy_ulong *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_ulong in_9 = *((const npy_ulong *)(ip + 9 * istep)); *((npy_ulong *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_ulong in_10 = *((const npy_ulong *)(ip + 10 * istep)); *((npy_ulong *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_ulong in_11 = *((const npy_ulong *)(ip + 11 * istep)); *((npy_ulong *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_ulong in_12 = *((const npy_ulong *)(ip + 12 * istep)); *((npy_ulong *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_ulong in_13 = *((const npy_ulong *)(ip + 13 * istep)); *((npy_ulong *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_ulong in_14 = *((const npy_ulong *)(ip + 14 * istep)); *((npy_ulong *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_ulong in_15 = *((const npy_ulong *)(ip + 15 * istep)); *((npy_ulong *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -6778,10 +6808,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -6797,7 +6827,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -6813,7 +6843,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 
262 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -6829,7 +6859,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -6847,7 +6877,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -6885,8 +6915,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -6909,97 +6939,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_ulonglong in_0 = *((const npy_ulonglong *)(ip + 0 * istep)); *((npy_ulonglong *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_ulonglong in_1 = *((const npy_ulonglong *)(ip + 1 * istep)); *((npy_ulonglong *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_ulonglong in_2 = *((const npy_ulonglong *)(ip + 2 * istep)); *((npy_ulonglong *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_ulonglong in_3 = *((const npy_ulonglong *)(ip + 3 * istep)); *((npy_ulonglong *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_ulonglong in_4 = *((const npy_ulonglong *)(ip + 4 * istep)); *((npy_ulonglong *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_ulonglong in_5 = *((const npy_ulonglong *)(ip + 5 * istep)); *((npy_ulonglong *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_ulonglong in_6 = *((const npy_ulonglong *)(ip + 6 * istep)); *((npy_ulonglong *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_ulonglong in_7 = *((const npy_ulonglong *)(ip + 7 * istep)); *((npy_ulonglong *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_ulonglong in_8 = *((const npy_ulonglong *)(ip + 8 * istep)); *((npy_ulonglong *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_ulonglong in_9 = *((const npy_ulonglong *)(ip + 9 * istep)); *((npy_ulonglong *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_ulonglong in_10 = *((const npy_ulonglong *)(ip + 10 * istep)); *((npy_ulonglong *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_ulonglong in_11 = *((const npy_ulonglong *)(ip + 11 * istep)); *((npy_ulonglong *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_ulonglong in_12 = *((const npy_ulonglong *)(ip + 12 * istep)); *((npy_ulonglong *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_ulonglong in_13 = *((const npy_ulonglong *)(ip + 13 * istep)); *((npy_ulonglong *)(op + 13 * ostep)) = scalar_negative(in_13); #endif 
-#line 344 +#line 347 #if UNROLL > 14 const npy_ulonglong in_14 = *((const npy_ulonglong *)(ip + 14 * istep)); *((npy_ulonglong *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_ulonglong in_15 = *((const npy_ulonglong *)(ip + 15 * istep)); *((npy_ulonglong *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -7019,10 +7049,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_BYTE == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -7038,7 +7068,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_BYTE == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -7054,7 +7084,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_BYTE == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -7070,7 +7100,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_BYTE == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -7088,7 +7118,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -7126,8 +7156,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -7150,97 +7180,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_byte in_0 = *((const npy_byte *)(ip + 0 * istep)); *((npy_byte *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_byte in_1 = *((const npy_byte *)(ip + 1 * istep)); *((npy_byte *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_byte in_2 = *((const npy_byte *)(ip + 2 * istep)); *((npy_byte *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_byte in_3 = *((const npy_byte *)(ip + 3 * istep)); *((npy_byte *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_byte in_4 = *((const npy_byte *)(ip + 4 * istep)); *((npy_byte *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_byte in_5 = *((const npy_byte *)(ip + 5 * istep)); *((npy_byte *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_byte in_6 = *((const npy_byte *)(ip + 6 * istep)); *((npy_byte *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_byte in_7 = *((const npy_byte *)(ip + 7 * istep)); *((npy_byte *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_byte in_8 = *((const npy_byte *)(ip + 8 * istep)); *((npy_byte *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_byte in_9 = *((const npy_byte *)(ip + 9 * istep)); *((npy_byte *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_byte in_10 = *((const npy_byte *)(ip + 10 * istep)); 
*((npy_byte *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_byte in_11 = *((const npy_byte *)(ip + 11 * istep)); *((npy_byte *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_byte in_12 = *((const npy_byte *)(ip + 12 * istep)); *((npy_byte *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_byte in_13 = *((const npy_byte *)(ip + 13 * istep)); *((npy_byte *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_byte in_14 = *((const npy_byte *)(ip + 14 * istep)); *((npy_byte *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_byte in_15 = *((const npy_byte *)(ip + 15 * istep)); *((npy_byte *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -7260,10 +7290,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_SHORT == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -7279,7 +7309,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_SHORT == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -7295,7 +7325,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_SHORT == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -7311,7 +7341,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_SHORT == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -7329,7 +7359,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -7367,8 +7397,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -7391,97 +7421,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_short in_0 = *((const npy_short *)(ip + 0 * istep)); *((npy_short *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_short in_1 = *((const npy_short *)(ip + 1 * istep)); *((npy_short *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_short in_2 = *((const npy_short *)(ip + 2 * istep)); *((npy_short *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_short in_3 = *((const npy_short *)(ip + 3 * istep)); *((npy_short *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_short in_4 = *((const npy_short *)(ip + 4 * istep)); *((npy_short *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_short in_5 = *((const npy_short *)(ip + 5 * istep)); *((npy_short *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_short in_6 = *((const npy_short *)(ip + 6 * istep)); *((npy_short *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if 
UNROLL > 7 const npy_short in_7 = *((const npy_short *)(ip + 7 * istep)); *((npy_short *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_short in_8 = *((const npy_short *)(ip + 8 * istep)); *((npy_short *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_short in_9 = *((const npy_short *)(ip + 9 * istep)); *((npy_short *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_short in_10 = *((const npy_short *)(ip + 10 * istep)); *((npy_short *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_short in_11 = *((const npy_short *)(ip + 11 * istep)); *((npy_short *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_short in_12 = *((const npy_short *)(ip + 12 * istep)); *((npy_short *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_short in_13 = *((const npy_short *)(ip + 13 * istep)); *((npy_short *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_short in_14 = *((const npy_short *)(ip + 14 * istep)); *((npy_short *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_short in_15 = *((const npy_short *)(ip + 15 * istep)); *((npy_short *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -7501,10 +7531,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_INT == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -7520,7 +7550,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_INT == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -7536,7 +7566,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_INT == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -7552,7 +7582,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_INT == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -7570,7 +7600,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -7608,8 +7638,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -7632,97 +7662,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_int in_0 = *((const npy_int *)(ip + 0 * istep)); *((npy_int *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_int in_1 = *((const npy_int *)(ip + 1 * istep)); *((npy_int *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_int in_2 = *((const npy_int *)(ip + 2 * istep)); *((npy_int *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_int in_3 = *((const npy_int *)(ip + 3 * istep)); *((npy_int *)(op + 3 * ostep)) = 
scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_int in_4 = *((const npy_int *)(ip + 4 * istep)); *((npy_int *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_int in_5 = *((const npy_int *)(ip + 5 * istep)); *((npy_int *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_int in_6 = *((const npy_int *)(ip + 6 * istep)); *((npy_int *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_int in_7 = *((const npy_int *)(ip + 7 * istep)); *((npy_int *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_int in_8 = *((const npy_int *)(ip + 8 * istep)); *((npy_int *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_int in_9 = *((const npy_int *)(ip + 9 * istep)); *((npy_int *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_int in_10 = *((const npy_int *)(ip + 10 * istep)); *((npy_int *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_int in_11 = *((const npy_int *)(ip + 11 * istep)); *((npy_int *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_int in_12 = *((const npy_int *)(ip + 12 * istep)); *((npy_int *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_int in_13 = *((const npy_int *)(ip + 13 * istep)); *((npy_int *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_int in_14 = *((const npy_int *)(ip + 14 * istep)); *((npy_int *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_int in_15 = *((const npy_int *)(ip + 15 * istep)); *((npy_int *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -7742,10 +7772,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONG == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -7761,7 +7791,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONG == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -7777,7 +7807,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONG == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -7793,7 +7823,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONG == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -7811,7 +7841,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -7849,8 +7879,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -7873,97 +7903,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_long in_0 = *((const npy_long *)(ip + 0 * istep)); *((npy_long *)(op + 0 * 
ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_long in_1 = *((const npy_long *)(ip + 1 * istep)); *((npy_long *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_long in_2 = *((const npy_long *)(ip + 2 * istep)); *((npy_long *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_long in_3 = *((const npy_long *)(ip + 3 * istep)); *((npy_long *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_long in_4 = *((const npy_long *)(ip + 4 * istep)); *((npy_long *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_long in_5 = *((const npy_long *)(ip + 5 * istep)); *((npy_long *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_long in_6 = *((const npy_long *)(ip + 6 * istep)); *((npy_long *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_long in_7 = *((const npy_long *)(ip + 7 * istep)); *((npy_long *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_long in_8 = *((const npy_long *)(ip + 8 * istep)); *((npy_long *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_long in_9 = *((const npy_long *)(ip + 9 * istep)); *((npy_long *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_long in_10 = *((const npy_long *)(ip + 10 * istep)); *((npy_long *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_long in_11 = *((const npy_long *)(ip + 11 * istep)); *((npy_long *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_long in_12 = *((const npy_long *)(ip + 12 * istep)); *((npy_long *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_long in_13 = *((const npy_long *)(ip + 13 * istep)); *((npy_long *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_long in_14 = *((const npy_long *)(ip + 14 * istep)); *((npy_long *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_long in_15 = *((const npy_long *)(ip + 15 * istep)); *((npy_long *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -7983,10 +8013,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8 #if 0 #define TO_SIMD_SFX(X) X##_f8 @@ -8002,7 +8032,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16 #if 0 #define TO_SIMD_SFX(X) X##_f16 @@ -8018,7 +8048,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32 #if 0 #define TO_SIMD_SFX(X) X##_f32 @@ -8034,7 +8064,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64 #if 0 #define TO_SIMD_SFX(X) X##_f64 @@ -8052,7 +8082,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -8090,8 +8120,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy 
non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -8114,97 +8144,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_longlong in_0 = *((const npy_longlong *)(ip + 0 * istep)); *((npy_longlong *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_longlong in_1 = *((const npy_longlong *)(ip + 1 * istep)); *((npy_longlong *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_longlong in_2 = *((const npy_longlong *)(ip + 2 * istep)); *((npy_longlong *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_longlong in_3 = *((const npy_longlong *)(ip + 3 * istep)); *((npy_longlong *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_longlong in_4 = *((const npy_longlong *)(ip + 4 * istep)); *((npy_longlong *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_longlong in_5 = *((const npy_longlong *)(ip + 5 * istep)); *((npy_longlong *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_longlong in_6 = *((const npy_longlong *)(ip + 6 * istep)); *((npy_longlong *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_longlong in_7 = *((const npy_longlong *)(ip + 7 * istep)); *((npy_longlong *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_longlong in_8 = *((const npy_longlong *)(ip + 8 * istep)); *((npy_longlong *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_longlong in_9 = *((const npy_longlong *)(ip + 9 * istep)); *((npy_longlong *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_longlong in_10 = *((const npy_longlong *)(ip + 10 * istep)); *((npy_longlong *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_longlong in_11 = *((const npy_longlong *)(ip + 11 * istep)); *((npy_longlong *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_longlong in_12 = *((const npy_longlong *)(ip + 12 * istep)); *((npy_longlong *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_longlong in_13 = *((const npy_longlong *)(ip + 13 * istep)); *((npy_longlong *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_longlong in_14 = *((const npy_longlong *)(ip + 14 * istep)); *((npy_longlong *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_longlong in_15 = *((const npy_longlong *)(ip + 15 * istep)); *((npy_longlong *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -8224,10 +8254,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_FLOAT == 8 #if 1 #define TO_SIMD_SFX(X) X##_f8 @@ -8243,7 +8273,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_FLOAT == 
16 #if 1 #define TO_SIMD_SFX(X) X##_f16 @@ -8259,7 +8289,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_FLOAT == 32 #if 1 #define TO_SIMD_SFX(X) X##_f32 @@ -8275,7 +8305,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_FLOAT == 64 #if 1 #define TO_SIMD_SFX(X) X##_f64 @@ -8293,7 +8323,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -8331,8 +8361,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -8355,97 +8385,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_float in_0 = *((const npy_float *)(ip + 0 * istep)); *((npy_float *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_float in_1 = *((const npy_float *)(ip + 1 * istep)); *((npy_float *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_float in_2 = *((const npy_float *)(ip + 2 * istep)); *((npy_float *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_float in_3 = *((const npy_float *)(ip + 3 * istep)); *((npy_float *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_float in_4 = *((const npy_float *)(ip + 4 * istep)); *((npy_float *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_float in_5 = *((const npy_float *)(ip + 5 * istep)); *((npy_float *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_float in_6 = *((const npy_float *)(ip + 6 * istep)); *((npy_float *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_float in_7 = *((const npy_float *)(ip + 7 * istep)); *((npy_float *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_float in_8 = *((const npy_float *)(ip + 8 * istep)); *((npy_float *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_float in_9 = *((const npy_float *)(ip + 9 * istep)); *((npy_float *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_float in_10 = *((const npy_float *)(ip + 10 * istep)); *((npy_float *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_float in_11 = *((const npy_float *)(ip + 11 * istep)); *((npy_float *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_float in_12 = *((const npy_float *)(ip + 12 * istep)); *((npy_float *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_float in_13 = *((const npy_float *)(ip + 13 * istep)); *((npy_float *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_float in_14 = 
*((const npy_float *)(ip + 14 * istep)); *((npy_float *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_float in_15 = *((const npy_float *)(ip + 15 * istep)); *((npy_float *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -8465,10 +8495,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8 #if 1 #define TO_SIMD_SFX(X) X##_f8 @@ -8484,7 +8514,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16 #if 1 #define TO_SIMD_SFX(X) X##_f16 @@ -8500,7 +8530,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32 #if 1 #define TO_SIMD_SFX(X) X##_f32 @@ -8516,7 +8546,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64 #if 1 #define TO_SIMD_SFX(X) X##_f64 @@ -8534,7 +8564,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -8572,8 +8602,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -8596,97 +8626,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_double in_0 = *((const npy_double *)(ip + 0 * istep)); *((npy_double *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_double in_1 = *((const npy_double *)(ip + 1 * istep)); *((npy_double *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_double in_2 = *((const npy_double *)(ip + 2 * istep)); *((npy_double *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_double in_3 = *((const npy_double *)(ip + 3 * istep)); *((npy_double *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_double in_4 = *((const npy_double *)(ip + 4 * istep)); *((npy_double *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_double in_5 = *((const npy_double *)(ip + 5 * istep)); *((npy_double *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const npy_double in_6 = *((const npy_double *)(ip + 6 * istep)); *((npy_double *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_double in_7 = *((const npy_double *)(ip + 7 * istep)); *((npy_double *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_double in_8 = *((const npy_double *)(ip + 8 * istep)); *((npy_double *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_double in_9 = *((const npy_double *)(ip + 9 * istep)); *((npy_double *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_double in_10 = *((const npy_double *)(ip + 10 * istep)); *((npy_double 
*)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_double in_11 = *((const npy_double *)(ip + 11 * istep)); *((npy_double *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_double in_12 = *((const npy_double *)(ip + 12 * istep)); *((npy_double *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_double in_13 = *((const npy_double *)(ip + 13 * istep)); *((npy_double *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_double in_14 = *((const npy_double *)(ip + 14 * istep)); *((npy_double *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_double in_15 = *((const npy_double *)(ip + 15 * istep)); *((npy_double *)(op + 15 * ostep)) = scalar_negative(in_15); @@ -8706,10 +8736,10 @@ clear: #endif } -#line 254 +#line 257 #undef TO_SIMD_SFX #if 0 -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 8 #if 1 #define TO_SIMD_SFX(X) X##_f8 @@ -8725,7 +8755,7 @@ clear: #define TO_SIMD_SFX(X) X##_s8 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 16 #if 1 #define TO_SIMD_SFX(X) X##_f16 @@ -8741,7 +8771,7 @@ clear: #define TO_SIMD_SFX(X) X##_s16 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 32 #if 1 #define TO_SIMD_SFX(X) X##_f32 @@ -8757,7 +8787,7 @@ clear: #define TO_SIMD_SFX(X) X##_s32 #endif -#line 259 +#line 262 #elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 64 #if 1 #define TO_SIMD_SFX(X) X##_f64 @@ -8775,7 +8805,7 @@ clear: #endif -#line 280 +#line 283 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -8813,8 +8843,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_negative)( @@ -8837,97 +8867,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative) */ #define UNROLL 8 for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { - #line 344 + #line 347 #if UNROLL > 0 const npy_longdouble in_0 = *((const npy_longdouble *)(ip + 0 * istep)); *((npy_longdouble *)(op + 0 * ostep)) = scalar_negative(in_0); #endif -#line 344 +#line 347 #if UNROLL > 1 const npy_longdouble in_1 = *((const npy_longdouble *)(ip + 1 * istep)); *((npy_longdouble *)(op + 1 * ostep)) = scalar_negative(in_1); #endif -#line 344 +#line 347 #if UNROLL > 2 const npy_longdouble in_2 = *((const npy_longdouble *)(ip + 2 * istep)); *((npy_longdouble *)(op + 2 * ostep)) = scalar_negative(in_2); #endif -#line 344 +#line 347 #if UNROLL > 3 const npy_longdouble in_3 = *((const npy_longdouble *)(ip + 3 * istep)); *((npy_longdouble *)(op + 3 * ostep)) = scalar_negative(in_3); #endif -#line 344 +#line 347 #if UNROLL > 4 const npy_longdouble in_4 = *((const npy_longdouble *)(ip + 4 * istep)); *((npy_longdouble *)(op + 4 * ostep)) = scalar_negative(in_4); #endif -#line 344 +#line 347 #if UNROLL > 5 const npy_longdouble in_5 = *((const npy_longdouble *)(ip + 5 * istep)); *((npy_longdouble *)(op + 5 * ostep)) = scalar_negative(in_5); #endif -#line 344 +#line 347 #if UNROLL > 6 const 
npy_longdouble in_6 = *((const npy_longdouble *)(ip + 6 * istep)); *((npy_longdouble *)(op + 6 * ostep)) = scalar_negative(in_6); #endif -#line 344 +#line 347 #if UNROLL > 7 const npy_longdouble in_7 = *((const npy_longdouble *)(ip + 7 * istep)); *((npy_longdouble *)(op + 7 * ostep)) = scalar_negative(in_7); #endif -#line 344 +#line 347 #if UNROLL > 8 const npy_longdouble in_8 = *((const npy_longdouble *)(ip + 8 * istep)); *((npy_longdouble *)(op + 8 * ostep)) = scalar_negative(in_8); #endif -#line 344 +#line 347 #if UNROLL > 9 const npy_longdouble in_9 = *((const npy_longdouble *)(ip + 9 * istep)); *((npy_longdouble *)(op + 9 * ostep)) = scalar_negative(in_9); #endif -#line 344 +#line 347 #if UNROLL > 10 const npy_longdouble in_10 = *((const npy_longdouble *)(ip + 10 * istep)); *((npy_longdouble *)(op + 10 * ostep)) = scalar_negative(in_10); #endif -#line 344 +#line 347 #if UNROLL > 11 const npy_longdouble in_11 = *((const npy_longdouble *)(ip + 11 * istep)); *((npy_longdouble *)(op + 11 * ostep)) = scalar_negative(in_11); #endif -#line 344 +#line 347 #if UNROLL > 12 const npy_longdouble in_12 = *((const npy_longdouble *)(ip + 12 * istep)); *((npy_longdouble *)(op + 12 * ostep)) = scalar_negative(in_12); #endif -#line 344 +#line 347 #if UNROLL > 13 const npy_longdouble in_13 = *((const npy_longdouble *)(ip + 13 * istep)); *((npy_longdouble *)(op + 13 * ostep)) = scalar_negative(in_13); #endif -#line 344 +#line 347 #if UNROLL > 14 const npy_longdouble in_14 = *((const npy_longdouble *)(ip + 14 * istep)); *((npy_longdouble *)(op + 14 * ostep)) = scalar_negative(in_14); #endif -#line 344 +#line 347 #if UNROLL > 15 const npy_longdouble in_15 = *((const npy_longdouble *)(ip + 15 * istep)); *((npy_longdouble *)(op + 15 * ostep)) = scalar_negative(in_15); diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src index 1e2a81d20b..bfe4d892d0 100644 --- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src +++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src @@ -195,6 +195,8 @@ simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride, #undef UNROLL #define UNROLL 2 #endif +// X86 does better with unrolled scalar for heavy non-contiguous +#ifndef NPY_HAVE_SSE2 static NPY_INLINE void simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride, npyv_lanetype_@sfx@ *op, npy_intp ostride, @@ -226,6 +228,7 @@ simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride, *op = scalar_@intrin@(*ip); } } +#endif // NPY_HAVE_SSE2 #endif // @supports_ncontig@ #undef UNROLL #endif // @simd_chk@ @@ -314,8 +317,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) ); goto clear; } - // SSE2 does better with unrolled scalar for heavy non-contiguous - #if !defined(NPY_HAVE_SSE2) + // X86 does better with unrolled scalar for heavy non-contiguous + #ifndef NPY_HAVE_SSE2 else if (istride != 1 && ostride != 1) { // non-contiguous input and output TO_SIMD_SFX(simd_unary_nn_@intrin@)( diff --git a/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py b/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py index 1bbdde1317..a88189e03e 100644 --- a/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py +++ b/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py @@ -477,7 +477,14 @@ class TestBoolCmp: self.signd[self.ed] *= -1. 
         self.signf[1::6][self.ef[1::6]] = -np.inf
         self.signd[1::6][self.ed[1::6]] = -np.inf
-        self.signf[3::6][self.ef[3::6]] = -np.nan
+        # On RISC-V, many operations that produce NaNs, such as converting
+        # a -NaN from f64 to f32, return a canonical NaN. The canonical
+        # NaNs are always positive. See section 11.3 NaN Generation and
+        # Propagation of the RISC-V Unprivileged ISA for more details.
+        # We disable the float32 sign test on riscv64 for -np.nan as the sign
+        # of the NaN will be lost when it's converted to a float32.
+        if platform.processor() != 'riscv64':
+            self.signf[3::6][self.ef[3::6]] = -np.nan
         self.signd[3::6][self.ed[3::6]] = -np.nan
         self.signf[4::6][self.ef[4::6]] = -0.
         self.signd[4::6][self.ed[4::6]] = -0.
diff --git a/contrib/python/numpy/py3/numpy/f2py/crackfortran.py b/contrib/python/numpy/py3/numpy/f2py/crackfortran.py
index 8d3fc27608..8d3fc27608 100755..100644
--- a/contrib/python/numpy/py3/numpy/f2py/crackfortran.py
+++ b/contrib/python/numpy/py3/numpy/f2py/crackfortran.py
diff --git a/contrib/python/numpy/py3/numpy/f2py/f2py2e.py b/contrib/python/numpy/py3/numpy/f2py/f2py2e.py
index ce22b2d8a9..ce22b2d8a9 100755..100644
--- a/contrib/python/numpy/py3/numpy/f2py/f2py2e.py
+++ b/contrib/python/numpy/py3/numpy/f2py/f2py2e.py
diff --git a/contrib/python/numpy/py3/numpy/f2py/rules.py b/contrib/python/numpy/py3/numpy/f2py/rules.py
index 009365e047..009365e047 100755..100644
--- a/contrib/python/numpy/py3/numpy/f2py/rules.py
+++ b/contrib/python/numpy/py3/numpy/f2py/rules.py
diff --git a/contrib/python/numpy/py3/numpy/f2py/tests/util.py b/contrib/python/numpy/py3/numpy/f2py/tests/util.py
index 75b257cdb8..6ed6c0855f 100644
--- a/contrib/python/numpy/py3/numpy/f2py/tests/util.py
+++ b/contrib/python/numpy/py3/numpy/f2py/tests/util.py
@@ -20,6 +20,7 @@ import contextlib
 import numpy
 from pathlib import Path
+from numpy.compat import asstr
 from numpy._utils import asunicode
 from numpy.testing import temppath, IS_WASM
 from importlib import import_module
diff --git a/contrib/python/numpy/py3/numpy/lib/function_base.py b/contrib/python/numpy/py3/numpy/lib/function_base.py
index e75aca1e58..a3dab04d33 100644
--- a/contrib/python/numpy/py3/numpy/lib/function_base.py
+++ b/contrib/python/numpy/py3/numpy/lib/function_base.py
@@ -4655,7 +4655,8 @@ def _lerp(a, b, t, out=None):
     diff_b_a = subtract(b, a)
     # asanyarray is a stop-gap until gh-13105
     lerp_interpolation = asanyarray(add(a, diff_b_a * t, out=out))
-    subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t >= 0.5)
+    subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t >= 0.5,
+             casting='unsafe', dtype=type(lerp_interpolation.dtype))
     if lerp_interpolation.ndim == 0 and out is None:
         lerp_interpolation = lerp_interpolation[()]  # unpack 0d arrays
     return lerp_interpolation
diff --git a/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py b/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py
index 11e44630e7..2bb73b6003 100644
--- a/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py
+++ b/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py
@@ -3606,6 +3606,10 @@ class TestQuantile:
         assert_equal(q, Fraction(7, 2))
         assert_equal(type(q), Fraction)
+        q = np.quantile(x, .5)
+        assert_equal(q, 1.75)
+        assert_equal(type(q), np.float64)
+
         q = np.quantile(x, Fraction(1, 2))
         assert_equal(q, Fraction(7, 4))
         assert_equal(type(q), Fraction)
diff --git a/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp b/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp
index 0c0b35e9c0..3b5effe14a 100644
--- a/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp
+++ b/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp
@@ -2259,7 +2259,7 @@ process_geev_results(GEEV_PARAMS_t<typ> *params, scalar_trait)
     }
 }
-
+#if 0
 static inline fortran_int
 call_geev(GEEV_PARAMS_t<fortran_complex>* params)
 {
@@ -2275,6 +2275,8 @@ call_geev(GEEV_PARAMS_t<fortran_complex>* params)
               &rv);
     return rv;
 }
+#endif
+
 static inline fortran_int
 call_geev(GEEV_PARAMS_t<fortran_doublecomplex>* params)
 {
diff --git a/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py b/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py
index c1d4cdff8f..c1d4cdff8f 100755..100644
--- a/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py
+++ b/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py
diff --git a/contrib/python/numpy/py3/numpy/testing/setup.py b/contrib/python/numpy/py3/numpy/testing/setup.py
index 6f203e8727..6f203e8727 100755..100644
--- a/contrib/python/numpy/py3/numpy/testing/setup.py
+++ b/contrib/python/numpy/py3/numpy/testing/setup.py
diff --git a/contrib/python/numpy/py3/numpy/tests/test_warnings.py b/contrib/python/numpy/py3/numpy/tests/test_warnings.py
index ee5124c5d5..df90fcef8c 100644
--- a/contrib/python/numpy/py3/numpy/tests/test_warnings.py
+++ b/contrib/python/numpy/py3/numpy/tests/test_warnings.py
@@ -5,7 +5,6 @@ all of these occurrences but should catch almost all.
 import pytest
 from pathlib import Path
-import sys
 import ast
 import tokenize
 import numpy
@@ -33,7 +32,7 @@ class FindFuncs(ast.NodeVisitor):
         ast.NodeVisitor.generic_visit(self, node)
         if p.ls[-1] == 'simplefilter' or p.ls[-1] == 'filterwarnings':
-            if node.args[0].s == "ignore":
+            if node.args[0].value == "ignore":
                 raise AssertionError(
                     "warnings should have an appropriate stacklevel; found in "
                     "{} on line {}".format(self.__filename, node.lineno))
@@ -57,8 +56,6 @@ class FindFuncs(ast.NodeVisitor):
 @pytest.mark.slow
-@pytest.mark.skipif(sys.version_info >= (3, 12),
-                    reason="Deprecation warning in ast")
 def test_warning_calls():
     # combined "ignore" and stacklevel error
     base = Path(numpy.__file__).parent
diff --git a/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py b/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py
index 68c6f5d03f..6f778e5515 100644
--- a/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py
+++ b/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py
@@ -86,8 +86,6 @@ def strip_func(match: re.Match[str]) -> str:
     return match.groups()[1]
-@pytest.mark.slow
-@pytest.mark.skipif(NO_MYPY, reason="Mypy is not installed")
 @pytest.fixture(scope="module", autouse=True)
 def run_mypy() -> None:
     """Clears the cache and run mypy before running any of the typing tests.
diff --git a/contrib/python/numpy/py3/numpy/version.py b/contrib/python/numpy/py3/numpy/version.py
index 692240a486..e96055ea6d 100644
--- a/contrib/python/numpy/py3/numpy/version.py
+++ b/contrib/python/numpy/py3/numpy/version.py
@@ -1,5 +1,5 @@
-version = "1.26.3"
+version = "1.26.4"
 __version__ = version
 full_version = version
diff --git a/contrib/python/numpy/py3/ya.make b/contrib/python/numpy/py3/ya.make
index 92042220c3..0eb98bef02 100644
--- a/contrib/python/numpy/py3/ya.make
+++ b/contrib/python/numpy/py3/ya.make
@@ -2,7 +2,7 @@ PY3_LIBRARY()
 PROVIDES(numpy)
-VERSION(1.26.3)
+VERSION(1.26.4)
 LICENSE(BSD-3-Clause)
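The function_base.py and test_function_base.py hunks above are easiest to read together: _lerp now calls subtract with casting='unsafe' and an explicit dtype so that np.quantile keeps working for exact number types such as fractions.Fraction, while a plain float probability still comes back as a float64. The sketch below only illustrates the behavior the added test lines assert; the fixture x used by TestQuantile is defined outside the hunk, so the values here are a hypothetical stand-in chosen to be consistent with the asserted results (1.75 and Fraction(7, 4)).

    from fractions import Fraction
    import numpy as np

    # Hypothetical stand-in for the TestQuantile fixture `x` (its definition is
    # not part of the hunk): eight evenly spaced Fraction values 0, 1/2, ..., 7/2.
    x = [Fraction(i, 2) for i in range(8)]

    q_float = np.quantile(x, .5)              # float probability
    q_exact = np.quantile(x, Fraction(1, 2))  # Fraction probability

    print(type(q_float).__name__, q_float)    # per the added asserts: float64 1.75
    print(type(q_exact).__name__, q_exact)    # per the added asserts: Fraction 7/4

As far as the hunk shows, the extra casting and dtype arguments relax the cast check on the where=t >= 0.5 branch of subtract so the exact-arithmetic path does not fail, and the new float-probability assertions pin down that the ordinary float path still returns np.float64.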