path: root/contrib
author     robot-piglet <robot-piglet@yandex-team.com>  2024-02-06 15:03:31 +0300
committer  robot-piglet <robot-piglet@yandex-team.com>  2024-02-06 15:23:04 +0300
commit     aa875067a1bd13a74c342017536b8ddc8443729e (patch)
tree       a718a39f103a27ecf168e3310049a64c34be9611 /contrib
parent     58bf93aa2a19207b73af4e3a4894741903ee0566 (diff)
download   ydb-aa875067a1bd13a74c342017536b8ddc8443729e.tar.gz
Intermediate changes
Diffstat (limited to 'contrib')
-rw-r--r--  contrib/python/numpy/include/numpy/core/feature_detection_misc.h | 5
-rw-r--r--  contrib/python/numpy/py3/.dist-info/METADATA | 7
-rw-r--r--  contrib/python/numpy/py3/LICENSES_bundled.txt | 5
-rw-r--r--  contrib/python/numpy/py3/numpy/__config__.py.in | 22
-rw-r--r--  contrib/python/numpy/py3/numpy/array_api/__init__.py | 2
-rw-r--r--  contrib/python/numpy/py3/numpy/array_api/linalg.py | 6
-rw-r--r--  contrib/python/numpy/py3/numpy/core/code_generators/genapi.py | 9
-rw-r--r--  contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py | 7
-rw-r--r--  contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py | 7
-rw-r--r--  contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h | 3
-rw-r--r--  contrib/python/numpy/py3/numpy/core/generate_numpy_api.py | 251
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c | 4
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c | 5
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c | 3
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c | 174
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 14
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c | 64
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src | 24
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c | 770
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src | 3
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c | 30
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src | 9
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c | 974
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src | 7
-rw-r--r--  contrib/python/numpy/py3/numpy/core/tests/test_numeric.py | 9
-rw-r--r-- [-rwxr-xr-x]  contrib/python/numpy/py3/numpy/f2py/crackfortran.py | 0
-rw-r--r-- [-rwxr-xr-x]  contrib/python/numpy/py3/numpy/f2py/f2py2e.py | 0
-rw-r--r-- [-rwxr-xr-x]  contrib/python/numpy/py3/numpy/f2py/rules.py | 0
-rw-r--r--  contrib/python/numpy/py3/numpy/f2py/tests/util.py | 1
-rw-r--r--  contrib/python/numpy/py3/numpy/lib/function_base.py | 3
-rw-r--r--  contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py | 4
-rw-r--r--  contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp | 4
-rw-r--r-- [-rwxr-xr-x]  contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py | 0
-rw-r--r-- [-rwxr-xr-x]  contrib/python/numpy/py3/numpy/testing/setup.py | 0
-rw-r--r--  contrib/python/numpy/py3/numpy/tests/test_warnings.py | 5
-rw-r--r--  contrib/python/numpy/py3/numpy/typing/tests/test_typing.py | 2
-rw-r--r--  contrib/python/numpy/py3/numpy/version.py | 2
-rw-r--r--  contrib/python/numpy/py3/ya.make | 2
38 files changed, 1375 insertions, 1062 deletions
diff --git a/contrib/python/numpy/include/numpy/core/feature_detection_misc.h b/contrib/python/numpy/include/numpy/core/feature_detection_misc.h
new file mode 100644
index 0000000000..0e6447fbd1
--- /dev/null
+++ b/contrib/python/numpy/include/numpy/core/feature_detection_misc.h
@@ -0,0 +1,5 @@
+#ifdef USE_PYTHON3
+#include <contrib/python/numpy/py3/numpy/core/feature_detection_misc.h>
+#else
+#error #include <contrib/python/numpy/py2/numpy/core/feature_detection_misc.h>
+#endif
diff --git a/contrib/python/numpy/py3/.dist-info/METADATA b/contrib/python/numpy/py3/.dist-info/METADATA
index 5e515025ec..8246dc4ed3 100644
--- a/contrib/python/numpy/py3/.dist-info/METADATA
+++ b/contrib/python/numpy/py3/.dist-info/METADATA
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: numpy
-Version: 1.26.3
+Version: 1.26.4
Summary: Fundamental package for array computing in Python
Home-page: https://numpy.org
Author: Travis E. Oliphant et al.
@@ -70,11 +70,6 @@ License: Copyright (c) 2005-2023, NumPy Developers.
License: Apache 2.0
For license text, see vendored-meson/meson/COPYING
- Name: meson-python
- Files: vendored-meson/meson-python/*
- License: MIT
- For license text, see vendored-meson/meson-python/LICENSE
-
Name: spin
Files: .spin/cmds.py
License: BSD-3
diff --git a/contrib/python/numpy/py3/LICENSES_bundled.txt b/contrib/python/numpy/py3/LICENSES_bundled.txt
index 26faf7ff30..aae0e774fa 100644
--- a/contrib/python/numpy/py3/LICENSES_bundled.txt
+++ b/contrib/python/numpy/py3/LICENSES_bundled.txt
@@ -30,11 +30,6 @@ Files: vendored-meson/meson/*
License: Apache 2.0
For license text, see vendored-meson/meson/COPYING
-Name: meson-python
-Files: vendored-meson/meson-python/*
-License: MIT
- For license text, see vendored-meson/meson-python/LICENSE
-
Name: spin
Files: .spin/cmds.py
License: BSD-3
diff --git a/contrib/python/numpy/py3/numpy/__config__.py.in b/contrib/python/numpy/py3/numpy/__config__.py.in
index 6c6c21cb85..f3b32c28c1 100644
--- a/contrib/python/numpy/py3/numpy/__config__.py.in
+++ b/contrib/python/numpy/py3/numpy/__config__.py.in
@@ -32,21 +32,27 @@ CONFIG = _cleanup(
"Compilers": {
"c": {
"name": "@C_COMP@",
- "linker": "@C_COMP_LINKER_ID@",
+ "linker": r"@C_COMP_LINKER_ID@",
"version": "@C_COMP_VERSION@",
- "commands": "@C_COMP_CMD_ARRAY@",
+ "commands": r"@C_COMP_CMD_ARRAY@",
+ "args": r"@C_COMP_ARGS@",
+ "linker args": r"@C_COMP_LINK_ARGS@",
},
"cython": {
"name": "@CYTHON_COMP@",
- "linker": "@CYTHON_COMP_LINKER_ID@",
+ "linker": r"@CYTHON_COMP_LINKER_ID@",
"version": "@CYTHON_COMP_VERSION@",
- "commands": "@CYTHON_COMP_CMD_ARRAY@",
+ "commands": r"@CYTHON_COMP_CMD_ARRAY@",
+ "args": r"@CYTHON_COMP_ARGS@",
+ "linker args": r"@CYTHON_COMP_LINK_ARGS@",
},
"c++": {
"name": "@CPP_COMP@",
- "linker": "@CPP_COMP_LINKER_ID@",
+ "linker": r"@CPP_COMP_LINKER_ID@",
"version": "@CPP_COMP_VERSION@",
- "commands": "@CPP_COMP_CMD_ARRAY@",
+ "commands": r"@CPP_COMP_CMD_ARRAY@",
+ "args": r"@CPP_COMP_ARGS@",
+ "linker args": r"@CPP_COMP_LINK_ARGS@",
},
},
"Machine Information": {
@@ -72,7 +78,7 @@ CONFIG = _cleanup(
"detection method": "@BLAS_TYPE_NAME@",
"include directory": r"@BLAS_INCLUDEDIR@",
"lib directory": r"@BLAS_LIBDIR@",
- "openblas configuration": "@BLAS_OPENBLAS_CONFIG@",
+ "openblas configuration": r"@BLAS_OPENBLAS_CONFIG@",
"pc file directory": r"@BLAS_PCFILEDIR@",
},
"lapack": {
@@ -82,7 +88,7 @@ CONFIG = _cleanup(
"detection method": "@LAPACK_TYPE_NAME@",
"include directory": r"@LAPACK_INCLUDEDIR@",
"lib directory": r"@LAPACK_LIBDIR@",
- "openblas configuration": "@LAPACK_OPENBLAS_CONFIG@",
+ "openblas configuration": r"@LAPACK_OPENBLAS_CONFIG@",
"pc file directory": r"@LAPACK_PCFILEDIR@",
},
},
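Note on the r"..." changes above: the raw-string prefix protects the generated __config__.py when a build-time substitution contains backslashes (Windows compiler paths or command arrays are the presumable case; the patch itself does not say). A minimal, self-contained illustration of the failure mode, using a hypothetical substituted path:

    plain = "C:\numpy\tools"   # a plain literal turns "\n" and "\t" into control characters
    raw   = r"C:\numpy\tools"  # a raw literal keeps the backslashes verbatim

    print(plain)  # prints "C:", then a newline, then "umpy<tab>ools"
    print(raw)    # prints "C:\numpy\tools"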
diff --git a/contrib/python/numpy/py3/numpy/array_api/__init__.py b/contrib/python/numpy/py3/numpy/array_api/__init__.py
index 77f227882e..edc3205fd5 100644
--- a/contrib/python/numpy/py3/numpy/array_api/__init__.py
+++ b/contrib/python/numpy/py3/numpy/array_api/__init__.py
@@ -127,7 +127,7 @@ __all__ = ["__array_api_version__"]
from ._constants import e, inf, nan, pi, newaxis
-__all__ += ["e", "inf", "nan", "pi"]
+__all__ += ["e", "inf", "nan", "pi", "newaxis"]
from ._creation_functions import (
asarray,
diff --git a/contrib/python/numpy/py3/numpy/array_api/linalg.py b/contrib/python/numpy/py3/numpy/array_api/linalg.py
index 09af9dfc3a..c18360f6e6 100644
--- a/contrib/python/numpy/py3/numpy/array_api/linalg.py
+++ b/contrib/python/numpy/py3/numpy/array_api/linalg.py
@@ -9,6 +9,7 @@ from ._dtypes import (
complex128
)
from ._manipulation_functions import reshape
+from ._elementwise_functions import conj
from ._array_object import Array
from ..core.numeric import normalize_axis_tuple
@@ -53,7 +54,10 @@ def cholesky(x: Array, /, *, upper: bool = False) -> Array:
raise TypeError('Only floating-point dtypes are allowed in cholesky')
L = np.linalg.cholesky(x._array)
if upper:
- return Array._new(L).mT
+ U = Array._new(L).mT
+ if U.dtype in [complex64, complex128]:
+ U = conj(U)
+ return U
return Array._new(L)
# Note: cross is the numpy top-level namespace, not np.linalg
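Why the added conj() matters: for a Hermitian positive-definite A, the factorization is A = L @ conj(L).T, so the upper factor must be the conjugate transpose U = conj(L.mT); a plain transpose is only correct for real dtypes. A short sketch of the property the fix restores, using plain numpy rather than the array_api wrappers:

    import numpy as np

    rng = np.random.default_rng(0)
    M = rng.normal(size=(3, 3)) + 1j * rng.normal(size=(3, 3))
    A = M @ M.conj().T + 3 * np.eye(3)       # Hermitian positive-definite

    L = np.linalg.cholesky(A)                # lower factor: A == L @ L.conj().T
    U = np.conj(L.T)                         # what upper=True now returns for complex input

    print(np.allclose(A, U.conj().T @ U))    # True
    print(np.allclose(A, L.T.conj() @ L.T))  # False: the un-conjugated transpose rebuilds conj(A)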
diff --git a/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py b/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py
index 2cdaba52d9..d9d7862b28 100644
--- a/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py
+++ b/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py
@@ -304,15 +304,6 @@ def find_functions(filename, tag='API'):
fo.close()
return functions
-def should_rebuild(targets, source_files):
- from distutils.dep_util import newer_group
- for t in targets:
- if not os.path.exists(t):
- return True
- sources = API_FILES + list(source_files) + [__file__]
- if newer_group(sources, targets[0], missing='newer'):
- return True
- return False
def write_file(filename, data):
"""
diff --git a/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py b/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py
index ae38c4efc2..640bae9e5f 100644
--- a/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py
+++ b/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py
@@ -148,12 +148,7 @@ def generate_api(output_dir, force=False):
targets = (h_file, c_file)
sources = numpy_api.multiarray_api
-
- if (not force and not genapi.should_rebuild(targets, [numpy_api.__file__, __file__])):
- return targets
- else:
- do_generate_api(targets, sources)
-
+ do_generate_api(targets, sources)
return targets
def do_generate_api(targets, sources):
diff --git a/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py b/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py
index e03299a52c..3734cbd6a0 100644
--- a/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py
+++ b/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py
@@ -125,12 +125,7 @@ def generate_api(output_dir, force=False):
targets = (h_file, c_file)
sources = ['ufunc_api_order.txt']
-
- if (not force and not genapi.should_rebuild(targets, sources + [__file__])):
- return targets
- else:
- do_generate_api(targets, sources)
-
+ do_generate_api(targets, sources)
return targets
def do_generate_api(targets, sources):
diff --git a/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h b/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h
index bc14d16d04..d8bbfbd8b2 100644
--- a/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h
+++ b/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h
@@ -1,6 +1,9 @@
+#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
+#if 0 /* Only for setup_common.py, not the C compiler */
off_t ftello(FILE *stream);
int fseeko(FILE *stream, off_t offset, int whence);
int fallocate(int, int, off_t, off_t);
+#endif
diff --git a/contrib/python/numpy/py3/numpy/core/generate_numpy_api.py b/contrib/python/numpy/py3/numpy/core/generate_numpy_api.py
new file mode 100644
index 0000000000..640bae9e5f
--- /dev/null
+++ b/contrib/python/numpy/py3/numpy/core/generate_numpy_api.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+import os
+import argparse
+
+import genapi
+from genapi import \
+ TypeApi, GlobalVarApi, FunctionApi, BoolValuesApi
+
+import numpy_api
+
+# use annotated api when running under cpychecker
+h_template = r"""
+#if defined(_MULTIARRAYMODULE) || defined(WITH_CPYCHECKER_STEALS_REFERENCE_TO_ARG_ATTRIBUTE)
+
+typedef struct {
+ PyObject_HEAD
+ npy_bool obval;
+} PyBoolScalarObject;
+
+extern NPY_NO_EXPORT PyTypeObject PyArrayMapIter_Type;
+extern NPY_NO_EXPORT PyTypeObject PyArrayNeighborhoodIter_Type;
+extern NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2];
+
+%s
+
+#else
+
+#if defined(PY_ARRAY_UNIQUE_SYMBOL)
+#define PyArray_API PY_ARRAY_UNIQUE_SYMBOL
+#endif
+
+#if defined(NO_IMPORT) || defined(NO_IMPORT_ARRAY)
+extern void **PyArray_API;
+#else
+#if defined(PY_ARRAY_UNIQUE_SYMBOL)
+void **PyArray_API;
+#else
+static void **PyArray_API=NULL;
+#endif
+#endif
+
+%s
+
+#if !defined(NO_IMPORT_ARRAY) && !defined(NO_IMPORT)
+static int
+_import_array(void)
+{
+ int st;
+ PyObject *numpy = PyImport_ImportModule("numpy.core._multiarray_umath");
+ PyObject *c_api = NULL;
+
+ if (numpy == NULL) {
+ return -1;
+ }
+ c_api = PyObject_GetAttrString(numpy, "_ARRAY_API");
+ Py_DECREF(numpy);
+ if (c_api == NULL) {
+ return -1;
+ }
+
+ if (!PyCapsule_CheckExact(c_api)) {
+ PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is not PyCapsule object");
+ Py_DECREF(c_api);
+ return -1;
+ }
+ PyArray_API = (void **)PyCapsule_GetPointer(c_api, NULL);
+ Py_DECREF(c_api);
+ if (PyArray_API == NULL) {
+ PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is NULL pointer");
+ return -1;
+ }
+
+ /* Perform runtime check of C API version */
+ if (NPY_VERSION != PyArray_GetNDArrayCVersion()) {
+ PyErr_Format(PyExc_RuntimeError, "module compiled against "\
+ "ABI version 0x%%x but this version of numpy is 0x%%x", \
+ (int) NPY_VERSION, (int) PyArray_GetNDArrayCVersion());
+ return -1;
+ }
+ if (NPY_FEATURE_VERSION > PyArray_GetNDArrayCFeatureVersion()) {
+ PyErr_Format(PyExc_RuntimeError, "module compiled against "\
+ "API version 0x%%x but this version of numpy is 0x%%x . "\
+ "Check the section C-API incompatibility at the "\
+ "Troubleshooting ImportError section at "\
+ "https://numpy.org/devdocs/user/troubleshooting-importerror.html"\
+ "#c-api-incompatibility "\
+ "for indications on how to solve this problem .", \
+ (int) NPY_FEATURE_VERSION, (int) PyArray_GetNDArrayCFeatureVersion());
+ return -1;
+ }
+
+ /*
+ * Perform runtime check of endianness and check it matches the one set by
+ * the headers (npy_endian.h) as a safeguard
+ */
+ st = PyArray_GetEndianness();
+ if (st == NPY_CPU_UNKNOWN_ENDIAN) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "FATAL: module compiled as unknown endian");
+ return -1;
+ }
+#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN
+ if (st != NPY_CPU_BIG) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "FATAL: module compiled as big endian, but "
+ "detected different endianness at runtime");
+ return -1;
+ }
+#elif NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
+ if (st != NPY_CPU_LITTLE) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "FATAL: module compiled as little endian, but "
+ "detected different endianness at runtime");
+ return -1;
+ }
+#endif
+
+ return 0;
+}
+
+#define import_array() {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return NULL; } }
+
+#define import_array1(ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return ret; } }
+
+#define import_array2(msg, ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, msg); return ret; } }
+
+#endif
+
+#endif
+"""
+
+
+c_template = r"""
+/* These pointers will be stored in the C-object for use in other
+ extension modules
+*/
+
+void *PyArray_API[] = {
+%s
+};
+"""
+
+def generate_api(output_dir, force=False):
+ basename = 'multiarray_api'
+
+ h_file = os.path.join(output_dir, '__%s.h' % basename)
+ c_file = os.path.join(output_dir, '__%s.c' % basename)
+ targets = (h_file, c_file)
+
+ sources = numpy_api.multiarray_api
+ do_generate_api(targets, sources)
+ return targets
+
+def do_generate_api(targets, sources):
+ header_file = targets[0]
+ c_file = targets[1]
+
+ global_vars = sources[0]
+ scalar_bool_values = sources[1]
+ types_api = sources[2]
+ multiarray_funcs = sources[3]
+
+ multiarray_api = sources[:]
+
+ module_list = []
+ extension_list = []
+ init_list = []
+
+ # Check multiarray api indexes
+ multiarray_api_index = genapi.merge_api_dicts(multiarray_api)
+ genapi.check_api_dict(multiarray_api_index)
+
+ numpyapi_list = genapi.get_api_functions('NUMPY_API',
+ multiarray_funcs)
+
+ # Create dict name -> *Api instance
+ api_name = 'PyArray_API'
+ multiarray_api_dict = {}
+ for f in numpyapi_list:
+ name = f.name
+ index = multiarray_funcs[name][0]
+ annotations = multiarray_funcs[name][1:]
+ multiarray_api_dict[f.name] = FunctionApi(f.name, index, annotations,
+ f.return_type,
+ f.args, api_name)
+
+ for name, val in global_vars.items():
+ index, type = val
+ multiarray_api_dict[name] = GlobalVarApi(name, index, type, api_name)
+
+ for name, val in scalar_bool_values.items():
+ index = val[0]
+ multiarray_api_dict[name] = BoolValuesApi(name, index, api_name)
+
+ for name, val in types_api.items():
+ index = val[0]
+ internal_type = None if len(val) == 1 else val[1]
+ multiarray_api_dict[name] = TypeApi(
+ name, index, 'PyTypeObject', api_name, internal_type)
+
+ if len(multiarray_api_dict) != len(multiarray_api_index):
+ keys_dict = set(multiarray_api_dict.keys())
+ keys_index = set(multiarray_api_index.keys())
+ raise AssertionError(
+ "Multiarray API size mismatch - "
+ "index has extra keys {}, dict has extra keys {}"
+ .format(keys_index - keys_dict, keys_dict - keys_index)
+ )
+
+ extension_list = []
+ for name, index in genapi.order_dict(multiarray_api_index):
+ api_item = multiarray_api_dict[name]
+ extension_list.append(api_item.define_from_array_api_string())
+ init_list.append(api_item.array_api_define())
+ module_list.append(api_item.internal_define())
+
+ # Write to header
+ s = h_template % ('\n'.join(module_list), '\n'.join(extension_list))
+ genapi.write_file(header_file, s)
+
+ # Write to c-code
+ s = c_template % ',\n'.join(init_list)
+ genapi.write_file(c_file, s)
+
+ return targets
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-o",
+ "--outdir",
+ type=str,
+ help="Path to the output directory"
+ )
+ parser.add_argument(
+ "-i",
+ "--ignore",
+ type=str,
+ help="An ignored input - may be useful to add a "
+ "dependency between custom targets"
+ )
+ args = parser.parse_args()
+
+ outdir_abs = os.path.join(os.getcwd(), args.outdir)
+
+ generate_api(outdir_abs)
+
+
+if __name__ == "__main__":
+ main()
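The new vendored generator above keeps the entry points of the code_generators original; a hedged sketch of how it is driven (the output directory is illustrative, and genapi/numpy_api must be importable alongside it):

    # command line, as wired through main():  python generate_numpy_api.py -o <outdir>
    import generate_numpy_api

    h_file, c_file = generate_numpy_api.generate_api("build/src/numpy/core")
    # writes build/src/numpy/core/__multiarray_api.h and __multiarray_api.c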
diff --git a/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c b/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c
index 64a85f6fb2..bd149f8b43 100644
--- a/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c
+++ b/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c
@@ -656,7 +656,7 @@ npy__cpu_init_features(void)
/***************** ARM ******************/
-#elif defined(__arm__) || defined(__aarch64__)
+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64)
static inline void
npy__cpu_init_features_arm8(void)
@@ -781,7 +781,7 @@ npy__cpu_init_features(void)
return;
#endif
// We have nothing else todo
-#if defined(NPY_HAVE_ASIMD) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
+#if defined(NPY_HAVE_ASIMD) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8) || defined(_M_ARM64)
#if defined(NPY_HAVE_FPHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
npy__cpu_have[NPY_CPU_FEATURE_FPHP] = 1;
#endif
diff --git a/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c b/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c
index 60c1a1b9b0..8ec0aeefb7 100644
--- a/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c
+++ b/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c
@@ -23,8 +23,9 @@
#include "array_coercion.h"
#include "refcount.h"
-int
-fallocate(int fd, int mode, off_t offset, off_t len);
+#if defined(HAVE_FALLOCATE) && defined(__linux__)
+#include <fcntl.h>
+#endif
/*
* allocate nbytes of diskspace for file fp
diff --git a/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c b/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c
index 15257804bc..a38f90e76c 100644
--- a/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c
+++ b/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c
@@ -59,6 +59,9 @@
*/
#if defined HAVE_BACKTRACE && defined HAVE_DLFCN_H && ! defined PYPY_VERSION
+
+#include <feature_detection_misc.h>
+
/* 1 prints elided operations, 2 prints stacktraces */
#define NPY_ELIDE_DEBUG 0
#define NPY_MAX_STACKSIZE 10
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c
index 25fae7f711..0d80a96966 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c
@@ -46,8 +46,16 @@
* q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign);
********************************************************************************/
+#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON)
+ // Due to integer 128-bit multiplication emulation, SIMD 64-bit division
+ // may not perform well on both neon and up to VSX3 compared to scalar
+ // division.
+ #define SIMD_DISABLE_DIV64_OPT
+#endif
+
#if NPY_SIMD
-#line 45
+#line 52
+#if 8 < 64 || (8 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_s8(char **args, npy_intp len)
{
@@ -107,8 +115,10 @@ simd_divide_by_scalar_contig_s8(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 45
+#line 52
+#if 16 < 64 || (16 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_s16(char **args, npy_intp len)
{
@@ -168,8 +178,10 @@ simd_divide_by_scalar_contig_s16(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 45
+#line 52
+#if 32 < 64 || (32 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_s32(char **args, npy_intp len)
{
@@ -229,8 +241,10 @@ simd_divide_by_scalar_contig_s32(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 45
+#line 52
+#if 64 < 64 || (64 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_s64(char **args, npy_intp len)
{
@@ -290,9 +304,11 @@ simd_divide_by_scalar_contig_s64(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 111
+#line 120
+#if 8 < 64 || (8 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_u8(char **args, npy_intp len)
{
@@ -314,8 +330,10 @@ simd_divide_by_scalar_contig_u8(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 111
+#line 120
+#if 16 < 64 || (16 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_u16(char **args, npy_intp len)
{
@@ -337,8 +355,10 @@ simd_divide_by_scalar_contig_u16(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 111
+#line 120
+#if 32 < 64 || (32 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_u32(char **args, npy_intp len)
{
@@ -360,8 +380,10 @@ simd_divide_by_scalar_contig_u32(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 111
+#line 120
+#if 64 < 64 || (64 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_u64(char **args, npy_intp len)
{
@@ -383,11 +405,12 @@ simd_divide_by_scalar_contig_u64(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
#if defined(NPY_HAVE_VSX4)
-#line 140
+#line 151
/*
* Computes division of 2 8-bit signed/unsigned integer vectors
*
@@ -452,7 +475,7 @@ vsx4_div_u16(npyv_u16 a, npyv_u16 b)
#define vsx4_div_u32 vec_div
#define vsx4_div_u64 vec_div
-#line 140
+#line 151
/*
* Computes division of 2 8-bit signed/unsigned integer vectors
*
@@ -518,7 +541,7 @@ vsx4_div_s16(npyv_s16 a, npyv_s16 b)
#define vsx4_div_s64 vec_div
-#line 210
+#line 221
static inline void
vsx4_simd_divide_contig_u8(char **args, npy_intp len)
{
@@ -552,7 +575,7 @@ vsx4_simd_divide_contig_u8(char **args, npy_intp len)
npyv_cleanup();
}
-#line 210
+#line 221
static inline void
vsx4_simd_divide_contig_u16(char **args, npy_intp len)
{
@@ -586,7 +609,7 @@ vsx4_simd_divide_contig_u16(char **args, npy_intp len)
npyv_cleanup();
}
-#line 210
+#line 221
static inline void
vsx4_simd_divide_contig_u32(char **args, npy_intp len)
{
@@ -620,7 +643,7 @@ vsx4_simd_divide_contig_u32(char **args, npy_intp len)
npyv_cleanup();
}
-#line 210
+#line 221
static inline void
vsx4_simd_divide_contig_u64(char **args, npy_intp len)
{
@@ -655,7 +678,7 @@ vsx4_simd_divide_contig_u64(char **args, npy_intp len)
}
-#line 249
+#line 260
static inline void
vsx4_simd_divide_contig_s8(char **args, npy_intp len)
{
@@ -724,7 +747,7 @@ vsx4_simd_divide_contig_s8(char **args, npy_intp len)
npyv_cleanup();
}
-#line 249
+#line 260
static inline void
vsx4_simd_divide_contig_s16(char **args, npy_intp len)
{
@@ -793,7 +816,7 @@ vsx4_simd_divide_contig_s16(char **args, npy_intp len)
npyv_cleanup();
}
-#line 249
+#line 260
static inline void
vsx4_simd_divide_contig_s32(char **args, npy_intp len)
{
@@ -862,7 +885,7 @@ vsx4_simd_divide_contig_s32(char **args, npy_intp len)
npyv_cleanup();
}
-#line 249
+#line 260
static inline void
vsx4_simd_divide_contig_s64(char **args, npy_intp len)
{
@@ -938,28 +961,27 @@ vsx4_simd_divide_contig_s64(char **args, npy_intp len)
** Defining ufunc inner functions
********************************************************************************/
-#line 329
+#line 340
#undef TO_SIMD_SFX
#if 0
-#line 334
+#line 345
#elif NPY_BITSOF_BYTE == 8
#define TO_SIMD_SFX(X) X##_s8
-#line 334
+#line 345
#elif NPY_BITSOF_BYTE == 16
#define TO_SIMD_SFX(X) X##_s16
-#line 334
+#line 345
#elif NPY_BITSOF_BYTE == 32
#define TO_SIMD_SFX(X) X##_s32
-#line 334
+#line 345
#elif NPY_BITSOF_BYTE == 64
#define TO_SIMD_SFX(X) X##_s64
#endif
-
-#if NPY_BITSOF_BYTE == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_BYTE == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
@@ -1042,28 +1064,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_divide_indexed)
}
-#line 329
+#line 340
#undef TO_SIMD_SFX
#if 0
-#line 334
+#line 345
#elif NPY_BITSOF_SHORT == 8
#define TO_SIMD_SFX(X) X##_s8
-#line 334
+#line 345
#elif NPY_BITSOF_SHORT == 16
#define TO_SIMD_SFX(X) X##_s16
-#line 334
+#line 345
#elif NPY_BITSOF_SHORT == 32
#define TO_SIMD_SFX(X) X##_s32
-#line 334
+#line 345
#elif NPY_BITSOF_SHORT == 64
#define TO_SIMD_SFX(X) X##_s64
#endif
-
-#if NPY_BITSOF_SHORT == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_SHORT == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
@@ -1146,28 +1167,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_divide_indexed)
}
-#line 329
+#line 340
#undef TO_SIMD_SFX
#if 0
-#line 334
+#line 345
#elif NPY_BITSOF_INT == 8
#define TO_SIMD_SFX(X) X##_s8
-#line 334
+#line 345
#elif NPY_BITSOF_INT == 16
#define TO_SIMD_SFX(X) X##_s16
-#line 334
+#line 345
#elif NPY_BITSOF_INT == 32
#define TO_SIMD_SFX(X) X##_s32
-#line 334
+#line 345
#elif NPY_BITSOF_INT == 64
#define TO_SIMD_SFX(X) X##_s64
#endif
-
-#if NPY_BITSOF_INT == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_INT == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
@@ -1250,28 +1270,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_divide_indexed)
}
-#line 329
+#line 340
#undef TO_SIMD_SFX
#if 0
-#line 334
+#line 345
#elif NPY_BITSOF_LONG == 8
#define TO_SIMD_SFX(X) X##_s8
-#line 334
+#line 345
#elif NPY_BITSOF_LONG == 16
#define TO_SIMD_SFX(X) X##_s16
-#line 334
+#line 345
#elif NPY_BITSOF_LONG == 32
#define TO_SIMD_SFX(X) X##_s32
-#line 334
+#line 345
#elif NPY_BITSOF_LONG == 64
#define TO_SIMD_SFX(X) X##_s64
#endif
-
-#if NPY_BITSOF_LONG == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_LONG == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
@@ -1354,28 +1373,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_divide_indexed)
}
-#line 329
+#line 340
#undef TO_SIMD_SFX
#if 0
-#line 334
+#line 345
#elif NPY_BITSOF_LONGLONG == 8
#define TO_SIMD_SFX(X) X##_s8
-#line 334
+#line 345
#elif NPY_BITSOF_LONGLONG == 16
#define TO_SIMD_SFX(X) X##_s16
-#line 334
+#line 345
#elif NPY_BITSOF_LONGLONG == 32
#define TO_SIMD_SFX(X) X##_s32
-#line 334
+#line 345
#elif NPY_BITSOF_LONGLONG == 64
#define TO_SIMD_SFX(X) X##_s64
#endif
-
-#if NPY_BITSOF_LONGLONG == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_LONGLONG == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
@@ -1459,22 +1477,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_divide_indexed)
-#line 429
+#line 439
#undef TO_SIMD_SFX
#if 0
-#line 434
+#line 444
#elif NPY_BITSOF_BYTE == 8
#define TO_SIMD_SFX(X) X##_u8
-#line 434
+#line 444
#elif NPY_BITSOF_BYTE == 16
#define TO_SIMD_SFX(X) X##_u16
-#line 434
+#line 444
#elif NPY_BITSOF_BYTE == 32
#define TO_SIMD_SFX(X) X##_u32
-#line 434
+#line 444
#elif NPY_BITSOF_BYTE == 64
#define TO_SIMD_SFX(X) X##_u64
@@ -1560,22 +1578,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_divide_indexed)
}
-#line 429
+#line 439
#undef TO_SIMD_SFX
#if 0
-#line 434
+#line 444
#elif NPY_BITSOF_SHORT == 8
#define TO_SIMD_SFX(X) X##_u8
-#line 434
+#line 444
#elif NPY_BITSOF_SHORT == 16
#define TO_SIMD_SFX(X) X##_u16
-#line 434
+#line 444
#elif NPY_BITSOF_SHORT == 32
#define TO_SIMD_SFX(X) X##_u32
-#line 434
+#line 444
#elif NPY_BITSOF_SHORT == 64
#define TO_SIMD_SFX(X) X##_u64
@@ -1661,22 +1679,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_divide_indexed)
}
-#line 429
+#line 439
#undef TO_SIMD_SFX
#if 0
-#line 434
+#line 444
#elif NPY_BITSOF_INT == 8
#define TO_SIMD_SFX(X) X##_u8
-#line 434
+#line 444
#elif NPY_BITSOF_INT == 16
#define TO_SIMD_SFX(X) X##_u16
-#line 434
+#line 444
#elif NPY_BITSOF_INT == 32
#define TO_SIMD_SFX(X) X##_u32
-#line 434
+#line 444
#elif NPY_BITSOF_INT == 64
#define TO_SIMD_SFX(X) X##_u64
@@ -1762,22 +1780,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_divide_indexed)
}
-#line 429
+#line 439
#undef TO_SIMD_SFX
#if 0
-#line 434
+#line 444
#elif NPY_BITSOF_LONG == 8
#define TO_SIMD_SFX(X) X##_u8
-#line 434
+#line 444
#elif NPY_BITSOF_LONG == 16
#define TO_SIMD_SFX(X) X##_u16
-#line 434
+#line 444
#elif NPY_BITSOF_LONG == 32
#define TO_SIMD_SFX(X) X##_u32
-#line 434
+#line 444
#elif NPY_BITSOF_LONG == 64
#define TO_SIMD_SFX(X) X##_u64
@@ -1863,22 +1881,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_divide_indexed)
}
-#line 429
+#line 439
#undef TO_SIMD_SFX
#if 0
-#line 434
+#line 444
#elif NPY_BITSOF_LONGLONG == 8
#define TO_SIMD_SFX(X) X##_u8
-#line 434
+#line 444
#elif NPY_BITSOF_LONGLONG == 16
#define TO_SIMD_SFX(X) X##_u16
-#line 434
+#line 444
#elif NPY_BITSOF_LONGLONG == 32
#define TO_SIMD_SFX(X) X##_u32
-#line 434
+#line 444
#elif NPY_BITSOF_LONGLONG == 64
#define TO_SIMD_SFX(X) X##_u64
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index e07bb79808..d056046e05 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -36,12 +36,20 @@
* q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign);
********************************************************************************/
+#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON)
+ // Due to integer 128-bit multiplication emulation, SIMD 64-bit division
+ // may not perform well on both neon and up to VSX3 compared to scalar
+ // division.
+ #define SIMD_DISABLE_DIV64_OPT
+#endif
+
#if NPY_SIMD
/**begin repeat
* Signed types
* #sfx = s8, s16, s32, s64#
* #len = 8, 16, 32, 64#
*/
+#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
@@ -101,6 +109,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
/**end repeat**/
/**begin repeat
@@ -108,6 +117,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
* #sfx = u8, u16, u32, u64#
* #len = 8, 16, 32, 64#
*/
+#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
@@ -129,6 +139,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
/**end repeat**/
#if defined(NPY_HAVE_VSX4)
@@ -335,8 +346,7 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
#define TO_SIMD_SFX(X) X##_s@len@
/**end repeat1**/
#endif
-
-#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_@TYPE@ == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c
index 5e9827a14c..8f446c3a8d 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c
@@ -134,18 +134,6 @@ fma_blend(__m256 x, __m256 y, __m256 ymask)
}
NPY_FINLINE __m256
-fma_invert_mask_ps(__m256 ymask)
-{
- return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
-}
-
-NPY_FINLINE __m256i
-fma_invert_mask_pd(__m256i ymask)
-{
- return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
-}
-
-NPY_FINLINE __m256
fma_get_exponent(__m256 x)
{
/*
@@ -321,18 +309,6 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
return _mm512_mask_mov_ps(x, ymask, y);
}
-NPY_FINLINE __mmask16
-avx512_invert_mask_ps(__mmask16 ymask)
-{
- return _mm512_knot(ymask);
-}
-
-NPY_FINLINE __mmask8
-avx512_invert_mask_pd(__mmask8 ymask)
-{
- return _mm512_knot(ymask);
-}
-
NPY_FINLINE __m512
avx512_get_exponent(__m512 x)
{
@@ -384,7 +360,7 @@ avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
/********************************************************************************
** Defining the SIMD kernels
********************************************************************************/
-#line 396
+#line 372
#ifdef SIMD_AVX2_FMA3
/*
* Vectorized Cody-Waite range reduction technique
@@ -683,7 +659,7 @@ simd_log_FLOAT(npy_float * op,
}
#endif // SIMD_AVX2_FMA3
-#line 396
+#line 372
#ifdef SIMD_AVX512F
/*
* Vectorized Cody-Waite range reduction technique
@@ -984,7 +960,7 @@ simd_log_FLOAT(npy_float * op,
#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
-#line 700
+#line 676
static void
simd_exp_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
@@ -1015,7 +991,7 @@ simd_exp_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
npyv_cleanup();
}
-#line 700
+#line 676
static void
simd_log_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
@@ -1298,49 +1274,49 @@ AVX512F_log_DOUBLE(npy_double * op,
__m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);
/* Load lookup table data */
- #line 985
+ #line 961
__m512d mLUT_TOP_0 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*0]));
__m512d mLUT_TAIL_0 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*0]));
-#line 985
+#line 961
__m512d mLUT_TOP_1 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*1]));
__m512d mLUT_TAIL_1 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*1]));
-#line 985
+#line 961
__m512d mLUT_TOP_2 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*2]));
__m512d mLUT_TAIL_2 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*2]));
-#line 985
+#line 961
__m512d mLUT_TOP_3 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*3]));
__m512d mLUT_TAIL_3 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*3]));
-#line 985
+#line 961
__m512d mLUT_TOP_4 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*4]));
__m512d mLUT_TAIL_4 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*4]));
-#line 985
+#line 961
__m512d mLUT_TOP_5 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*5]));
__m512d mLUT_TAIL_5 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*5]));
-#line 985
+#line 961
__m512d mLUT_TOP_6 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*6]));
__m512d mLUT_TAIL_6 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*6]));
-#line 985
+#line 961
__m512d mLUT_TOP_7 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*7]));
__m512d mLUT_TAIL_7 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*7]));
@@ -1487,7 +1463,7 @@ AVX512F_log_DOUBLE(npy_double * op,
#endif // NPY_CAN_LINK_SVML
#ifdef SIMD_AVX512_SKX
-#line 1149
+#line 1125
static inline void
AVX512_SKX_ldexp_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
@@ -1634,7 +1610,7 @@ AVX512_SKX_frexp_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *
}
}
-#line 1149
+#line 1125
static inline void
AVX512_SKX_ldexp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
@@ -1787,7 +1763,7 @@ AVX512_SKX_frexp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const
/********************************************************************************
** Defining ufunc inner functions
********************************************************************************/
-#line 1305
+#line 1281
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_exp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -1816,7 +1792,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_exp)
#endif
}
-#line 1305
+#line 1281
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -1846,7 +1822,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log)
}
-#line 1338
+#line 1314
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_exp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -1879,7 +1855,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_exp)
}
-#line 1338
+#line 1314
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -1913,7 +1889,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log)
-#line 1378
+#line 1354
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_frexp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -1945,7 +1921,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_ldexp)
}
}
-#line 1378
+#line 1354
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_frexp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
index 1fac3c150c..85dac9c20d 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
@@ -124,18 +124,6 @@ fma_blend(__m256 x, __m256 y, __m256 ymask)
}
NPY_FINLINE __m256
-fma_invert_mask_ps(__m256 ymask)
-{
- return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
-}
-
-NPY_FINLINE __m256i
-fma_invert_mask_pd(__m256i ymask)
-{
- return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
-}
-
-NPY_FINLINE __m256
fma_get_exponent(__m256 x)
{
/*
@@ -311,18 +299,6 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
return _mm512_mask_mov_ps(x, ymask, y);
}
-NPY_FINLINE __mmask16
-avx512_invert_mask_ps(__mmask16 ymask)
-{
- return _mm512_knot(ymask);
-}
-
-NPY_FINLINE __mmask8
-avx512_invert_mask_pd(__mmask8 ymask)
-{
- return _mm512_knot(ymask);
-}
-
NPY_FINLINE __m512
avx512_get_exponent(__m512 x)
{
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c
index ad8c1ef397..97a78b0e12 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c
@@ -320,7 +320,8 @@ simd_binary_ccc_max_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
const npyv_lanetype_s8 *ip2, npy_intp sip2,
@@ -483,7 +484,8 @@ simd_binary_ccc_min_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
const npyv_lanetype_s8 *ip2, npy_intp sip2,
@@ -646,7 +648,8 @@ simd_binary_ccc_maxp_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
const npyv_lanetype_s8 *ip2, npy_intp sip2,
@@ -809,7 +812,8 @@ simd_binary_ccc_minp_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
const npyv_lanetype_s8 *ip2, npy_intp sip2,
@@ -974,7 +978,8 @@ simd_binary_ccc_max_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
const npyv_lanetype_u8 *ip2, npy_intp sip2,
@@ -1137,7 +1142,8 @@ simd_binary_ccc_min_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
const npyv_lanetype_u8 *ip2, npy_intp sip2,
@@ -1300,7 +1306,8 @@ simd_binary_ccc_maxp_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
const npyv_lanetype_u8 *ip2, npy_intp sip2,
@@ -1463,7 +1470,8 @@ simd_binary_ccc_minp_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
const npyv_lanetype_u8 *ip2, npy_intp sip2,
@@ -1628,7 +1636,8 @@ simd_binary_ccc_max_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
const npyv_lanetype_s16 *ip2, npy_intp sip2,
@@ -1791,7 +1800,8 @@ simd_binary_ccc_min_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
const npyv_lanetype_s16 *ip2, npy_intp sip2,
@@ -1954,7 +1964,8 @@ simd_binary_ccc_maxp_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
const npyv_lanetype_s16 *ip2, npy_intp sip2,
@@ -2117,7 +2128,8 @@ simd_binary_ccc_minp_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
const npyv_lanetype_s16 *ip2, npy_intp sip2,
@@ -2282,7 +2294,8 @@ simd_binary_ccc_max_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
const npyv_lanetype_u16 *ip2, npy_intp sip2,
@@ -2445,7 +2458,8 @@ simd_binary_ccc_min_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
const npyv_lanetype_u16 *ip2, npy_intp sip2,
@@ -2608,7 +2622,8 @@ simd_binary_ccc_maxp_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
const npyv_lanetype_u16 *ip2, npy_intp sip2,
@@ -2771,7 +2786,8 @@ simd_binary_ccc_minp_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
const npyv_lanetype_u16 *ip2, npy_intp sip2,
@@ -2936,7 +2952,8 @@ simd_binary_ccc_max_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
const npyv_lanetype_s32 *ip2, npy_intp sip2,
@@ -3099,7 +3116,8 @@ simd_binary_ccc_min_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
const npyv_lanetype_s32 *ip2, npy_intp sip2,
@@ -3262,7 +3280,8 @@ simd_binary_ccc_maxp_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
const npyv_lanetype_s32 *ip2, npy_intp sip2,
@@ -3425,7 +3444,8 @@ simd_binary_ccc_minp_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
const npyv_lanetype_s32 *ip2, npy_intp sip2,
@@ -3590,7 +3610,8 @@ simd_binary_ccc_max_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
const npyv_lanetype_u32 *ip2, npy_intp sip2,
@@ -3753,7 +3774,8 @@ simd_binary_ccc_min_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
const npyv_lanetype_u32 *ip2, npy_intp sip2,
@@ -3916,7 +3938,8 @@ simd_binary_ccc_maxp_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
const npyv_lanetype_u32 *ip2, npy_intp sip2,
@@ -4079,7 +4102,8 @@ simd_binary_ccc_minp_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
const npyv_lanetype_u32 *ip2, npy_intp sip2,
@@ -4244,7 +4268,8 @@ simd_binary_ccc_max_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
const npyv_lanetype_s64 *ip2, npy_intp sip2,
@@ -4407,7 +4432,8 @@ simd_binary_ccc_min_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
const npyv_lanetype_s64 *ip2, npy_intp sip2,
@@ -4570,7 +4596,8 @@ simd_binary_ccc_maxp_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
const npyv_lanetype_s64 *ip2, npy_intp sip2,
@@ -4733,7 +4760,8 @@ simd_binary_ccc_minp_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
const npyv_lanetype_s64 *ip2, npy_intp sip2,
@@ -4898,7 +4926,8 @@ simd_binary_ccc_max_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
const npyv_lanetype_u64 *ip2, npy_intp sip2,
@@ -5061,7 +5090,8 @@ simd_binary_ccc_min_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
const npyv_lanetype_u64 *ip2, npy_intp sip2,
@@ -5224,7 +5254,8 @@ simd_binary_ccc_maxp_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
const npyv_lanetype_u64 *ip2, npy_intp sip2,
@@ -5387,7 +5418,8 @@ simd_binary_ccc_minp_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
const npyv_lanetype_u64 *ip2, npy_intp sip2,
@@ -5552,7 +5584,8 @@ simd_binary_ccc_max_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
const npyv_lanetype_f32 *ip2, npy_intp sip2,
@@ -5715,7 +5748,8 @@ simd_binary_ccc_min_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
const npyv_lanetype_f32 *ip2, npy_intp sip2,
@@ -5878,7 +5912,8 @@ simd_binary_ccc_maxp_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
const npyv_lanetype_f32 *ip2, npy_intp sip2,
@@ -6041,7 +6076,8 @@ simd_binary_ccc_minp_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
const npyv_lanetype_f32 *ip2, npy_intp sip2,
@@ -6206,7 +6242,8 @@ simd_binary_ccc_max_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
const npyv_lanetype_f64 *ip2, npy_intp sip2,
@@ -6369,7 +6406,8 @@ simd_binary_ccc_min_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
const npyv_lanetype_f64 *ip2, npy_intp sip2,
@@ -6532,7 +6570,8 @@ simd_binary_ccc_maxp_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
const npyv_lanetype_f64 *ip2, npy_intp sip2,
@@ -6695,7 +6734,8 @@ simd_binary_ccc_minp_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
const npyv_lanetype_f64 *ip2, npy_intp sip2,
@@ -6744,10 +6784,10 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
/*******************************************************************************
** Defining ufunc inner functions
******************************************************************************/
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -6763,7 +6803,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -6779,7 +6819,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -6795,7 +6835,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -6813,7 +6853,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -6921,22 +6961,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
*((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
*((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
*((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
*((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -6988,7 +7028,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -7096,22 +7136,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
*((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
*((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
*((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
*((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -7163,7 +7203,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -7271,22 +7311,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
*((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
*((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
*((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
*((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -7338,7 +7378,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -7446,22 +7486,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
*((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
*((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
*((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
*((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -7514,10 +7554,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -7533,7 +7573,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -7549,7 +7589,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -7565,7 +7605,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -7583,7 +7623,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -7691,22 +7731,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
*((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
*((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
*((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
*((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -7758,7 +7798,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -7866,22 +7906,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
*((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
*((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
*((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
*((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -7933,7 +7973,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -8041,22 +8081,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
*((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
*((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
*((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
*((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -8108,7 +8148,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -8216,22 +8256,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
*((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
*((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
*((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
*((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -8284,10 +8324,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -8303,7 +8343,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -8319,7 +8359,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -8335,7 +8375,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -8353,7 +8393,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -8461,22 +8501,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
*((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
*((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
*((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
*((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -8528,7 +8568,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -8636,22 +8676,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
*((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
*((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
*((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
*((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -8703,7 +8743,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -8811,22 +8851,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
*((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
*((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
*((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
*((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -8878,7 +8918,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -8986,22 +9026,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
*((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
*((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
*((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
*((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -9054,10 +9094,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -9073,7 +9113,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -9089,7 +9129,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -9105,7 +9145,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -9123,7 +9163,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -9231,22 +9271,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
*((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
*((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
*((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
*((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -9298,7 +9338,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -9406,22 +9446,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
*((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
*((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
*((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
*((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -9473,7 +9513,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -9581,22 +9621,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
*((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
*((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
*((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
*((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -9648,7 +9688,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -9756,22 +9796,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
*((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
*((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
*((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
*((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -9824,10 +9864,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -9843,7 +9883,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -9859,7 +9899,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -9875,7 +9915,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -9893,7 +9933,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -10001,22 +10041,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
*((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
*((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
*((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
*((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -10068,7 +10108,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -10176,22 +10216,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
*((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
*((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
*((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
*((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -10243,7 +10283,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -10351,22 +10391,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
*((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
*((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
*((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
*((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -10418,7 +10458,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -10526,22 +10566,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
*((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
*((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
*((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
*((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -10594,10 +10634,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -10613,7 +10653,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -10629,7 +10669,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -10645,7 +10685,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -10663,7 +10703,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -10771,22 +10811,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
*((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
*((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
*((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
*((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -10838,7 +10878,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -10946,22 +10986,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
*((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
*((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
*((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
*((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11013,7 +11053,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -11121,22 +11161,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
*((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
*((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
*((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
*((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11188,7 +11228,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -11296,22 +11336,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
*((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
*((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
*((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
*((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11364,10 +11404,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -11383,7 +11423,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -11399,7 +11439,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -11415,7 +11455,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -11433,7 +11473,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -11541,22 +11581,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
*((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
*((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
*((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
*((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11608,7 +11648,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -11716,22 +11756,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
*((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
*((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
*((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
*((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11783,7 +11823,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -11891,22 +11931,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
*((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
*((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
*((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
*((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11958,7 +11998,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -12066,22 +12106,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
*((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
*((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
*((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
*((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -12134,10 +12174,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -12153,7 +12193,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -12169,7 +12209,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -12185,7 +12225,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -12203,7 +12243,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -12311,22 +12351,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
*((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
*((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
*((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
*((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -12378,7 +12418,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -12486,22 +12526,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
*((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
*((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
*((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
*((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -12553,7 +12593,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -12661,22 +12701,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
*((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
*((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
*((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
*((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -12728,7 +12768,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -12836,22 +12876,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
*((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
*((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
*((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
*((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -12904,10 +12944,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -12923,7 +12963,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -12939,7 +12979,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -12955,7 +12995,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -12973,7 +13013,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -13081,22 +13121,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
*((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
*((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
*((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
*((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -13148,7 +13188,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -13256,22 +13296,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
*((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
*((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
*((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
*((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -13323,7 +13363,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -13431,22 +13471,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
*((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
*((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
*((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
*((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -13498,7 +13538,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -13606,22 +13646,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
*((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
*((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
*((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
*((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -13674,10 +13714,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -13693,7 +13733,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -13709,7 +13749,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -13725,7 +13765,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -13743,7 +13783,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -13851,22 +13891,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
*((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
*((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
*((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
*((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -13918,7 +13958,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -14026,22 +14066,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
*((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
*((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
*((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
*((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -14093,7 +14133,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -14201,22 +14241,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
*((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
*((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
*((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
*((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -14268,7 +14308,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -14376,22 +14416,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
*((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
*((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
*((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
*((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -14444,10 +14484,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -14463,7 +14503,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -14479,7 +14519,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -14495,7 +14535,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -14513,7 +14553,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_max_f
@@ -14621,22 +14661,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
*((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
*((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
*((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
*((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -14688,7 +14728,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_min_f
@@ -14796,22 +14836,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
*((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
*((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
*((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
*((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -14863,7 +14903,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_maxp_f
@@ -14971,22 +15011,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
*((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
*((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
*((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
*((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15038,7 +15078,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_minp_f
@@ -15146,22 +15186,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
*((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
*((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
*((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
*((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15214,10 +15254,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -15233,7 +15273,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -15249,7 +15289,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -15265,7 +15305,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -15283,7 +15323,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_max_d
@@ -15391,22 +15431,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
*((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
*((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
*((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
*((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15458,7 +15498,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_min_d
@@ -15566,22 +15606,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
*((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
*((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
*((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
*((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15633,7 +15673,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_maxp_d
@@ -15741,22 +15781,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
*((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
*((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
*((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
*((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15808,7 +15848,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_minp_d
@@ -15916,22 +15956,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
*((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
*((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
*((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
*((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15984,10 +16024,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -16003,7 +16043,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -16019,7 +16059,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -16035,7 +16075,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -16053,7 +16093,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_max_l
@@ -16161,22 +16201,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
*((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
*((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
*((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
*((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -16228,7 +16268,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_min_l
@@ -16336,22 +16376,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
*((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
*((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
*((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
*((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -16403,7 +16443,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_maxp_l
@@ -16511,22 +16551,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
*((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
*((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
*((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
*((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -16578,7 +16618,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_minp_l
@@ -16686,22 +16726,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
*((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
*((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
*((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
*((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
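(Editorial note on the hunks above: the only change in this generated file is that every "#line N" directive moves to "#line N+1", because the template below gains one comment line. A minimal, hypothetical demo -- not from the patch -- of what such a directive does: it re-points compiler diagnostics at the template file, so errors in generated code are reported against the .c.src source.)

/* demo.c -- hypothetical example, not part of the patch */
#include <stdio.h>

int main(void)
{
#line 431 "loops_minmax.dispatch.c.src"
    printf("reported as %s:%d\n", __FILE__, __LINE__);  /* -> loops_minmax.dispatch.c.src:431 */
    return 0;
}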
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src
index 236e2e2eb7..319072c01f 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -225,7 +225,8 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety
}
}
// non-contiguous for float 32/64-bit memory access
-#if @is_fp@
+#if @is_fp@ && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1,
const npyv_lanetype_@sfx@ *ip2, npy_intp sip2,
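(Editorial sketch of the pattern this one-line template guard encodes: the non-contiguous floating-point kernel is compiled only where strided vector loads/stores actually pay off, and NEON targets keep the unrolled scalar path instead. The names below -- HAVE_FAST_STRIDED_SIMD, minimum_strided, min_scalar -- are stand-ins for illustration, not NumPy's API; strides are in elements rather than bytes.)

#include <stddef.h>

static float min_scalar(float a, float b) { return (a < b) ? a : b; }

/* Strided (non-contiguous) minimum; the vector variant is only compiled
 * when strided SIMD access is known to be profitable on the target. */
static void minimum_strided(const float *in1, ptrdiff_t s1,
                            const float *in2, ptrdiff_t s2,
                            float *out, ptrdiff_t so, size_t n)
{
#ifdef HAVE_FAST_STRIDED_SIMD
    /* an npyv_loadn/npyv_storen based kernel would live here */
#endif
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {   /* 4-way unrolled scalar, as in the generated hunks above */
        out[(i + 0) * so] = min_scalar(in1[(i + 0) * s1], in2[(i + 0) * s2]);
        out[(i + 1) * so] = min_scalar(in1[(i + 1) * s1], in2[(i + 1) * s2]);
        out[(i + 2) * so] = min_scalar(in1[(i + 2) * s1], in2[(i + 2) * s2]);
        out[(i + 3) * so] = min_scalar(in1[(i + 3) * s1], in2[(i + 3) * s2]);
    }
    for (; i < n; ++i) {
        out[i * so] = min_scalar(in1[i * s1], in2[i * s2]);
    }
}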
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c
index 9d9bc64a16..30ce938d66 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c
@@ -26,8 +26,8 @@
* when there's no native FUSED support instead of fallback to libc
*/
#if NPY_SIMD_FMA3 // native support
-#line 23
-#if NPY_SIMD_F64
+#line 24
+#if NPY_SIMD_F64 && 0
/*
* Vectorized Cody-Waite range reduction technique
* Performs the reduction step x* = x - y*C in three steps:
@@ -46,8 +46,8 @@ simd_range_reduction_f64(npyv_f64 x, npyv_f64 y, npyv_f64 c1, npyv_f64 c2, npyv_
}
#endif
-#line 23
-#if NPY_SIMD_F32
+#line 24
+#if NPY_SIMD_F32 && 1
/*
* Vectorized Cody-Waite range reduction technique
* Performs the reduction step x* = x - y*C in three steps:
@@ -66,9 +66,11 @@ simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_
}
#endif
-
-#if NPY_SIMD_F64
-#line 47
+/* Disable SIMD code and revert to libm: see
+ * https://mail.python.org/archives/list/numpy-discussion@python.org/thread/C6EYZZSR4EWGVKHAZXLE7IBILRMNVK7L/
+ * for detailed discussion on this*/
+#if 0 // NPY_SIMD_F64
+#line 50
#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
NPY_FINLINE npyv_f64
#else
@@ -90,7 +92,7 @@ simd_cos_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits)
return npyv_loada_f64(out_copy);
}
-#line 47
+#line 50
#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
NPY_FINLINE npyv_f64
#else
@@ -208,7 +210,7 @@ simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd));
}
-#line 167
+#line 170
NPY_FINLINE void
simd_cos_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
{
@@ -254,7 +256,7 @@ simd_cos_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_i
npyv_cleanup();
}
-#line 167
+#line 170
NPY_FINLINE void
simd_sin_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
{
@@ -473,7 +475,7 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
#endif // NPY_SIMD_FP32
#endif // NYP_SIMD_FMA3
-#line 388
+#line 391
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_cos)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -507,7 +509,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_cos)
#endif
}
-#line 388
+#line 391
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -542,7 +544,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sin)
}
-#line 426
+#line 429
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -572,7 +574,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin)
#endif
}
-#line 426
+#line 429
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_cos)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
index f07cb70f39..31de906098 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -19,8 +19,9 @@
/**begin repeat
* #check = F64, F32#
* #sfx = f64, f32#
+ * #enable = 0, 1#
*/
-#if NPY_SIMD_@check@
+#if NPY_SIMD_@check@ && @enable@
/*
* Vectorized Cody-Waite range reduction technique
* Performs the reduction step x* = x - y*C in three steps:
@@ -39,8 +40,10 @@ simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@
}
#endif
/**end repeat**/
-
-#if NPY_SIMD_F64
+/* Disable SIMD code and revert to libm: see
+ * https://mail.python.org/archives/list/numpy-discussion@python.org/thread/C6EYZZSR4EWGVKHAZXLE7IBILRMNVK7L/
+ * for detailed discussion on this*/
+#if 0 // NPY_SIMD_F64
/**begin repeat
* #op = cos, sin#
*/
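(Editorial sketch: with the repeat parameter forced to 0 and the "#if 0 // NPY_SIMD_F64" guard in place, the double-precision sin/cos inner loops lose their SIMD body and simply walk the array calling libm. The shape below is a hedged stand-in -- ptrdiff_t substitutes for npy_intp and the dispatch plumbing is omitted.)

/* Sketch only: with the f64 SIMD kernels compiled out, the DOUBLE_sin
 * inner loop reduces to a strided walk over the operands using libm. */
#include <math.h>
#include <stddef.h>

static void double_sin_loop(char **args, const ptrdiff_t *dimensions,
                            const ptrdiff_t *steps)
{
    const char *ip = args[0];
    char *op = args[1];
    ptrdiff_t n = dimensions[0];
    for (ptrdiff_t i = 0; i < n; ++i, ip += steps[0], op += steps[1]) {
        *(double *)op = sin(*(const double *)ip);   /* libm fallback */
    }
}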
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c
index 3ea2747d9e..b2d3b0976a 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c
@@ -604,6 +604,8 @@ simd_unary_nc_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
npyv_lanetype_s8 *op, npy_intp ostride,
@@ -614,112 +616,112 @@ simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_s8 v_0 = npyv_loadn_s8(ip + 0 * vstep * istride, istride);
npyv_s8 r_0 = npyv_negative_s8(v_0);
npyv_storen_s8(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_s8 v_1 = npyv_loadn_s8(ip + 1 * vstep * istride, istride);
npyv_s8 r_1 = npyv_negative_s8(v_1);
npyv_storen_s8(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_s8 v_2 = npyv_loadn_s8(ip + 2 * vstep * istride, istride);
npyv_s8 r_2 = npyv_negative_s8(v_2);
npyv_storen_s8(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_s8 v_3 = npyv_loadn_s8(ip + 3 * vstep * istride, istride);
npyv_s8 r_3 = npyv_negative_s8(v_3);
npyv_storen_s8(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_s8 v_4 = npyv_loadn_s8(ip + 4 * vstep * istride, istride);
npyv_s8 r_4 = npyv_negative_s8(v_4);
npyv_storen_s8(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_s8 v_5 = npyv_loadn_s8(ip + 5 * vstep * istride, istride);
npyv_s8 r_5 = npyv_negative_s8(v_5);
npyv_storen_s8(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_s8 v_6 = npyv_loadn_s8(ip + 6 * vstep * istride, istride);
npyv_s8 r_6 = npyv_negative_s8(v_6);
npyv_storen_s8(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_s8 v_7 = npyv_loadn_s8(ip + 7 * vstep * istride, istride);
npyv_s8 r_7 = npyv_negative_s8(v_7);
npyv_storen_s8(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_s8 v_8 = npyv_loadn_s8(ip + 8 * vstep * istride, istride);
npyv_s8 r_8 = npyv_negative_s8(v_8);
npyv_storen_s8(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_s8 v_9 = npyv_loadn_s8(ip + 9 * vstep * istride, istride);
npyv_s8 r_9 = npyv_negative_s8(v_9);
npyv_storen_s8(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_s8 v_10 = npyv_loadn_s8(ip + 10 * vstep * istride, istride);
npyv_s8 r_10 = npyv_negative_s8(v_10);
npyv_storen_s8(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_s8 v_11 = npyv_loadn_s8(ip + 11 * vstep * istride, istride);
npyv_s8 r_11 = npyv_negative_s8(v_11);
npyv_storen_s8(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_s8 v_12 = npyv_loadn_s8(ip + 12 * vstep * istride, istride);
npyv_s8 r_12 = npyv_negative_s8(v_12);
npyv_storen_s8(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_s8 v_13 = npyv_loadn_s8(ip + 13 * vstep * istride, istride);
npyv_s8 r_13 = npyv_negative_s8(v_13);
npyv_storen_s8(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_s8 v_14 = npyv_loadn_s8(ip + 14 * vstep * istride, istride);
npyv_s8 r_14 = npyv_negative_s8(v_14);
npyv_storen_s8(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_s8 v_15 = npyv_loadn_s8(ip + 15 * vstep * istride, istride);
npyv_s8 r_15 = npyv_negative_s8(v_15);
@@ -738,6 +740,7 @@ simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
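(Editorial sketch of why the new NPY_HAVE_SSE2 guard pays off: SSE2 has no gather/scatter, so a "strided vector load" must be assembled from scalar loads and the result scattered back one lane at a time -- work a plain unrolled scalar loop does with less shuffling. The intrinsics below are illustrative of that cost, not the npyv_* kernel the guard removes.)

#include <emmintrin.h>
#include <stddef.h>

/* Strided negate written with raw SSE2: every load and store is still
 * scalar, only the subtraction itself is vectorized. */
static void negative_strided_sse2(const int *in, ptrdiff_t is,
                                  int *out, ptrdiff_t os, size_t n)
{
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        __m128i v = _mm_set_epi32(in[(i + 3) * is], in[(i + 2) * is],
                                  in[(i + 1) * is], in[(i + 0) * is]);
        __m128i r = _mm_sub_epi32(_mm_setzero_si128(), v);       /* r = 0 - v */
        out[(i + 0) * os] = _mm_cvtsi128_si32(r);
        out[(i + 1) * os] = _mm_cvtsi128_si32(_mm_srli_si128(r, 4));
        out[(i + 2) * os] = _mm_cvtsi128_si32(_mm_srli_si128(r, 8));
        out[(i + 3) * os] = _mm_cvtsi128_si32(_mm_srli_si128(r, 12));
    }
    for (; i < n; ++i)
        out[i * os] = -in[i * is];   /* scalar tail */
}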
@@ -1167,6 +1170,8 @@ simd_unary_nc_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,
npyv_lanetype_u8 *op, npy_intp ostride,
@@ -1177,112 +1182,112 @@ simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_u8 v_0 = npyv_loadn_u8(ip + 0 * vstep * istride, istride);
npyv_u8 r_0 = npyv_negative_u8(v_0);
npyv_storen_u8(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_u8 v_1 = npyv_loadn_u8(ip + 1 * vstep * istride, istride);
npyv_u8 r_1 = npyv_negative_u8(v_1);
npyv_storen_u8(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_u8 v_2 = npyv_loadn_u8(ip + 2 * vstep * istride, istride);
npyv_u8 r_2 = npyv_negative_u8(v_2);
npyv_storen_u8(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_u8 v_3 = npyv_loadn_u8(ip + 3 * vstep * istride, istride);
npyv_u8 r_3 = npyv_negative_u8(v_3);
npyv_storen_u8(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_u8 v_4 = npyv_loadn_u8(ip + 4 * vstep * istride, istride);
npyv_u8 r_4 = npyv_negative_u8(v_4);
npyv_storen_u8(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_u8 v_5 = npyv_loadn_u8(ip + 5 * vstep * istride, istride);
npyv_u8 r_5 = npyv_negative_u8(v_5);
npyv_storen_u8(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_u8 v_6 = npyv_loadn_u8(ip + 6 * vstep * istride, istride);
npyv_u8 r_6 = npyv_negative_u8(v_6);
npyv_storen_u8(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_u8 v_7 = npyv_loadn_u8(ip + 7 * vstep * istride, istride);
npyv_u8 r_7 = npyv_negative_u8(v_7);
npyv_storen_u8(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_u8 v_8 = npyv_loadn_u8(ip + 8 * vstep * istride, istride);
npyv_u8 r_8 = npyv_negative_u8(v_8);
npyv_storen_u8(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_u8 v_9 = npyv_loadn_u8(ip + 9 * vstep * istride, istride);
npyv_u8 r_9 = npyv_negative_u8(v_9);
npyv_storen_u8(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_u8 v_10 = npyv_loadn_u8(ip + 10 * vstep * istride, istride);
npyv_u8 r_10 = npyv_negative_u8(v_10);
npyv_storen_u8(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_u8 v_11 = npyv_loadn_u8(ip + 11 * vstep * istride, istride);
npyv_u8 r_11 = npyv_negative_u8(v_11);
npyv_storen_u8(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_u8 v_12 = npyv_loadn_u8(ip + 12 * vstep * istride, istride);
npyv_u8 r_12 = npyv_negative_u8(v_12);
npyv_storen_u8(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_u8 v_13 = npyv_loadn_u8(ip + 13 * vstep * istride, istride);
npyv_u8 r_13 = npyv_negative_u8(v_13);
npyv_storen_u8(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_u8 v_14 = npyv_loadn_u8(ip + 14 * vstep * istride, istride);
npyv_u8 r_14 = npyv_negative_u8(v_14);
npyv_storen_u8(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_u8 v_15 = npyv_loadn_u8(ip + 15 * vstep * istride, istride);
npyv_u8 r_15 = npyv_negative_u8(v_15);
@@ -1301,6 +1306,7 @@ simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
@@ -1730,6 +1736,8 @@ simd_unary_nc_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
npyv_lanetype_s16 *op, npy_intp ostride,
@@ -1740,112 +1748,112 @@ simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_s16 v_0 = npyv_loadn_s16(ip + 0 * vstep * istride, istride);
npyv_s16 r_0 = npyv_negative_s16(v_0);
npyv_storen_s16(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_s16 v_1 = npyv_loadn_s16(ip + 1 * vstep * istride, istride);
npyv_s16 r_1 = npyv_negative_s16(v_1);
npyv_storen_s16(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_s16 v_2 = npyv_loadn_s16(ip + 2 * vstep * istride, istride);
npyv_s16 r_2 = npyv_negative_s16(v_2);
npyv_storen_s16(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_s16 v_3 = npyv_loadn_s16(ip + 3 * vstep * istride, istride);
npyv_s16 r_3 = npyv_negative_s16(v_3);
npyv_storen_s16(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_s16 v_4 = npyv_loadn_s16(ip + 4 * vstep * istride, istride);
npyv_s16 r_4 = npyv_negative_s16(v_4);
npyv_storen_s16(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_s16 v_5 = npyv_loadn_s16(ip + 5 * vstep * istride, istride);
npyv_s16 r_5 = npyv_negative_s16(v_5);
npyv_storen_s16(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_s16 v_6 = npyv_loadn_s16(ip + 6 * vstep * istride, istride);
npyv_s16 r_6 = npyv_negative_s16(v_6);
npyv_storen_s16(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_s16 v_7 = npyv_loadn_s16(ip + 7 * vstep * istride, istride);
npyv_s16 r_7 = npyv_negative_s16(v_7);
npyv_storen_s16(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_s16 v_8 = npyv_loadn_s16(ip + 8 * vstep * istride, istride);
npyv_s16 r_8 = npyv_negative_s16(v_8);
npyv_storen_s16(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_s16 v_9 = npyv_loadn_s16(ip + 9 * vstep * istride, istride);
npyv_s16 r_9 = npyv_negative_s16(v_9);
npyv_storen_s16(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_s16 v_10 = npyv_loadn_s16(ip + 10 * vstep * istride, istride);
npyv_s16 r_10 = npyv_negative_s16(v_10);
npyv_storen_s16(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_s16 v_11 = npyv_loadn_s16(ip + 11 * vstep * istride, istride);
npyv_s16 r_11 = npyv_negative_s16(v_11);
npyv_storen_s16(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_s16 v_12 = npyv_loadn_s16(ip + 12 * vstep * istride, istride);
npyv_s16 r_12 = npyv_negative_s16(v_12);
npyv_storen_s16(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_s16 v_13 = npyv_loadn_s16(ip + 13 * vstep * istride, istride);
npyv_s16 r_13 = npyv_negative_s16(v_13);
npyv_storen_s16(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_s16 v_14 = npyv_loadn_s16(ip + 14 * vstep * istride, istride);
npyv_s16 r_14 = npyv_negative_s16(v_14);
npyv_storen_s16(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_s16 v_15 = npyv_loadn_s16(ip + 15 * vstep * istride, istride);
npyv_s16 r_15 = npyv_negative_s16(v_15);
@@ -1864,6 +1872,7 @@ simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
@@ -2293,6 +2302,8 @@ simd_unary_nc_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
npyv_lanetype_u16 *op, npy_intp ostride,
@@ -2303,112 +2314,112 @@ simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_u16 v_0 = npyv_loadn_u16(ip + 0 * vstep * istride, istride);
npyv_u16 r_0 = npyv_negative_u16(v_0);
npyv_storen_u16(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_u16 v_1 = npyv_loadn_u16(ip + 1 * vstep * istride, istride);
npyv_u16 r_1 = npyv_negative_u16(v_1);
npyv_storen_u16(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_u16 v_2 = npyv_loadn_u16(ip + 2 * vstep * istride, istride);
npyv_u16 r_2 = npyv_negative_u16(v_2);
npyv_storen_u16(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_u16 v_3 = npyv_loadn_u16(ip + 3 * vstep * istride, istride);
npyv_u16 r_3 = npyv_negative_u16(v_3);
npyv_storen_u16(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_u16 v_4 = npyv_loadn_u16(ip + 4 * vstep * istride, istride);
npyv_u16 r_4 = npyv_negative_u16(v_4);
npyv_storen_u16(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_u16 v_5 = npyv_loadn_u16(ip + 5 * vstep * istride, istride);
npyv_u16 r_5 = npyv_negative_u16(v_5);
npyv_storen_u16(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_u16 v_6 = npyv_loadn_u16(ip + 6 * vstep * istride, istride);
npyv_u16 r_6 = npyv_negative_u16(v_6);
npyv_storen_u16(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_u16 v_7 = npyv_loadn_u16(ip + 7 * vstep * istride, istride);
npyv_u16 r_7 = npyv_negative_u16(v_7);
npyv_storen_u16(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_u16 v_8 = npyv_loadn_u16(ip + 8 * vstep * istride, istride);
npyv_u16 r_8 = npyv_negative_u16(v_8);
npyv_storen_u16(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_u16 v_9 = npyv_loadn_u16(ip + 9 * vstep * istride, istride);
npyv_u16 r_9 = npyv_negative_u16(v_9);
npyv_storen_u16(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_u16 v_10 = npyv_loadn_u16(ip + 10 * vstep * istride, istride);
npyv_u16 r_10 = npyv_negative_u16(v_10);
npyv_storen_u16(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_u16 v_11 = npyv_loadn_u16(ip + 11 * vstep * istride, istride);
npyv_u16 r_11 = npyv_negative_u16(v_11);
npyv_storen_u16(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_u16 v_12 = npyv_loadn_u16(ip + 12 * vstep * istride, istride);
npyv_u16 r_12 = npyv_negative_u16(v_12);
npyv_storen_u16(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_u16 v_13 = npyv_loadn_u16(ip + 13 * vstep * istride, istride);
npyv_u16 r_13 = npyv_negative_u16(v_13);
npyv_storen_u16(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_u16 v_14 = npyv_loadn_u16(ip + 14 * vstep * istride, istride);
npyv_u16 r_14 = npyv_negative_u16(v_14);
npyv_storen_u16(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_u16 v_15 = npyv_loadn_u16(ip + 15 * vstep * istride, istride);
npyv_u16 r_15 = npyv_negative_u16(v_15);
@@ -2427,6 +2438,7 @@ simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
@@ -2856,6 +2868,8 @@ simd_unary_nc_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride,
npyv_lanetype_s32 *op, npy_intp ostride,
@@ -2866,112 +2880,112 @@ simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_s32 v_0 = npyv_loadn_s32(ip + 0 * vstep * istride, istride);
npyv_s32 r_0 = npyv_negative_s32(v_0);
npyv_storen_s32(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_s32 v_1 = npyv_loadn_s32(ip + 1 * vstep * istride, istride);
npyv_s32 r_1 = npyv_negative_s32(v_1);
npyv_storen_s32(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_s32 v_2 = npyv_loadn_s32(ip + 2 * vstep * istride, istride);
npyv_s32 r_2 = npyv_negative_s32(v_2);
npyv_storen_s32(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_s32 v_3 = npyv_loadn_s32(ip + 3 * vstep * istride, istride);
npyv_s32 r_3 = npyv_negative_s32(v_3);
npyv_storen_s32(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_s32 v_4 = npyv_loadn_s32(ip + 4 * vstep * istride, istride);
npyv_s32 r_4 = npyv_negative_s32(v_4);
npyv_storen_s32(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_s32 v_5 = npyv_loadn_s32(ip + 5 * vstep * istride, istride);
npyv_s32 r_5 = npyv_negative_s32(v_5);
npyv_storen_s32(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_s32 v_6 = npyv_loadn_s32(ip + 6 * vstep * istride, istride);
npyv_s32 r_6 = npyv_negative_s32(v_6);
npyv_storen_s32(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_s32 v_7 = npyv_loadn_s32(ip + 7 * vstep * istride, istride);
npyv_s32 r_7 = npyv_negative_s32(v_7);
npyv_storen_s32(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_s32 v_8 = npyv_loadn_s32(ip + 8 * vstep * istride, istride);
npyv_s32 r_8 = npyv_negative_s32(v_8);
npyv_storen_s32(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_s32 v_9 = npyv_loadn_s32(ip + 9 * vstep * istride, istride);
npyv_s32 r_9 = npyv_negative_s32(v_9);
npyv_storen_s32(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_s32 v_10 = npyv_loadn_s32(ip + 10 * vstep * istride, istride);
npyv_s32 r_10 = npyv_negative_s32(v_10);
npyv_storen_s32(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_s32 v_11 = npyv_loadn_s32(ip + 11 * vstep * istride, istride);
npyv_s32 r_11 = npyv_negative_s32(v_11);
npyv_storen_s32(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_s32 v_12 = npyv_loadn_s32(ip + 12 * vstep * istride, istride);
npyv_s32 r_12 = npyv_negative_s32(v_12);
npyv_storen_s32(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_s32 v_13 = npyv_loadn_s32(ip + 13 * vstep * istride, istride);
npyv_s32 r_13 = npyv_negative_s32(v_13);
npyv_storen_s32(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_s32 v_14 = npyv_loadn_s32(ip + 14 * vstep * istride, istride);
npyv_s32 r_14 = npyv_negative_s32(v_14);
npyv_storen_s32(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_s32 v_15 = npyv_loadn_s32(ip + 15 * vstep * istride, istride);
npyv_s32 r_15 = npyv_negative_s32(v_15);
@@ -2990,6 +3004,7 @@ simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
@@ -3419,6 +3434,8 @@ simd_unary_nc_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride,
npyv_lanetype_u32 *op, npy_intp ostride,
@@ -3429,112 +3446,112 @@ simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_u32 v_0 = npyv_loadn_u32(ip + 0 * vstep * istride, istride);
npyv_u32 r_0 = npyv_negative_u32(v_0);
npyv_storen_u32(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_u32 v_1 = npyv_loadn_u32(ip + 1 * vstep * istride, istride);
npyv_u32 r_1 = npyv_negative_u32(v_1);
npyv_storen_u32(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_u32 v_2 = npyv_loadn_u32(ip + 2 * vstep * istride, istride);
npyv_u32 r_2 = npyv_negative_u32(v_2);
npyv_storen_u32(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_u32 v_3 = npyv_loadn_u32(ip + 3 * vstep * istride, istride);
npyv_u32 r_3 = npyv_negative_u32(v_3);
npyv_storen_u32(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_u32 v_4 = npyv_loadn_u32(ip + 4 * vstep * istride, istride);
npyv_u32 r_4 = npyv_negative_u32(v_4);
npyv_storen_u32(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_u32 v_5 = npyv_loadn_u32(ip + 5 * vstep * istride, istride);
npyv_u32 r_5 = npyv_negative_u32(v_5);
npyv_storen_u32(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_u32 v_6 = npyv_loadn_u32(ip + 6 * vstep * istride, istride);
npyv_u32 r_6 = npyv_negative_u32(v_6);
npyv_storen_u32(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_u32 v_7 = npyv_loadn_u32(ip + 7 * vstep * istride, istride);
npyv_u32 r_7 = npyv_negative_u32(v_7);
npyv_storen_u32(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_u32 v_8 = npyv_loadn_u32(ip + 8 * vstep * istride, istride);
npyv_u32 r_8 = npyv_negative_u32(v_8);
npyv_storen_u32(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_u32 v_9 = npyv_loadn_u32(ip + 9 * vstep * istride, istride);
npyv_u32 r_9 = npyv_negative_u32(v_9);
npyv_storen_u32(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_u32 v_10 = npyv_loadn_u32(ip + 10 * vstep * istride, istride);
npyv_u32 r_10 = npyv_negative_u32(v_10);
npyv_storen_u32(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_u32 v_11 = npyv_loadn_u32(ip + 11 * vstep * istride, istride);
npyv_u32 r_11 = npyv_negative_u32(v_11);
npyv_storen_u32(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_u32 v_12 = npyv_loadn_u32(ip + 12 * vstep * istride, istride);
npyv_u32 r_12 = npyv_negative_u32(v_12);
npyv_storen_u32(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_u32 v_13 = npyv_loadn_u32(ip + 13 * vstep * istride, istride);
npyv_u32 r_13 = npyv_negative_u32(v_13);
npyv_storen_u32(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_u32 v_14 = npyv_loadn_u32(ip + 14 * vstep * istride, istride);
npyv_u32 r_14 = npyv_negative_u32(v_14);
npyv_storen_u32(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_u32 v_15 = npyv_loadn_u32(ip + 15 * vstep * istride, istride);
npyv_u32 r_15 = npyv_negative_u32(v_15);
@@ -3553,6 +3570,7 @@ simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
@@ -3982,6 +4000,8 @@ simd_unary_nc_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
npyv_lanetype_s64 *op, npy_intp ostride,
@@ -3992,112 +4012,112 @@ simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_s64 v_0 = npyv_loadn_s64(ip + 0 * vstep * istride, istride);
npyv_s64 r_0 = npyv_negative_s64(v_0);
npyv_storen_s64(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_s64 v_1 = npyv_loadn_s64(ip + 1 * vstep * istride, istride);
npyv_s64 r_1 = npyv_negative_s64(v_1);
npyv_storen_s64(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_s64 v_2 = npyv_loadn_s64(ip + 2 * vstep * istride, istride);
npyv_s64 r_2 = npyv_negative_s64(v_2);
npyv_storen_s64(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_s64 v_3 = npyv_loadn_s64(ip + 3 * vstep * istride, istride);
npyv_s64 r_3 = npyv_negative_s64(v_3);
npyv_storen_s64(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_s64 v_4 = npyv_loadn_s64(ip + 4 * vstep * istride, istride);
npyv_s64 r_4 = npyv_negative_s64(v_4);
npyv_storen_s64(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_s64 v_5 = npyv_loadn_s64(ip + 5 * vstep * istride, istride);
npyv_s64 r_5 = npyv_negative_s64(v_5);
npyv_storen_s64(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_s64 v_6 = npyv_loadn_s64(ip + 6 * vstep * istride, istride);
npyv_s64 r_6 = npyv_negative_s64(v_6);
npyv_storen_s64(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_s64 v_7 = npyv_loadn_s64(ip + 7 * vstep * istride, istride);
npyv_s64 r_7 = npyv_negative_s64(v_7);
npyv_storen_s64(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_s64 v_8 = npyv_loadn_s64(ip + 8 * vstep * istride, istride);
npyv_s64 r_8 = npyv_negative_s64(v_8);
npyv_storen_s64(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_s64 v_9 = npyv_loadn_s64(ip + 9 * vstep * istride, istride);
npyv_s64 r_9 = npyv_negative_s64(v_9);
npyv_storen_s64(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_s64 v_10 = npyv_loadn_s64(ip + 10 * vstep * istride, istride);
npyv_s64 r_10 = npyv_negative_s64(v_10);
npyv_storen_s64(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_s64 v_11 = npyv_loadn_s64(ip + 11 * vstep * istride, istride);
npyv_s64 r_11 = npyv_negative_s64(v_11);
npyv_storen_s64(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_s64 v_12 = npyv_loadn_s64(ip + 12 * vstep * istride, istride);
npyv_s64 r_12 = npyv_negative_s64(v_12);
npyv_storen_s64(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_s64 v_13 = npyv_loadn_s64(ip + 13 * vstep * istride, istride);
npyv_s64 r_13 = npyv_negative_s64(v_13);
npyv_storen_s64(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_s64 v_14 = npyv_loadn_s64(ip + 14 * vstep * istride, istride);
npyv_s64 r_14 = npyv_negative_s64(v_14);
npyv_storen_s64(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_s64 v_15 = npyv_loadn_s64(ip + 15 * vstep * istride, istride);
npyv_s64 r_15 = npyv_negative_s64(v_15);
@@ -4116,6 +4136,7 @@ simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
@@ -4545,6 +4566,8 @@ simd_unary_nc_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
npyv_lanetype_u64 *op, npy_intp ostride,
@@ -4555,112 +4578,112 @@ simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_u64 v_0 = npyv_loadn_u64(ip + 0 * vstep * istride, istride);
npyv_u64 r_0 = npyv_negative_u64(v_0);
npyv_storen_u64(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_u64 v_1 = npyv_loadn_u64(ip + 1 * vstep * istride, istride);
npyv_u64 r_1 = npyv_negative_u64(v_1);
npyv_storen_u64(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_u64 v_2 = npyv_loadn_u64(ip + 2 * vstep * istride, istride);
npyv_u64 r_2 = npyv_negative_u64(v_2);
npyv_storen_u64(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_u64 v_3 = npyv_loadn_u64(ip + 3 * vstep * istride, istride);
npyv_u64 r_3 = npyv_negative_u64(v_3);
npyv_storen_u64(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_u64 v_4 = npyv_loadn_u64(ip + 4 * vstep * istride, istride);
npyv_u64 r_4 = npyv_negative_u64(v_4);
npyv_storen_u64(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_u64 v_5 = npyv_loadn_u64(ip + 5 * vstep * istride, istride);
npyv_u64 r_5 = npyv_negative_u64(v_5);
npyv_storen_u64(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_u64 v_6 = npyv_loadn_u64(ip + 6 * vstep * istride, istride);
npyv_u64 r_6 = npyv_negative_u64(v_6);
npyv_storen_u64(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_u64 v_7 = npyv_loadn_u64(ip + 7 * vstep * istride, istride);
npyv_u64 r_7 = npyv_negative_u64(v_7);
npyv_storen_u64(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_u64 v_8 = npyv_loadn_u64(ip + 8 * vstep * istride, istride);
npyv_u64 r_8 = npyv_negative_u64(v_8);
npyv_storen_u64(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_u64 v_9 = npyv_loadn_u64(ip + 9 * vstep * istride, istride);
npyv_u64 r_9 = npyv_negative_u64(v_9);
npyv_storen_u64(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_u64 v_10 = npyv_loadn_u64(ip + 10 * vstep * istride, istride);
npyv_u64 r_10 = npyv_negative_u64(v_10);
npyv_storen_u64(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_u64 v_11 = npyv_loadn_u64(ip + 11 * vstep * istride, istride);
npyv_u64 r_11 = npyv_negative_u64(v_11);
npyv_storen_u64(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_u64 v_12 = npyv_loadn_u64(ip + 12 * vstep * istride, istride);
npyv_u64 r_12 = npyv_negative_u64(v_12);
npyv_storen_u64(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_u64 v_13 = npyv_loadn_u64(ip + 13 * vstep * istride, istride);
npyv_u64 r_13 = npyv_negative_u64(v_13);
npyv_storen_u64(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_u64 v_14 = npyv_loadn_u64(ip + 14 * vstep * istride, istride);
npyv_u64 r_14 = npyv_negative_u64(v_14);
npyv_storen_u64(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_u64 v_15 = npyv_loadn_u64(ip + 15 * vstep * istride, istride);
npyv_u64 r_15 = npyv_negative_u64(v_15);
@@ -4679,6 +4702,7 @@ simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
@@ -5108,6 +5132,8 @@ simd_unary_nc_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
npyv_lanetype_f32 *op, npy_intp ostride,
@@ -5118,112 +5144,112 @@ simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_f32 v_0 = npyv_loadn_f32(ip + 0 * vstep * istride, istride);
npyv_f32 r_0 = npyv_negative_f32(v_0);
npyv_storen_f32(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_f32 v_1 = npyv_loadn_f32(ip + 1 * vstep * istride, istride);
npyv_f32 r_1 = npyv_negative_f32(v_1);
npyv_storen_f32(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_f32 v_2 = npyv_loadn_f32(ip + 2 * vstep * istride, istride);
npyv_f32 r_2 = npyv_negative_f32(v_2);
npyv_storen_f32(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_f32 v_3 = npyv_loadn_f32(ip + 3 * vstep * istride, istride);
npyv_f32 r_3 = npyv_negative_f32(v_3);
npyv_storen_f32(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_f32 v_4 = npyv_loadn_f32(ip + 4 * vstep * istride, istride);
npyv_f32 r_4 = npyv_negative_f32(v_4);
npyv_storen_f32(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_f32 v_5 = npyv_loadn_f32(ip + 5 * vstep * istride, istride);
npyv_f32 r_5 = npyv_negative_f32(v_5);
npyv_storen_f32(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_f32 v_6 = npyv_loadn_f32(ip + 6 * vstep * istride, istride);
npyv_f32 r_6 = npyv_negative_f32(v_6);
npyv_storen_f32(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_f32 v_7 = npyv_loadn_f32(ip + 7 * vstep * istride, istride);
npyv_f32 r_7 = npyv_negative_f32(v_7);
npyv_storen_f32(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_f32 v_8 = npyv_loadn_f32(ip + 8 * vstep * istride, istride);
npyv_f32 r_8 = npyv_negative_f32(v_8);
npyv_storen_f32(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_f32 v_9 = npyv_loadn_f32(ip + 9 * vstep * istride, istride);
npyv_f32 r_9 = npyv_negative_f32(v_9);
npyv_storen_f32(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_f32 v_10 = npyv_loadn_f32(ip + 10 * vstep * istride, istride);
npyv_f32 r_10 = npyv_negative_f32(v_10);
npyv_storen_f32(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_f32 v_11 = npyv_loadn_f32(ip + 11 * vstep * istride, istride);
npyv_f32 r_11 = npyv_negative_f32(v_11);
npyv_storen_f32(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_f32 v_12 = npyv_loadn_f32(ip + 12 * vstep * istride, istride);
npyv_f32 r_12 = npyv_negative_f32(v_12);
npyv_storen_f32(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_f32 v_13 = npyv_loadn_f32(ip + 13 * vstep * istride, istride);
npyv_f32 r_13 = npyv_negative_f32(v_13);
npyv_storen_f32(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_f32 v_14 = npyv_loadn_f32(ip + 14 * vstep * istride, istride);
npyv_f32 r_14 = npyv_negative_f32(v_14);
npyv_storen_f32(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_f32 v_15 = npyv_loadn_f32(ip + 15 * vstep * istride, istride);
npyv_f32 r_15 = npyv_negative_f32(v_15);
@@ -5242,6 +5268,7 @@ simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD_F32
@@ -5671,6 +5698,8 @@ simd_unary_nc_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
npyv_lanetype_f64 *op, npy_intp ostride,
@@ -5681,112 +5710,112 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_f64 v_0 = npyv_loadn_f64(ip + 0 * vstep * istride, istride);
npyv_f64 r_0 = npyv_negative_f64(v_0);
npyv_storen_f64(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_f64 v_1 = npyv_loadn_f64(ip + 1 * vstep * istride, istride);
npyv_f64 r_1 = npyv_negative_f64(v_1);
npyv_storen_f64(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_f64 v_2 = npyv_loadn_f64(ip + 2 * vstep * istride, istride);
npyv_f64 r_2 = npyv_negative_f64(v_2);
npyv_storen_f64(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_f64 v_3 = npyv_loadn_f64(ip + 3 * vstep * istride, istride);
npyv_f64 r_3 = npyv_negative_f64(v_3);
npyv_storen_f64(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_f64 v_4 = npyv_loadn_f64(ip + 4 * vstep * istride, istride);
npyv_f64 r_4 = npyv_negative_f64(v_4);
npyv_storen_f64(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_f64 v_5 = npyv_loadn_f64(ip + 5 * vstep * istride, istride);
npyv_f64 r_5 = npyv_negative_f64(v_5);
npyv_storen_f64(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_f64 v_6 = npyv_loadn_f64(ip + 6 * vstep * istride, istride);
npyv_f64 r_6 = npyv_negative_f64(v_6);
npyv_storen_f64(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_f64 v_7 = npyv_loadn_f64(ip + 7 * vstep * istride, istride);
npyv_f64 r_7 = npyv_negative_f64(v_7);
npyv_storen_f64(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_f64 v_8 = npyv_loadn_f64(ip + 8 * vstep * istride, istride);
npyv_f64 r_8 = npyv_negative_f64(v_8);
npyv_storen_f64(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_f64 v_9 = npyv_loadn_f64(ip + 9 * vstep * istride, istride);
npyv_f64 r_9 = npyv_negative_f64(v_9);
npyv_storen_f64(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_f64 v_10 = npyv_loadn_f64(ip + 10 * vstep * istride, istride);
npyv_f64 r_10 = npyv_negative_f64(v_10);
npyv_storen_f64(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_f64 v_11 = npyv_loadn_f64(ip + 11 * vstep * istride, istride);
npyv_f64 r_11 = npyv_negative_f64(v_11);
npyv_storen_f64(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_f64 v_12 = npyv_loadn_f64(ip + 12 * vstep * istride, istride);
npyv_f64 r_12 = npyv_negative_f64(v_12);
npyv_storen_f64(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_f64 v_13 = npyv_loadn_f64(ip + 13 * vstep * istride, istride);
npyv_f64 r_13 = npyv_negative_f64(v_13);
npyv_storen_f64(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_f64 v_14 = npyv_loadn_f64(ip + 14 * vstep * istride, istride);
npyv_f64 r_14 = npyv_negative_f64(v_14);
npyv_storen_f64(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_f64 v_15 = npyv_loadn_f64(ip + 15 * vstep * istride, istride);
npyv_f64 r_15 = npyv_negative_f64(v_15);
@@ -5805,6 +5834,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD_F64
@@ -5814,10 +5844,10 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
/********************************************************************************
** Defining ufunc inner functions
********************************************************************************/
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -5833,7 +5863,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -5849,7 +5879,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -5865,7 +5895,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -5883,7 +5913,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -5921,8 +5951,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -5945,97 +5975,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_ubyte in_0 = *((const npy_ubyte *)(ip + 0 * istep));
*((npy_ubyte *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_ubyte in_1 = *((const npy_ubyte *)(ip + 1 * istep));
*((npy_ubyte *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_ubyte in_2 = *((const npy_ubyte *)(ip + 2 * istep));
*((npy_ubyte *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_ubyte in_3 = *((const npy_ubyte *)(ip + 3 * istep));
*((npy_ubyte *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_ubyte in_4 = *((const npy_ubyte *)(ip + 4 * istep));
*((npy_ubyte *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_ubyte in_5 = *((const npy_ubyte *)(ip + 5 * istep));
*((npy_ubyte *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_ubyte in_6 = *((const npy_ubyte *)(ip + 6 * istep));
*((npy_ubyte *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_ubyte in_7 = *((const npy_ubyte *)(ip + 7 * istep));
*((npy_ubyte *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_ubyte in_8 = *((const npy_ubyte *)(ip + 8 * istep));
*((npy_ubyte *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_ubyte in_9 = *((const npy_ubyte *)(ip + 9 * istep));
*((npy_ubyte *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_ubyte in_10 = *((const npy_ubyte *)(ip + 10 * istep));
*((npy_ubyte *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_ubyte in_11 = *((const npy_ubyte *)(ip + 11 * istep));
*((npy_ubyte *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_ubyte in_12 = *((const npy_ubyte *)(ip + 12 * istep));
*((npy_ubyte *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_ubyte in_13 = *((const npy_ubyte *)(ip + 13 * istep));
*((npy_ubyte *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_ubyte in_14 = *((const npy_ubyte *)(ip + 14 * istep));
*((npy_ubyte *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_ubyte in_15 = *((const npy_ubyte *)(ip + 15 * istep));
*((npy_ubyte *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -6055,10 +6085,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -6074,7 +6104,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -6090,7 +6120,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -6106,7 +6136,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -6124,7 +6154,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -6162,8 +6192,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -6186,97 +6216,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_ushort in_0 = *((const npy_ushort *)(ip + 0 * istep));
*((npy_ushort *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_ushort in_1 = *((const npy_ushort *)(ip + 1 * istep));
*((npy_ushort *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_ushort in_2 = *((const npy_ushort *)(ip + 2 * istep));
*((npy_ushort *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_ushort in_3 = *((const npy_ushort *)(ip + 3 * istep));
*((npy_ushort *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_ushort in_4 = *((const npy_ushort *)(ip + 4 * istep));
*((npy_ushort *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_ushort in_5 = *((const npy_ushort *)(ip + 5 * istep));
*((npy_ushort *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_ushort in_6 = *((const npy_ushort *)(ip + 6 * istep));
*((npy_ushort *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_ushort in_7 = *((const npy_ushort *)(ip + 7 * istep));
*((npy_ushort *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_ushort in_8 = *((const npy_ushort *)(ip + 8 * istep));
*((npy_ushort *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_ushort in_9 = *((const npy_ushort *)(ip + 9 * istep));
*((npy_ushort *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_ushort in_10 = *((const npy_ushort *)(ip + 10 * istep));
*((npy_ushort *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_ushort in_11 = *((const npy_ushort *)(ip + 11 * istep));
*((npy_ushort *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_ushort in_12 = *((const npy_ushort *)(ip + 12 * istep));
*((npy_ushort *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_ushort in_13 = *((const npy_ushort *)(ip + 13 * istep));
*((npy_ushort *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_ushort in_14 = *((const npy_ushort *)(ip + 14 * istep));
*((npy_ushort *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_ushort in_15 = *((const npy_ushort *)(ip + 15 * istep));
*((npy_ushort *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -6296,10 +6326,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -6315,7 +6345,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -6331,7 +6361,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -6347,7 +6377,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -6365,7 +6395,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -6403,8 +6433,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -6427,97 +6457,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_uint in_0 = *((const npy_uint *)(ip + 0 * istep));
*((npy_uint *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_uint in_1 = *((const npy_uint *)(ip + 1 * istep));
*((npy_uint *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_uint in_2 = *((const npy_uint *)(ip + 2 * istep));
*((npy_uint *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_uint in_3 = *((const npy_uint *)(ip + 3 * istep));
*((npy_uint *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_uint in_4 = *((const npy_uint *)(ip + 4 * istep));
*((npy_uint *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_uint in_5 = *((const npy_uint *)(ip + 5 * istep));
*((npy_uint *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_uint in_6 = *((const npy_uint *)(ip + 6 * istep));
*((npy_uint *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_uint in_7 = *((const npy_uint *)(ip + 7 * istep));
*((npy_uint *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_uint in_8 = *((const npy_uint *)(ip + 8 * istep));
*((npy_uint *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_uint in_9 = *((const npy_uint *)(ip + 9 * istep));
*((npy_uint *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_uint in_10 = *((const npy_uint *)(ip + 10 * istep));
*((npy_uint *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_uint in_11 = *((const npy_uint *)(ip + 11 * istep));
*((npy_uint *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_uint in_12 = *((const npy_uint *)(ip + 12 * istep));
*((npy_uint *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_uint in_13 = *((const npy_uint *)(ip + 13 * istep));
*((npy_uint *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_uint in_14 = *((const npy_uint *)(ip + 14 * istep));
*((npy_uint *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_uint in_15 = *((const npy_uint *)(ip + 15 * istep));
*((npy_uint *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -6537,10 +6567,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -6556,7 +6586,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -6572,7 +6602,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -6588,7 +6618,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -6606,7 +6636,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -6644,8 +6674,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -6668,97 +6698,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_ulong in_0 = *((const npy_ulong *)(ip + 0 * istep));
*((npy_ulong *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_ulong in_1 = *((const npy_ulong *)(ip + 1 * istep));
*((npy_ulong *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_ulong in_2 = *((const npy_ulong *)(ip + 2 * istep));
*((npy_ulong *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_ulong in_3 = *((const npy_ulong *)(ip + 3 * istep));
*((npy_ulong *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_ulong in_4 = *((const npy_ulong *)(ip + 4 * istep));
*((npy_ulong *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_ulong in_5 = *((const npy_ulong *)(ip + 5 * istep));
*((npy_ulong *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_ulong in_6 = *((const npy_ulong *)(ip + 6 * istep));
*((npy_ulong *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_ulong in_7 = *((const npy_ulong *)(ip + 7 * istep));
*((npy_ulong *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_ulong in_8 = *((const npy_ulong *)(ip + 8 * istep));
*((npy_ulong *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_ulong in_9 = *((const npy_ulong *)(ip + 9 * istep));
*((npy_ulong *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_ulong in_10 = *((const npy_ulong *)(ip + 10 * istep));
*((npy_ulong *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_ulong in_11 = *((const npy_ulong *)(ip + 11 * istep));
*((npy_ulong *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_ulong in_12 = *((const npy_ulong *)(ip + 12 * istep));
*((npy_ulong *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_ulong in_13 = *((const npy_ulong *)(ip + 13 * istep));
*((npy_ulong *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_ulong in_14 = *((const npy_ulong *)(ip + 14 * istep));
*((npy_ulong *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_ulong in_15 = *((const npy_ulong *)(ip + 15 * istep));
*((npy_ulong *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -6778,10 +6808,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -6797,7 +6827,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -6813,7 +6843,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -6829,7 +6859,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -6847,7 +6877,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -6885,8 +6915,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -6909,97 +6939,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_ulonglong in_0 = *((const npy_ulonglong *)(ip + 0 * istep));
*((npy_ulonglong *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_ulonglong in_1 = *((const npy_ulonglong *)(ip + 1 * istep));
*((npy_ulonglong *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_ulonglong in_2 = *((const npy_ulonglong *)(ip + 2 * istep));
*((npy_ulonglong *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_ulonglong in_3 = *((const npy_ulonglong *)(ip + 3 * istep));
*((npy_ulonglong *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_ulonglong in_4 = *((const npy_ulonglong *)(ip + 4 * istep));
*((npy_ulonglong *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_ulonglong in_5 = *((const npy_ulonglong *)(ip + 5 * istep));
*((npy_ulonglong *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_ulonglong in_6 = *((const npy_ulonglong *)(ip + 6 * istep));
*((npy_ulonglong *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_ulonglong in_7 = *((const npy_ulonglong *)(ip + 7 * istep));
*((npy_ulonglong *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_ulonglong in_8 = *((const npy_ulonglong *)(ip + 8 * istep));
*((npy_ulonglong *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_ulonglong in_9 = *((const npy_ulonglong *)(ip + 9 * istep));
*((npy_ulonglong *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_ulonglong in_10 = *((const npy_ulonglong *)(ip + 10 * istep));
*((npy_ulonglong *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_ulonglong in_11 = *((const npy_ulonglong *)(ip + 11 * istep));
*((npy_ulonglong *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_ulonglong in_12 = *((const npy_ulonglong *)(ip + 12 * istep));
*((npy_ulonglong *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_ulonglong in_13 = *((const npy_ulonglong *)(ip + 13 * istep));
*((npy_ulonglong *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_ulonglong in_14 = *((const npy_ulonglong *)(ip + 14 * istep));
*((npy_ulonglong *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_ulonglong in_15 = *((const npy_ulonglong *)(ip + 15 * istep));
*((npy_ulonglong *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -7019,10 +7049,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -7038,7 +7068,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -7054,7 +7084,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -7070,7 +7100,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -7088,7 +7118,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -7126,8 +7156,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -7150,97 +7180,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_byte in_0 = *((const npy_byte *)(ip + 0 * istep));
*((npy_byte *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_byte in_1 = *((const npy_byte *)(ip + 1 * istep));
*((npy_byte *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_byte in_2 = *((const npy_byte *)(ip + 2 * istep));
*((npy_byte *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_byte in_3 = *((const npy_byte *)(ip + 3 * istep));
*((npy_byte *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_byte in_4 = *((const npy_byte *)(ip + 4 * istep));
*((npy_byte *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_byte in_5 = *((const npy_byte *)(ip + 5 * istep));
*((npy_byte *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_byte in_6 = *((const npy_byte *)(ip + 6 * istep));
*((npy_byte *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_byte in_7 = *((const npy_byte *)(ip + 7 * istep));
*((npy_byte *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_byte in_8 = *((const npy_byte *)(ip + 8 * istep));
*((npy_byte *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_byte in_9 = *((const npy_byte *)(ip + 9 * istep));
*((npy_byte *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_byte in_10 = *((const npy_byte *)(ip + 10 * istep));
*((npy_byte *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_byte in_11 = *((const npy_byte *)(ip + 11 * istep));
*((npy_byte *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_byte in_12 = *((const npy_byte *)(ip + 12 * istep));
*((npy_byte *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_byte in_13 = *((const npy_byte *)(ip + 13 * istep));
*((npy_byte *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_byte in_14 = *((const npy_byte *)(ip + 14 * istep));
*((npy_byte *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_byte in_15 = *((const npy_byte *)(ip + 15 * istep));
*((npy_byte *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -7260,10 +7290,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -7279,7 +7309,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -7295,7 +7325,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -7311,7 +7341,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -7329,7 +7359,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -7367,8 +7397,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -7391,97 +7421,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_short in_0 = *((const npy_short *)(ip + 0 * istep));
*((npy_short *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_short in_1 = *((const npy_short *)(ip + 1 * istep));
*((npy_short *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_short in_2 = *((const npy_short *)(ip + 2 * istep));
*((npy_short *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_short in_3 = *((const npy_short *)(ip + 3 * istep));
*((npy_short *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_short in_4 = *((const npy_short *)(ip + 4 * istep));
*((npy_short *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_short in_5 = *((const npy_short *)(ip + 5 * istep));
*((npy_short *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_short in_6 = *((const npy_short *)(ip + 6 * istep));
*((npy_short *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_short in_7 = *((const npy_short *)(ip + 7 * istep));
*((npy_short *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_short in_8 = *((const npy_short *)(ip + 8 * istep));
*((npy_short *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_short in_9 = *((const npy_short *)(ip + 9 * istep));
*((npy_short *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_short in_10 = *((const npy_short *)(ip + 10 * istep));
*((npy_short *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_short in_11 = *((const npy_short *)(ip + 11 * istep));
*((npy_short *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_short in_12 = *((const npy_short *)(ip + 12 * istep));
*((npy_short *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_short in_13 = *((const npy_short *)(ip + 13 * istep));
*((npy_short *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_short in_14 = *((const npy_short *)(ip + 14 * istep));
*((npy_short *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_short in_15 = *((const npy_short *)(ip + 15 * istep));
*((npy_short *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -7501,10 +7531,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -7520,7 +7550,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -7536,7 +7566,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -7552,7 +7582,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -7570,7 +7600,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -7608,8 +7638,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -7632,97 +7662,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_int in_0 = *((const npy_int *)(ip + 0 * istep));
*((npy_int *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_int in_1 = *((const npy_int *)(ip + 1 * istep));
*((npy_int *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_int in_2 = *((const npy_int *)(ip + 2 * istep));
*((npy_int *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_int in_3 = *((const npy_int *)(ip + 3 * istep));
*((npy_int *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_int in_4 = *((const npy_int *)(ip + 4 * istep));
*((npy_int *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_int in_5 = *((const npy_int *)(ip + 5 * istep));
*((npy_int *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_int in_6 = *((const npy_int *)(ip + 6 * istep));
*((npy_int *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_int in_7 = *((const npy_int *)(ip + 7 * istep));
*((npy_int *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_int in_8 = *((const npy_int *)(ip + 8 * istep));
*((npy_int *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_int in_9 = *((const npy_int *)(ip + 9 * istep));
*((npy_int *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_int in_10 = *((const npy_int *)(ip + 10 * istep));
*((npy_int *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_int in_11 = *((const npy_int *)(ip + 11 * istep));
*((npy_int *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_int in_12 = *((const npy_int *)(ip + 12 * istep));
*((npy_int *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_int in_13 = *((const npy_int *)(ip + 13 * istep));
*((npy_int *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_int in_14 = *((const npy_int *)(ip + 14 * istep));
*((npy_int *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_int in_15 = *((const npy_int *)(ip + 15 * istep));
*((npy_int *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -7742,10 +7772,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -7761,7 +7791,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -7777,7 +7807,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -7793,7 +7823,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -7811,7 +7841,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -7849,8 +7879,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -7873,97 +7903,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_long in_0 = *((const npy_long *)(ip + 0 * istep));
*((npy_long *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_long in_1 = *((const npy_long *)(ip + 1 * istep));
*((npy_long *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_long in_2 = *((const npy_long *)(ip + 2 * istep));
*((npy_long *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_long in_3 = *((const npy_long *)(ip + 3 * istep));
*((npy_long *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_long in_4 = *((const npy_long *)(ip + 4 * istep));
*((npy_long *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_long in_5 = *((const npy_long *)(ip + 5 * istep));
*((npy_long *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_long in_6 = *((const npy_long *)(ip + 6 * istep));
*((npy_long *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_long in_7 = *((const npy_long *)(ip + 7 * istep));
*((npy_long *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_long in_8 = *((const npy_long *)(ip + 8 * istep));
*((npy_long *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_long in_9 = *((const npy_long *)(ip + 9 * istep));
*((npy_long *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_long in_10 = *((const npy_long *)(ip + 10 * istep));
*((npy_long *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_long in_11 = *((const npy_long *)(ip + 11 * istep));
*((npy_long *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_long in_12 = *((const npy_long *)(ip + 12 * istep));
*((npy_long *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_long in_13 = *((const npy_long *)(ip + 13 * istep));
*((npy_long *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_long in_14 = *((const npy_long *)(ip + 14 * istep));
*((npy_long *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_long in_15 = *((const npy_long *)(ip + 15 * istep));
*((npy_long *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -7983,10 +8013,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -8002,7 +8032,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -8018,7 +8048,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -8034,7 +8064,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -8052,7 +8082,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -8090,8 +8120,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -8114,97 +8144,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_longlong in_0 = *((const npy_longlong *)(ip + 0 * istep));
*((npy_longlong *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_longlong in_1 = *((const npy_longlong *)(ip + 1 * istep));
*((npy_longlong *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_longlong in_2 = *((const npy_longlong *)(ip + 2 * istep));
*((npy_longlong *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_longlong in_3 = *((const npy_longlong *)(ip + 3 * istep));
*((npy_longlong *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_longlong in_4 = *((const npy_longlong *)(ip + 4 * istep));
*((npy_longlong *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_longlong in_5 = *((const npy_longlong *)(ip + 5 * istep));
*((npy_longlong *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_longlong in_6 = *((const npy_longlong *)(ip + 6 * istep));
*((npy_longlong *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_longlong in_7 = *((const npy_longlong *)(ip + 7 * istep));
*((npy_longlong *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_longlong in_8 = *((const npy_longlong *)(ip + 8 * istep));
*((npy_longlong *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_longlong in_9 = *((const npy_longlong *)(ip + 9 * istep));
*((npy_longlong *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_longlong in_10 = *((const npy_longlong *)(ip + 10 * istep));
*((npy_longlong *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_longlong in_11 = *((const npy_longlong *)(ip + 11 * istep));
*((npy_longlong *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_longlong in_12 = *((const npy_longlong *)(ip + 12 * istep));
*((npy_longlong *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_longlong in_13 = *((const npy_longlong *)(ip + 13 * istep));
*((npy_longlong *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_longlong in_14 = *((const npy_longlong *)(ip + 14 * istep));
*((npy_longlong *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_longlong in_15 = *((const npy_longlong *)(ip + 15 * istep));
*((npy_longlong *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -8224,10 +8254,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -8243,7 +8273,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -8259,7 +8289,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -8275,7 +8305,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -8293,7 +8323,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -8331,8 +8361,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -8355,97 +8385,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_float in_0 = *((const npy_float *)(ip + 0 * istep));
*((npy_float *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_float in_1 = *((const npy_float *)(ip + 1 * istep));
*((npy_float *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_float in_2 = *((const npy_float *)(ip + 2 * istep));
*((npy_float *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_float in_3 = *((const npy_float *)(ip + 3 * istep));
*((npy_float *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_float in_4 = *((const npy_float *)(ip + 4 * istep));
*((npy_float *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_float in_5 = *((const npy_float *)(ip + 5 * istep));
*((npy_float *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_float in_6 = *((const npy_float *)(ip + 6 * istep));
*((npy_float *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_float in_7 = *((const npy_float *)(ip + 7 * istep));
*((npy_float *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_float in_8 = *((const npy_float *)(ip + 8 * istep));
*((npy_float *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_float in_9 = *((const npy_float *)(ip + 9 * istep));
*((npy_float *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_float in_10 = *((const npy_float *)(ip + 10 * istep));
*((npy_float *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_float in_11 = *((const npy_float *)(ip + 11 * istep));
*((npy_float *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_float in_12 = *((const npy_float *)(ip + 12 * istep));
*((npy_float *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_float in_13 = *((const npy_float *)(ip + 13 * istep));
*((npy_float *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_float in_14 = *((const npy_float *)(ip + 14 * istep));
*((npy_float *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_float in_15 = *((const npy_float *)(ip + 15 * istep));
*((npy_float *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -8465,10 +8495,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -8484,7 +8514,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -8500,7 +8530,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -8516,7 +8546,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -8534,7 +8564,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -8572,8 +8602,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -8596,97 +8626,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_double in_0 = *((const npy_double *)(ip + 0 * istep));
*((npy_double *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_double in_1 = *((const npy_double *)(ip + 1 * istep));
*((npy_double *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_double in_2 = *((const npy_double *)(ip + 2 * istep));
*((npy_double *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_double in_3 = *((const npy_double *)(ip + 3 * istep));
*((npy_double *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_double in_4 = *((const npy_double *)(ip + 4 * istep));
*((npy_double *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_double in_5 = *((const npy_double *)(ip + 5 * istep));
*((npy_double *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_double in_6 = *((const npy_double *)(ip + 6 * istep));
*((npy_double *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_double in_7 = *((const npy_double *)(ip + 7 * istep));
*((npy_double *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_double in_8 = *((const npy_double *)(ip + 8 * istep));
*((npy_double *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_double in_9 = *((const npy_double *)(ip + 9 * istep));
*((npy_double *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_double in_10 = *((const npy_double *)(ip + 10 * istep));
*((npy_double *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_double in_11 = *((const npy_double *)(ip + 11 * istep));
*((npy_double *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_double in_12 = *((const npy_double *)(ip + 12 * istep));
*((npy_double *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_double in_13 = *((const npy_double *)(ip + 13 * istep));
*((npy_double *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_double in_14 = *((const npy_double *)(ip + 14 * istep));
*((npy_double *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_double in_15 = *((const npy_double *)(ip + 15 * istep));
*((npy_double *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -8706,10 +8736,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -8725,7 +8755,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -8741,7 +8771,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -8757,7 +8787,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -8775,7 +8805,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -8813,8 +8843,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -8837,97 +8867,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_longdouble in_0 = *((const npy_longdouble *)(ip + 0 * istep));
*((npy_longdouble *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_longdouble in_1 = *((const npy_longdouble *)(ip + 1 * istep));
*((npy_longdouble *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_longdouble in_2 = *((const npy_longdouble *)(ip + 2 * istep));
*((npy_longdouble *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_longdouble in_3 = *((const npy_longdouble *)(ip + 3 * istep));
*((npy_longdouble *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_longdouble in_4 = *((const npy_longdouble *)(ip + 4 * istep));
*((npy_longdouble *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_longdouble in_5 = *((const npy_longdouble *)(ip + 5 * istep));
*((npy_longdouble *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_longdouble in_6 = *((const npy_longdouble *)(ip + 6 * istep));
*((npy_longdouble *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_longdouble in_7 = *((const npy_longdouble *)(ip + 7 * istep));
*((npy_longdouble *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_longdouble in_8 = *((const npy_longdouble *)(ip + 8 * istep));
*((npy_longdouble *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_longdouble in_9 = *((const npy_longdouble *)(ip + 9 * istep));
*((npy_longdouble *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_longdouble in_10 = *((const npy_longdouble *)(ip + 10 * istep));
*((npy_longdouble *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_longdouble in_11 = *((const npy_longdouble *)(ip + 11 * istep));
*((npy_longdouble *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_longdouble in_12 = *((const npy_longdouble *)(ip + 12 * istep));
*((npy_longdouble *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_longdouble in_13 = *((const npy_longdouble *)(ip + 13 * istep));
*((npy_longdouble *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_longdouble in_14 = *((const npy_longdouble *)(ip + 14 * istep));
*((npy_longdouble *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_longdouble in_15 = *((const npy_longdouble *)(ip + 15 * istep));
*((npy_longdouble *)(op + 15 * ostep)) = scalar_negative(in_15);
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src
index 1e2a81d20b..bfe4d892d0 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src
@@ -195,6 +195,8 @@ simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
npyv_lanetype_@sfx@ *op, npy_intp ostride,
@@ -226,6 +228,7 @@ simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
*op = scalar_@intrin@(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // @supports_ncontig@
#undef UNROLL
#endif // @simd_chk@
@@ -314,8 +317,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_@intrin@)(
diff --git a/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py b/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py
index 1bbdde1317..a88189e03e 100644
--- a/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py
+++ b/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py
@@ -477,7 +477,14 @@ class TestBoolCmp:
self.signd[self.ed] *= -1.
self.signf[1::6][self.ef[1::6]] = -np.inf
self.signd[1::6][self.ed[1::6]] = -np.inf
- self.signf[3::6][self.ef[3::6]] = -np.nan
+ # On RISC-V, many operations that produce NaNs, such as converting
+ # a -NaN from f64 to f32, return a canonical NaN. The canonical
+ # NaNs are always positive. See section 11.3 NaN Generation and
+ # Propagation of the RISC-V Unprivileged ISA for more details.
+ # We disable the float32 sign test on riscv64 for -np.nan as the sign
+ # of the NaN will be lost when it's converted to a float32.
+ if platform.processor() != 'riscv64':
+ self.signf[3::6][self.ef[3::6]] = -np.nan
self.signd[3::6][self.ed[3::6]] = -np.nan
self.signf[4::6][self.ef[4::6]] = -0.
self.signd[4::6][self.ed[4::6]] = -0.
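The riscv64 special case in the hunk above reflects the canonical-NaN behaviour the comment describes: narrowing a negative NaN to float32 can return the canonical, positive NaN, so the sign bit of -np.nan is not reliable after the conversion. A minimal standalone sketch of that probe (illustration only, not part of the patch; it assumes NumPy and the same platform.processor() check the test itself uses):

import platform
import numpy as np

# Narrow a negative double NaN to float32. On riscv64 the FPU returns the
# canonical (positive) NaN, so the sign bit is dropped; on most other
# platforms the sign of the NaN survives the conversion.
neg_nan32 = np.float32(np.float64(-np.nan))
print(platform.processor(), bool(np.signbit(neg_nan32)))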
diff --git a/contrib/python/numpy/py3/numpy/f2py/crackfortran.py b/contrib/python/numpy/py3/numpy/f2py/crackfortran.py
index 8d3fc27608..8d3fc27608 100755..100644
--- a/contrib/python/numpy/py3/numpy/f2py/crackfortran.py
+++ b/contrib/python/numpy/py3/numpy/f2py/crackfortran.py
diff --git a/contrib/python/numpy/py3/numpy/f2py/f2py2e.py b/contrib/python/numpy/py3/numpy/f2py/f2py2e.py
index ce22b2d8a9..ce22b2d8a9 100755..100644
--- a/contrib/python/numpy/py3/numpy/f2py/f2py2e.py
+++ b/contrib/python/numpy/py3/numpy/f2py/f2py2e.py
diff --git a/contrib/python/numpy/py3/numpy/f2py/rules.py b/contrib/python/numpy/py3/numpy/f2py/rules.py
index 009365e047..009365e047 100755..100644
--- a/contrib/python/numpy/py3/numpy/f2py/rules.py
+++ b/contrib/python/numpy/py3/numpy/f2py/rules.py
diff --git a/contrib/python/numpy/py3/numpy/f2py/tests/util.py b/contrib/python/numpy/py3/numpy/f2py/tests/util.py
index 75b257cdb8..6ed6c0855f 100644
--- a/contrib/python/numpy/py3/numpy/f2py/tests/util.py
+++ b/contrib/python/numpy/py3/numpy/f2py/tests/util.py
@@ -20,6 +20,7 @@ import contextlib
import numpy
from pathlib import Path
+from numpy.compat import asstr
from numpy._utils import asunicode
from numpy.testing import temppath, IS_WASM
from importlib import import_module
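
For context on the import added above: numpy.compat.asstr is the legacy bytes-to-str helper (the numpy.compat namespace is deprecated as of 1.26), while numpy._utils.asunicode is its current counterpart. A rough, hedged sketch of how the two public helpers behave; the values are made up and nothing here is taken from this test file:

    from numpy.compat import asstr        # legacy helper, deprecated namespace
    from numpy._utils import asunicode    # current equivalent

    print(asstr(b"gfortran 13.2"))        # bytes are decoded to str
    print(asunicode(b"gfortran 13.2"))    # same idea via the newer helper
    print(asstr("already text"))          # non-bytes input passes through as str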
diff --git a/contrib/python/numpy/py3/numpy/lib/function_base.py b/contrib/python/numpy/py3/numpy/lib/function_base.py
index e75aca1e58..a3dab04d33 100644
--- a/contrib/python/numpy/py3/numpy/lib/function_base.py
+++ b/contrib/python/numpy/py3/numpy/lib/function_base.py
@@ -4655,7 +4655,8 @@ def _lerp(a, b, t, out=None):
diff_b_a = subtract(b, a)
# asanyarray is a stop-gap until gh-13105
lerp_interpolation = asanyarray(add(a, diff_b_a * t, out=out))
- subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t >= 0.5)
+ subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t >= 0.5,
+ casting='unsafe', dtype=type(lerp_interpolation.dtype))
if lerp_interpolation.ndim == 0 and out is None:
lerp_interpolation = lerp_interpolation[()] # unpack 0d arrays
return lerp_interpolation
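
On the _lerp hunk above: the second subtract rewrites the interpolant as b - (b - a) * (1 - t) wherever t >= 0.5, the form anchored at the nearer endpoint, and the newly added casting='unsafe'/dtype arguments keep that in-place update valid when the output dtype differs from the intermediate. A small worked illustration of the two algebraically equivalent forms, using made-up float values rather than anything from the function itself:

    import numpy as np

    a, b = np.float64(1.0), np.float64(3.5)

    for t in (0.25, 0.5, 0.75, 1.0):
        forward = a + (b - a) * t            # form used for t < 0.5
        backward = b - (b - a) * (1 - t)     # form used for t >= 0.5
        # Both express the same lerp; anchoring at the nearer endpoint limits
        # rounding error and makes lerp(a, b, 1.0) return b exactly.
        print(t, forward, backward)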
diff --git a/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py b/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py
index 11e44630e7..2bb73b6003 100644
--- a/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py
+++ b/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py
@@ -3606,6 +3606,10 @@ class TestQuantile:
assert_equal(q, Fraction(7, 2))
assert_equal(type(q), Fraction)
+ q = np.quantile(x, .5)
+ assert_equal(q, 1.75)
+ assert_equal(type(q), np.float64)
+
q = np.quantile(x, Fraction(1, 2))
assert_equal(q, Fraction(7, 4))
assert_equal(type(q), Fraction)
diff --git a/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp b/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp
index 0c0b35e9c0..3b5effe14a 100644
--- a/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp
+++ b/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp
@@ -2259,7 +2259,7 @@ process_geev_results(GEEV_PARAMS_t<typ> *params, scalar_trait)
}
}
-
+#if 0
static inline fortran_int
call_geev(GEEV_PARAMS_t<fortran_complex>* params)
{
@@ -2275,6 +2275,8 @@ call_geev(GEEV_PARAMS_t<fortran_complex>* params)
&rv);
return rv;
}
+#endif
+
static inline fortran_int
call_geev(GEEV_PARAMS_t<fortran_doublecomplex>* params)
{
diff --git a/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py b/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py
index c1d4cdff8f..c1d4cdff8f 100755..100644
--- a/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py
+++ b/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py
diff --git a/contrib/python/numpy/py3/numpy/testing/setup.py b/contrib/python/numpy/py3/numpy/testing/setup.py
index 6f203e8727..6f203e8727 100755..100644
--- a/contrib/python/numpy/py3/numpy/testing/setup.py
+++ b/contrib/python/numpy/py3/numpy/testing/setup.py
diff --git a/contrib/python/numpy/py3/numpy/tests/test_warnings.py b/contrib/python/numpy/py3/numpy/tests/test_warnings.py
index ee5124c5d5..df90fcef8c 100644
--- a/contrib/python/numpy/py3/numpy/tests/test_warnings.py
+++ b/contrib/python/numpy/py3/numpy/tests/test_warnings.py
@@ -5,7 +5,6 @@ all of these occurrences but should catch almost all.
import pytest
from pathlib import Path
-import sys
import ast
import tokenize
import numpy
@@ -33,7 +32,7 @@ class FindFuncs(ast.NodeVisitor):
ast.NodeVisitor.generic_visit(self, node)
if p.ls[-1] == 'simplefilter' or p.ls[-1] == 'filterwarnings':
- if node.args[0].s == "ignore":
+ if node.args[0].value == "ignore":
raise AssertionError(
"warnings should have an appropriate stacklevel; found in "
"{} on line {}".format(self.__filename, node.lineno))
@@ -57,8 +56,6 @@ class FindFuncs(ast.NodeVisitor):
@pytest.mark.slow
-@pytest.mark.skipif(sys.version_info >= (3, 12),
- reason="Deprecation warning in ast")
def test_warning_calls():
# combined "ignore" and stacklevel error
base = Path(numpy.__file__).parent
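
The test_warnings change above switches from the legacy ast attribute .s to ast.Constant.value, which is what current Python parsers produce for literals; the old attribute triggers deprecation warnings under Python 3.12, which is also why the version-based skipif could be dropped. A quick standalone check of the attribute the test now relies on (the parsed source string is just an example):

    import ast

    tree = ast.parse('warnings.filterwarnings("ignore")')
    call = tree.body[0].value            # the ast.Call node
    first_arg = call.args[0]             # an ast.Constant since Python 3.8

    print(type(first_arg).__name__)      # 'Constant'
    print(first_arg.value)               # 'ignore'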
diff --git a/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py b/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py
index 68c6f5d03f..6f778e5515 100644
--- a/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py
+++ b/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py
@@ -86,8 +86,6 @@ def strip_func(match: re.Match[str]) -> str:
return match.groups()[1]
-@pytest.mark.slow
-@pytest.mark.skipif(NO_MYPY, reason="Mypy is not installed")
@pytest.fixture(scope="module", autouse=True)
def run_mypy() -> None:
"""Clears the cache and run mypy before running any of the typing tests.
diff --git a/contrib/python/numpy/py3/numpy/version.py b/contrib/python/numpy/py3/numpy/version.py
index 692240a486..e96055ea6d 100644
--- a/contrib/python/numpy/py3/numpy/version.py
+++ b/contrib/python/numpy/py3/numpy/version.py
@@ -1,5 +1,5 @@
-version = "1.26.3"
+version = "1.26.4"
__version__ = version
full_version = version
diff --git a/contrib/python/numpy/py3/ya.make b/contrib/python/numpy/py3/ya.make
index 92042220c3..0eb98bef02 100644
--- a/contrib/python/numpy/py3/ya.make
+++ b/contrib/python/numpy/py3/ya.make
@@ -2,7 +2,7 @@ PY3_LIBRARY()
PROVIDES(numpy)
-VERSION(1.26.3)
+VERSION(1.26.4)
LICENSE(BSD-3-Clause)