author    robot-piglet <robot-piglet@yandex-team.com>	2024-02-06 15:03:31 +0300
committer Alexander Smirnov <alex@ydb.tech>	2024-02-09 19:18:18 +0300
commit    303fba2f20dfd94603064b607671b787de12624e (patch)
tree      54c22fad0bcd67bf52f78822a3ee7714fd9dbf40
parent    c7854274198c4168e713732ceb13e7075fce89b0 (diff)
download  ydb-303fba2f20dfd94603064b607671b787de12624e.tar.gz
Intermediate changes
-rw-r--r--  contrib/python/numpy/include/numpy/core/feature_detection_misc.h | 5
-rw-r--r--  contrib/python/numpy/py3/.dist-info/METADATA | 7
-rw-r--r--  contrib/python/numpy/py3/LICENSES_bundled.txt | 5
-rw-r--r--  contrib/python/numpy/py3/numpy/__config__.py.in | 22
-rw-r--r--  contrib/python/numpy/py3/numpy/array_api/__init__.py | 2
-rw-r--r--  contrib/python/numpy/py3/numpy/array_api/linalg.py | 6
-rw-r--r--  contrib/python/numpy/py3/numpy/core/code_generators/genapi.py | 9
-rw-r--r--  contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py | 7
-rw-r--r--  contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py | 7
-rw-r--r--  contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h | 3
-rw-r--r--  contrib/python/numpy/py3/numpy/core/generate_numpy_api.py | 251
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c | 4
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c | 5
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c | 3
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c | 174
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 14
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c | 64
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src | 24
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c | 770
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src | 3
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c | 30
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src | 9
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c | 974
-rw-r--r--  contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src | 7
-rw-r--r--  contrib/python/numpy/py3/numpy/core/tests/test_numeric.py | 9
-rw-r--r--[-rwxr-xr-x]  contrib/python/numpy/py3/numpy/f2py/crackfortran.py | 0
-rw-r--r--[-rwxr-xr-x]  contrib/python/numpy/py3/numpy/f2py/f2py2e.py | 0
-rw-r--r--[-rwxr-xr-x]  contrib/python/numpy/py3/numpy/f2py/rules.py | 0
-rw-r--r--  contrib/python/numpy/py3/numpy/f2py/tests/util.py | 1
-rw-r--r--  contrib/python/numpy/py3/numpy/lib/function_base.py | 3
-rw-r--r--  contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py | 4
-rw-r--r--  contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp | 4
-rw-r--r--[-rwxr-xr-x]  contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py | 0
-rw-r--r--[-rwxr-xr-x]  contrib/python/numpy/py3/numpy/testing/setup.py | 0
-rw-r--r--  contrib/python/numpy/py3/numpy/tests/test_warnings.py | 5
-rw-r--r--  contrib/python/numpy/py3/numpy/typing/tests/test_typing.py | 2
-rw-r--r--  contrib/python/numpy/py3/numpy/version.py | 2
-rw-r--r--  contrib/python/numpy/py3/ya.make | 2
-rw-r--r--  yt/yt/client/api/rpc_proxy/client_base.cpp | 1
-rw-r--r--  yt/yt/client/api/rpc_proxy/config.cpp | 3
-rw-r--r--  yt/yt/client/api/rpc_proxy/config.h | 2
-rw-r--r--  yt/yt/client/api/rpc_proxy/transaction_impl.cpp | 1
-rw-r--r--  yt/yt/client/cache/rpc.cpp | 3
-rw-r--r--  yt/yt/core/rpc/client-inl.h | 15
-rw-r--r--  yt/yt/core/rpc/client.cpp | 14
-rw-r--r--  yt/yt/core/rpc/client.h | 6
-rw-r--r--  yt/yt/core/rpc/grpc/channel.cpp | 16
-rw-r--r--  yt/yt/core/rpc/grpc/helpers.cpp | 40
-rw-r--r--  yt/yt/core/rpc/grpc/helpers.h | 2
-rw-r--r--  yt/yt/core/rpc/grpc/public.cpp | 4
-rw-r--r--  yt/yt/core/rpc/grpc/public.h | 2
-rw-r--r--  yt/yt/core/rpc/grpc/server.cpp | 57
-rw-r--r--  yt/yt/core/rpc/message.cpp | 29
-rw-r--r--  yt/yt/core/rpc/message_format.cpp | 20
-rw-r--r--  yt/yt/core/rpc/server_detail.cpp | 39
-rw-r--r--  yt/yt/core/rpc/server_detail.h | 8
-rw-r--r--  yt/yt/core/rpc/service.h | 5
-rw-r--r--  yt/yt/core/rpc/service_detail.cpp | 61
-rw-r--r--  yt/yt/core/rpc/service_detail.h | 19
-rw-r--r--  yt/yt/core/rpc/unittests/rpc_ut.cpp | 2
-rw-r--r--  yt/yt/core/ytree/ypath_client.cpp | 7
-rw-r--r--  yt/yt/core/ytree/ypath_client.h | 2
-rw-r--r--  yt/yt/library/tracing/jaeger/tracer.cpp | 1
-rw-r--r--  yt/yt_proto/yt/client/cache/proto/config.proto | 2
-rw-r--r--  yt/yt_proto/yt/core/rpc/proto/rpc.proto | 1
65 files changed, 1516 insertions, 1283 deletions
diff --git a/contrib/python/numpy/include/numpy/core/feature_detection_misc.h b/contrib/python/numpy/include/numpy/core/feature_detection_misc.h
new file mode 100644
index 0000000000..0e6447fbd1
--- /dev/null
+++ b/contrib/python/numpy/include/numpy/core/feature_detection_misc.h
@@ -0,0 +1,5 @@
+#ifdef USE_PYTHON3
+#include <contrib/python/numpy/py3/numpy/core/feature_detection_misc.h>
+#else
+#error #include <contrib/python/numpy/py2/numpy/core/feature_detection_misc.h>
+#endif
diff --git a/contrib/python/numpy/py3/.dist-info/METADATA b/contrib/python/numpy/py3/.dist-info/METADATA
index 5e515025ec..8246dc4ed3 100644
--- a/contrib/python/numpy/py3/.dist-info/METADATA
+++ b/contrib/python/numpy/py3/.dist-info/METADATA
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: numpy
-Version: 1.26.3
+Version: 1.26.4
Summary: Fundamental package for array computing in Python
Home-page: https://numpy.org
Author: Travis E. Oliphant et al.
@@ -70,11 +70,6 @@ License: Copyright (c) 2005-2023, NumPy Developers.
License: Apache 2.0
For license text, see vendored-meson/meson/COPYING
- Name: meson-python
- Files: vendored-meson/meson-python/*
- License: MIT
- For license text, see vendored-meson/meson-python/LICENSE
-
Name: spin
Files: .spin/cmds.py
License: BSD-3
diff --git a/contrib/python/numpy/py3/LICENSES_bundled.txt b/contrib/python/numpy/py3/LICENSES_bundled.txt
index 26faf7ff30..aae0e774fa 100644
--- a/contrib/python/numpy/py3/LICENSES_bundled.txt
+++ b/contrib/python/numpy/py3/LICENSES_bundled.txt
@@ -30,11 +30,6 @@ Files: vendored-meson/meson/*
License: Apache 2.0
For license text, see vendored-meson/meson/COPYING
-Name: meson-python
-Files: vendored-meson/meson-python/*
-License: MIT
- For license text, see vendored-meson/meson-python/LICENSE
-
Name: spin
Files: .spin/cmds.py
License: BSD-3
diff --git a/contrib/python/numpy/py3/numpy/__config__.py.in b/contrib/python/numpy/py3/numpy/__config__.py.in
index 6c6c21cb85..f3b32c28c1 100644
--- a/contrib/python/numpy/py3/numpy/__config__.py.in
+++ b/contrib/python/numpy/py3/numpy/__config__.py.in
@@ -32,21 +32,27 @@ CONFIG = _cleanup(
"Compilers": {
"c": {
"name": "@C_COMP@",
- "linker": "@C_COMP_LINKER_ID@",
+ "linker": r"@C_COMP_LINKER_ID@",
"version": "@C_COMP_VERSION@",
- "commands": "@C_COMP_CMD_ARRAY@",
+ "commands": r"@C_COMP_CMD_ARRAY@",
+ "args": r"@C_COMP_ARGS@",
+ "linker args": r"@C_COMP_LINK_ARGS@",
},
"cython": {
"name": "@CYTHON_COMP@",
- "linker": "@CYTHON_COMP_LINKER_ID@",
+ "linker": r"@CYTHON_COMP_LINKER_ID@",
"version": "@CYTHON_COMP_VERSION@",
- "commands": "@CYTHON_COMP_CMD_ARRAY@",
+ "commands": r"@CYTHON_COMP_CMD_ARRAY@",
+ "args": r"@CYTHON_COMP_ARGS@",
+ "linker args": r"@CYTHON_COMP_LINK_ARGS@",
},
"c++": {
"name": "@CPP_COMP@",
- "linker": "@CPP_COMP_LINKER_ID@",
+ "linker": r"@CPP_COMP_LINKER_ID@",
"version": "@CPP_COMP_VERSION@",
- "commands": "@CPP_COMP_CMD_ARRAY@",
+ "commands": r"@CPP_COMP_CMD_ARRAY@",
+ "args": r"@CPP_COMP_ARGS@",
+ "linker args": r"@CPP_COMP_LINK_ARGS@",
},
},
"Machine Information": {
@@ -72,7 +78,7 @@ CONFIG = _cleanup(
"detection method": "@BLAS_TYPE_NAME@",
"include directory": r"@BLAS_INCLUDEDIR@",
"lib directory": r"@BLAS_LIBDIR@",
- "openblas configuration": "@BLAS_OPENBLAS_CONFIG@",
+ "openblas configuration": r"@BLAS_OPENBLAS_CONFIG@",
"pc file directory": r"@BLAS_PCFILEDIR@",
},
"lapack": {
@@ -82,7 +88,7 @@ CONFIG = _cleanup(
"detection method": "@LAPACK_TYPE_NAME@",
"include directory": r"@LAPACK_INCLUDEDIR@",
"lib directory": r"@LAPACK_LIBDIR@",
- "openblas configuration": "@LAPACK_OPENBLAS_CONFIG@",
+ "openblas configuration": r"@LAPACK_OPENBLAS_CONFIG@",
"pc file directory": r"@LAPACK_PCFILEDIR@",
},
},
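A minimal sketch (not part of the patch) of what the r"..." substitutions above guard against: presumably, a substituted compiler path or argument list may contain backslashes (for example Windows paths), and a plain string literal in the generated __config__.py would read them as escape sequences. The values below are purely illustrative:

    plain = "C:\temp\numpy"    # "\t" and "\n" are interpreted as tab and newline
    raw   = r"C:\temp\numpy"   # raw literal keeps the backslashes verbatim
    assert "\t" in plain and "\t" not in raw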
diff --git a/contrib/python/numpy/py3/numpy/array_api/__init__.py b/contrib/python/numpy/py3/numpy/array_api/__init__.py
index 77f227882e..edc3205fd5 100644
--- a/contrib/python/numpy/py3/numpy/array_api/__init__.py
+++ b/contrib/python/numpy/py3/numpy/array_api/__init__.py
@@ -127,7 +127,7 @@ __all__ = ["__array_api_version__"]
from ._constants import e, inf, nan, pi, newaxis
-__all__ += ["e", "inf", "nan", "pi"]
+__all__ += ["e", "inf", "nan", "pi", "newaxis"]
from ._creation_functions import (
asarray,
diff --git a/contrib/python/numpy/py3/numpy/array_api/linalg.py b/contrib/python/numpy/py3/numpy/array_api/linalg.py
index 09af9dfc3a..c18360f6e6 100644
--- a/contrib/python/numpy/py3/numpy/array_api/linalg.py
+++ b/contrib/python/numpy/py3/numpy/array_api/linalg.py
@@ -9,6 +9,7 @@ from ._dtypes import (
complex128
)
from ._manipulation_functions import reshape
+from ._elementwise_functions import conj
from ._array_object import Array
from ..core.numeric import normalize_axis_tuple
@@ -53,7 +54,10 @@ def cholesky(x: Array, /, *, upper: bool = False) -> Array:
raise TypeError('Only floating-point dtypes are allowed in cholesky')
L = np.linalg.cholesky(x._array)
if upper:
- return Array._new(L).mT
+ U = Array._new(L).mT
+ if U.dtype in [complex64, complex128]:
+ U = conj(U)
+ return U
return Array._new(L)
# Note: cross is the numpy top-level namespace, not np.linalg
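A minimal NumPy sketch (not part of the patch) of the behavior the cholesky hunk above fixes: for a complex Hermitian input, the upper factor must be the conjugate transpose of L, so transposing alone is not enough. Plain numpy names are used here rather than the array_api wrappers:

    import numpy as np
    A = np.array([[2.0, 1.0 + 1.0j],
                  [1.0 - 1.0j, 3.0]])      # Hermitian positive-definite
    L = np.linalg.cholesky(A)              # lower factor, A == L @ L.conj().T
    U = L.conj().T                         # what upper=True should return
    assert np.allclose(U.conj().T @ U, A)  # conjugate-transposed factor reconstructs A
    assert not np.allclose(L.T @ L, A)     # a plain transpose does not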
diff --git a/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py b/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py
index 2cdaba52d9..d9d7862b28 100644
--- a/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py
+++ b/contrib/python/numpy/py3/numpy/core/code_generators/genapi.py
@@ -304,15 +304,6 @@ def find_functions(filename, tag='API'):
fo.close()
return functions
-def should_rebuild(targets, source_files):
- from distutils.dep_util import newer_group
- for t in targets:
- if not os.path.exists(t):
- return True
- sources = API_FILES + list(source_files) + [__file__]
- if newer_group(sources, targets[0], missing='newer'):
- return True
- return False
def write_file(filename, data):
"""
diff --git a/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py b/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py
index ae38c4efc2..640bae9e5f 100644
--- a/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py
+++ b/contrib/python/numpy/py3/numpy/core/code_generators/generate_numpy_api.py
@@ -148,12 +148,7 @@ def generate_api(output_dir, force=False):
targets = (h_file, c_file)
sources = numpy_api.multiarray_api
-
- if (not force and not genapi.should_rebuild(targets, [numpy_api.__file__, __file__])):
- return targets
- else:
- do_generate_api(targets, sources)
-
+ do_generate_api(targets, sources)
return targets
def do_generate_api(targets, sources):
diff --git a/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py b/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py
index e03299a52c..3734cbd6a0 100644
--- a/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py
+++ b/contrib/python/numpy/py3/numpy/core/code_generators/generate_ufunc_api.py
@@ -125,12 +125,7 @@ def generate_api(output_dir, force=False):
targets = (h_file, c_file)
sources = ['ufunc_api_order.txt']
-
- if (not force and not genapi.should_rebuild(targets, sources + [__file__])):
- return targets
- else:
- do_generate_api(targets, sources)
-
+ do_generate_api(targets, sources)
return targets
def do_generate_api(targets, sources):
diff --git a/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h b/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h
index bc14d16d04..d8bbfbd8b2 100644
--- a/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h
+++ b/contrib/python/numpy/py3/numpy/core/feature_detection_stdio.h
@@ -1,6 +1,9 @@
+#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
+#if 0 /* Only for setup_common.py, not the C compiler */
off_t ftello(FILE *stream);
int fseeko(FILE *stream, off_t offset, int whence);
int fallocate(int, int, off_t, off_t);
+#endif
diff --git a/contrib/python/numpy/py3/numpy/core/generate_numpy_api.py b/contrib/python/numpy/py3/numpy/core/generate_numpy_api.py
new file mode 100644
index 0000000000..640bae9e5f
--- /dev/null
+++ b/contrib/python/numpy/py3/numpy/core/generate_numpy_api.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+import os
+import argparse
+
+import genapi
+from genapi import \
+ TypeApi, GlobalVarApi, FunctionApi, BoolValuesApi
+
+import numpy_api
+
+# use annotated api when running under cpychecker
+h_template = r"""
+#if defined(_MULTIARRAYMODULE) || defined(WITH_CPYCHECKER_STEALS_REFERENCE_TO_ARG_ATTRIBUTE)
+
+typedef struct {
+ PyObject_HEAD
+ npy_bool obval;
+} PyBoolScalarObject;
+
+extern NPY_NO_EXPORT PyTypeObject PyArrayMapIter_Type;
+extern NPY_NO_EXPORT PyTypeObject PyArrayNeighborhoodIter_Type;
+extern NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2];
+
+%s
+
+#else
+
+#if defined(PY_ARRAY_UNIQUE_SYMBOL)
+#define PyArray_API PY_ARRAY_UNIQUE_SYMBOL
+#endif
+
+#if defined(NO_IMPORT) || defined(NO_IMPORT_ARRAY)
+extern void **PyArray_API;
+#else
+#if defined(PY_ARRAY_UNIQUE_SYMBOL)
+void **PyArray_API;
+#else
+static void **PyArray_API=NULL;
+#endif
+#endif
+
+%s
+
+#if !defined(NO_IMPORT_ARRAY) && !defined(NO_IMPORT)
+static int
+_import_array(void)
+{
+ int st;
+ PyObject *numpy = PyImport_ImportModule("numpy.core._multiarray_umath");
+ PyObject *c_api = NULL;
+
+ if (numpy == NULL) {
+ return -1;
+ }
+ c_api = PyObject_GetAttrString(numpy, "_ARRAY_API");
+ Py_DECREF(numpy);
+ if (c_api == NULL) {
+ return -1;
+ }
+
+ if (!PyCapsule_CheckExact(c_api)) {
+ PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is not PyCapsule object");
+ Py_DECREF(c_api);
+ return -1;
+ }
+ PyArray_API = (void **)PyCapsule_GetPointer(c_api, NULL);
+ Py_DECREF(c_api);
+ if (PyArray_API == NULL) {
+ PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is NULL pointer");
+ return -1;
+ }
+
+ /* Perform runtime check of C API version */
+ if (NPY_VERSION != PyArray_GetNDArrayCVersion()) {
+ PyErr_Format(PyExc_RuntimeError, "module compiled against "\
+ "ABI version 0x%%x but this version of numpy is 0x%%x", \
+ (int) NPY_VERSION, (int) PyArray_GetNDArrayCVersion());
+ return -1;
+ }
+ if (NPY_FEATURE_VERSION > PyArray_GetNDArrayCFeatureVersion()) {
+ PyErr_Format(PyExc_RuntimeError, "module compiled against "\
+ "API version 0x%%x but this version of numpy is 0x%%x . "\
+ "Check the section C-API incompatibility at the "\
+ "Troubleshooting ImportError section at "\
+ "https://numpy.org/devdocs/user/troubleshooting-importerror.html"\
+ "#c-api-incompatibility "\
+ "for indications on how to solve this problem .", \
+ (int) NPY_FEATURE_VERSION, (int) PyArray_GetNDArrayCFeatureVersion());
+ return -1;
+ }
+
+ /*
+ * Perform runtime check of endianness and check it matches the one set by
+ * the headers (npy_endian.h) as a safeguard
+ */
+ st = PyArray_GetEndianness();
+ if (st == NPY_CPU_UNKNOWN_ENDIAN) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "FATAL: module compiled as unknown endian");
+ return -1;
+ }
+#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN
+ if (st != NPY_CPU_BIG) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "FATAL: module compiled as big endian, but "
+ "detected different endianness at runtime");
+ return -1;
+ }
+#elif NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
+ if (st != NPY_CPU_LITTLE) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "FATAL: module compiled as little endian, but "
+ "detected different endianness at runtime");
+ return -1;
+ }
+#endif
+
+ return 0;
+}
+
+#define import_array() {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return NULL; } }
+
+#define import_array1(ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return ret; } }
+
+#define import_array2(msg, ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, msg); return ret; } }
+
+#endif
+
+#endif
+"""
+
+
+c_template = r"""
+/* These pointers will be stored in the C-object for use in other
+ extension modules
+*/
+
+void *PyArray_API[] = {
+%s
+};
+"""
+
+def generate_api(output_dir, force=False):
+ basename = 'multiarray_api'
+
+ h_file = os.path.join(output_dir, '__%s.h' % basename)
+ c_file = os.path.join(output_dir, '__%s.c' % basename)
+ targets = (h_file, c_file)
+
+ sources = numpy_api.multiarray_api
+ do_generate_api(targets, sources)
+ return targets
+
+def do_generate_api(targets, sources):
+ header_file = targets[0]
+ c_file = targets[1]
+
+ global_vars = sources[0]
+ scalar_bool_values = sources[1]
+ types_api = sources[2]
+ multiarray_funcs = sources[3]
+
+ multiarray_api = sources[:]
+
+ module_list = []
+ extension_list = []
+ init_list = []
+
+ # Check multiarray api indexes
+ multiarray_api_index = genapi.merge_api_dicts(multiarray_api)
+ genapi.check_api_dict(multiarray_api_index)
+
+ numpyapi_list = genapi.get_api_functions('NUMPY_API',
+ multiarray_funcs)
+
+ # Create dict name -> *Api instance
+ api_name = 'PyArray_API'
+ multiarray_api_dict = {}
+ for f in numpyapi_list:
+ name = f.name
+ index = multiarray_funcs[name][0]
+ annotations = multiarray_funcs[name][1:]
+ multiarray_api_dict[f.name] = FunctionApi(f.name, index, annotations,
+ f.return_type,
+ f.args, api_name)
+
+ for name, val in global_vars.items():
+ index, type = val
+ multiarray_api_dict[name] = GlobalVarApi(name, index, type, api_name)
+
+ for name, val in scalar_bool_values.items():
+ index = val[0]
+ multiarray_api_dict[name] = BoolValuesApi(name, index, api_name)
+
+ for name, val in types_api.items():
+ index = val[0]
+ internal_type = None if len(val) == 1 else val[1]
+ multiarray_api_dict[name] = TypeApi(
+ name, index, 'PyTypeObject', api_name, internal_type)
+
+ if len(multiarray_api_dict) != len(multiarray_api_index):
+ keys_dict = set(multiarray_api_dict.keys())
+ keys_index = set(multiarray_api_index.keys())
+ raise AssertionError(
+ "Multiarray API size mismatch - "
+ "index has extra keys {}, dict has extra keys {}"
+ .format(keys_index - keys_dict, keys_dict - keys_index)
+ )
+
+ extension_list = []
+ for name, index in genapi.order_dict(multiarray_api_index):
+ api_item = multiarray_api_dict[name]
+ extension_list.append(api_item.define_from_array_api_string())
+ init_list.append(api_item.array_api_define())
+ module_list.append(api_item.internal_define())
+
+ # Write to header
+ s = h_template % ('\n'.join(module_list), '\n'.join(extension_list))
+ genapi.write_file(header_file, s)
+
+ # Write to c-code
+ s = c_template % ',\n'.join(init_list)
+ genapi.write_file(c_file, s)
+
+ return targets
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-o",
+ "--outdir",
+ type=str,
+ help="Path to the output directory"
+ )
+ parser.add_argument(
+ "-i",
+ "--ignore",
+ type=str,
+ help="An ignored input - may be useful to add a "
+ "dependency between custom targets"
+ )
+ args = parser.parse_args()
+
+ outdir_abs = os.path.join(os.getcwd(), args.outdir)
+
+ generate_api(outdir_abs)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c b/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c
index 64a85f6fb2..bd149f8b43 100644
--- a/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c
+++ b/contrib/python/numpy/py3/numpy/core/src/common/npy_cpu_features.c
@@ -656,7 +656,7 @@ npy__cpu_init_features(void)
/***************** ARM ******************/
-#elif defined(__arm__) || defined(__aarch64__)
+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64)
static inline void
npy__cpu_init_features_arm8(void)
@@ -781,7 +781,7 @@ npy__cpu_init_features(void)
return;
#endif
// We have nothing else todo
-#if defined(NPY_HAVE_ASIMD) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
+#if defined(NPY_HAVE_ASIMD) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8) || defined(_M_ARM64)
#if defined(NPY_HAVE_FPHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
npy__cpu_have[NPY_CPU_FEATURE_FPHP] = 1;
#endif
diff --git a/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c b/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c
index 60c1a1b9b0..8ec0aeefb7 100644
--- a/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c
+++ b/contrib/python/numpy/py3/numpy/core/src/multiarray/convert.c
@@ -23,8 +23,9 @@
#include "array_coercion.h"
#include "refcount.h"
-int
-fallocate(int fd, int mode, off_t offset, off_t len);
+#if defined(HAVE_FALLOCATE) && defined(__linux__)
+#include <fcntl.h>
+#endif
/*
* allocate nbytes of diskspace for file fp
diff --git a/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c b/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c
index 15257804bc..a38f90e76c 100644
--- a/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c
+++ b/contrib/python/numpy/py3/numpy/core/src/multiarray/temp_elide.c
@@ -59,6 +59,9 @@
*/
#if defined HAVE_BACKTRACE && defined HAVE_DLFCN_H && ! defined PYPY_VERSION
+
+#include <feature_detection_misc.h>
+
/* 1 prints elided operations, 2 prints stacktraces */
#define NPY_ELIDE_DEBUG 0
#define NPY_MAX_STACKSIZE 10
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c
index 25fae7f711..0d80a96966 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c
@@ -46,8 +46,16 @@
* q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign);
********************************************************************************/
+#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON)
+ // Due to integer 128-bit multiplication emulation, SIMD 64-bit division
+ // may not perform well on both neon and up to VSX3 compared to scalar
+ // division.
+ #define SIMD_DISABLE_DIV64_OPT
+#endif
+
#if NPY_SIMD
-#line 45
+#line 52
+#if 8 < 64 || (8 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_s8(char **args, npy_intp len)
{
@@ -107,8 +115,10 @@ simd_divide_by_scalar_contig_s8(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 45
+#line 52
+#if 16 < 64 || (16 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_s16(char **args, npy_intp len)
{
@@ -168,8 +178,10 @@ simd_divide_by_scalar_contig_s16(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 45
+#line 52
+#if 32 < 64 || (32 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_s32(char **args, npy_intp len)
{
@@ -229,8 +241,10 @@ simd_divide_by_scalar_contig_s32(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 45
+#line 52
+#if 64 < 64 || (64 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_s64(char **args, npy_intp len)
{
@@ -290,9 +304,11 @@ simd_divide_by_scalar_contig_s64(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 111
+#line 120
+#if 8 < 64 || (8 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_u8(char **args, npy_intp len)
{
@@ -314,8 +330,10 @@ simd_divide_by_scalar_contig_u8(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 111
+#line 120
+#if 16 < 64 || (16 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_u16(char **args, npy_intp len)
{
@@ -337,8 +355,10 @@ simd_divide_by_scalar_contig_u16(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 111
+#line 120
+#if 32 < 64 || (32 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_u32(char **args, npy_intp len)
{
@@ -360,8 +380,10 @@ simd_divide_by_scalar_contig_u32(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
-#line 111
+#line 120
+#if 64 < 64 || (64 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_u64(char **args, npy_intp len)
{
@@ -383,11 +405,12 @@ simd_divide_by_scalar_contig_u64(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
#if defined(NPY_HAVE_VSX4)
-#line 140
+#line 151
/*
* Computes division of 2 8-bit signed/unsigned integer vectors
*
@@ -452,7 +475,7 @@ vsx4_div_u16(npyv_u16 a, npyv_u16 b)
#define vsx4_div_u32 vec_div
#define vsx4_div_u64 vec_div
-#line 140
+#line 151
/*
* Computes division of 2 8-bit signed/unsigned integer vectors
*
@@ -518,7 +541,7 @@ vsx4_div_s16(npyv_s16 a, npyv_s16 b)
#define vsx4_div_s64 vec_div
-#line 210
+#line 221
static inline void
vsx4_simd_divide_contig_u8(char **args, npy_intp len)
{
@@ -552,7 +575,7 @@ vsx4_simd_divide_contig_u8(char **args, npy_intp len)
npyv_cleanup();
}
-#line 210
+#line 221
static inline void
vsx4_simd_divide_contig_u16(char **args, npy_intp len)
{
@@ -586,7 +609,7 @@ vsx4_simd_divide_contig_u16(char **args, npy_intp len)
npyv_cleanup();
}
-#line 210
+#line 221
static inline void
vsx4_simd_divide_contig_u32(char **args, npy_intp len)
{
@@ -620,7 +643,7 @@ vsx4_simd_divide_contig_u32(char **args, npy_intp len)
npyv_cleanup();
}
-#line 210
+#line 221
static inline void
vsx4_simd_divide_contig_u64(char **args, npy_intp len)
{
@@ -655,7 +678,7 @@ vsx4_simd_divide_contig_u64(char **args, npy_intp len)
}
-#line 249
+#line 260
static inline void
vsx4_simd_divide_contig_s8(char **args, npy_intp len)
{
@@ -724,7 +747,7 @@ vsx4_simd_divide_contig_s8(char **args, npy_intp len)
npyv_cleanup();
}
-#line 249
+#line 260
static inline void
vsx4_simd_divide_contig_s16(char **args, npy_intp len)
{
@@ -793,7 +816,7 @@ vsx4_simd_divide_contig_s16(char **args, npy_intp len)
npyv_cleanup();
}
-#line 249
+#line 260
static inline void
vsx4_simd_divide_contig_s32(char **args, npy_intp len)
{
@@ -862,7 +885,7 @@ vsx4_simd_divide_contig_s32(char **args, npy_intp len)
npyv_cleanup();
}
-#line 249
+#line 260
static inline void
vsx4_simd_divide_contig_s64(char **args, npy_intp len)
{
@@ -938,28 +961,27 @@ vsx4_simd_divide_contig_s64(char **args, npy_intp len)
** Defining ufunc inner functions
********************************************************************************/
-#line 329
+#line 340
#undef TO_SIMD_SFX
#if 0
-#line 334
+#line 345
#elif NPY_BITSOF_BYTE == 8
#define TO_SIMD_SFX(X) X##_s8
-#line 334
+#line 345
#elif NPY_BITSOF_BYTE == 16
#define TO_SIMD_SFX(X) X##_s16
-#line 334
+#line 345
#elif NPY_BITSOF_BYTE == 32
#define TO_SIMD_SFX(X) X##_s32
-#line 334
+#line 345
#elif NPY_BITSOF_BYTE == 64
#define TO_SIMD_SFX(X) X##_s64
#endif
-
-#if NPY_BITSOF_BYTE == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_BYTE == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
@@ -1042,28 +1064,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_divide_indexed)
}
-#line 329
+#line 340
#undef TO_SIMD_SFX
#if 0
-#line 334
+#line 345
#elif NPY_BITSOF_SHORT == 8
#define TO_SIMD_SFX(X) X##_s8
-#line 334
+#line 345
#elif NPY_BITSOF_SHORT == 16
#define TO_SIMD_SFX(X) X##_s16
-#line 334
+#line 345
#elif NPY_BITSOF_SHORT == 32
#define TO_SIMD_SFX(X) X##_s32
-#line 334
+#line 345
#elif NPY_BITSOF_SHORT == 64
#define TO_SIMD_SFX(X) X##_s64
#endif
-
-#if NPY_BITSOF_SHORT == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_SHORT == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
@@ -1146,28 +1167,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_divide_indexed)
}
-#line 329
+#line 340
#undef TO_SIMD_SFX
#if 0
-#line 334
+#line 345
#elif NPY_BITSOF_INT == 8
#define TO_SIMD_SFX(X) X##_s8
-#line 334
+#line 345
#elif NPY_BITSOF_INT == 16
#define TO_SIMD_SFX(X) X##_s16
-#line 334
+#line 345
#elif NPY_BITSOF_INT == 32
#define TO_SIMD_SFX(X) X##_s32
-#line 334
+#line 345
#elif NPY_BITSOF_INT == 64
#define TO_SIMD_SFX(X) X##_s64
#endif
-
-#if NPY_BITSOF_INT == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_INT == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
@@ -1250,28 +1270,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_divide_indexed)
}
-#line 329
+#line 340
#undef TO_SIMD_SFX
#if 0
-#line 334
+#line 345
#elif NPY_BITSOF_LONG == 8
#define TO_SIMD_SFX(X) X##_s8
-#line 334
+#line 345
#elif NPY_BITSOF_LONG == 16
#define TO_SIMD_SFX(X) X##_s16
-#line 334
+#line 345
#elif NPY_BITSOF_LONG == 32
#define TO_SIMD_SFX(X) X##_s32
-#line 334
+#line 345
#elif NPY_BITSOF_LONG == 64
#define TO_SIMD_SFX(X) X##_s64
#endif
-
-#if NPY_BITSOF_LONG == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_LONG == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
@@ -1354,28 +1373,27 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_divide_indexed)
}
-#line 329
+#line 340
#undef TO_SIMD_SFX
#if 0
-#line 334
+#line 345
#elif NPY_BITSOF_LONGLONG == 8
#define TO_SIMD_SFX(X) X##_s8
-#line 334
+#line 345
#elif NPY_BITSOF_LONGLONG == 16
#define TO_SIMD_SFX(X) X##_s16
-#line 334
+#line 345
#elif NPY_BITSOF_LONGLONG == 32
#define TO_SIMD_SFX(X) X##_s32
-#line 334
+#line 345
#elif NPY_BITSOF_LONGLONG == 64
#define TO_SIMD_SFX(X) X##_s64
#endif
-
-#if NPY_BITSOF_LONGLONG == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_LONGLONG == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
@@ -1459,22 +1477,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_divide_indexed)
-#line 429
+#line 439
#undef TO_SIMD_SFX
#if 0
-#line 434
+#line 444
#elif NPY_BITSOF_BYTE == 8
#define TO_SIMD_SFX(X) X##_u8
-#line 434
+#line 444
#elif NPY_BITSOF_BYTE == 16
#define TO_SIMD_SFX(X) X##_u16
-#line 434
+#line 444
#elif NPY_BITSOF_BYTE == 32
#define TO_SIMD_SFX(X) X##_u32
-#line 434
+#line 444
#elif NPY_BITSOF_BYTE == 64
#define TO_SIMD_SFX(X) X##_u64
@@ -1560,22 +1578,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_divide_indexed)
}
-#line 429
+#line 439
#undef TO_SIMD_SFX
#if 0
-#line 434
+#line 444
#elif NPY_BITSOF_SHORT == 8
#define TO_SIMD_SFX(X) X##_u8
-#line 434
+#line 444
#elif NPY_BITSOF_SHORT == 16
#define TO_SIMD_SFX(X) X##_u16
-#line 434
+#line 444
#elif NPY_BITSOF_SHORT == 32
#define TO_SIMD_SFX(X) X##_u32
-#line 434
+#line 444
#elif NPY_BITSOF_SHORT == 64
#define TO_SIMD_SFX(X) X##_u64
@@ -1661,22 +1679,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_divide_indexed)
}
-#line 429
+#line 439
#undef TO_SIMD_SFX
#if 0
-#line 434
+#line 444
#elif NPY_BITSOF_INT == 8
#define TO_SIMD_SFX(X) X##_u8
-#line 434
+#line 444
#elif NPY_BITSOF_INT == 16
#define TO_SIMD_SFX(X) X##_u16
-#line 434
+#line 444
#elif NPY_BITSOF_INT == 32
#define TO_SIMD_SFX(X) X##_u32
-#line 434
+#line 444
#elif NPY_BITSOF_INT == 64
#define TO_SIMD_SFX(X) X##_u64
@@ -1762,22 +1780,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_divide_indexed)
}
-#line 429
+#line 439
#undef TO_SIMD_SFX
#if 0
-#line 434
+#line 444
#elif NPY_BITSOF_LONG == 8
#define TO_SIMD_SFX(X) X##_u8
-#line 434
+#line 444
#elif NPY_BITSOF_LONG == 16
#define TO_SIMD_SFX(X) X##_u16
-#line 434
+#line 444
#elif NPY_BITSOF_LONG == 32
#define TO_SIMD_SFX(X) X##_u32
-#line 434
+#line 444
#elif NPY_BITSOF_LONG == 64
#define TO_SIMD_SFX(X) X##_u64
@@ -1863,22 +1881,22 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_divide_indexed)
}
-#line 429
+#line 439
#undef TO_SIMD_SFX
#if 0
-#line 434
+#line 444
#elif NPY_BITSOF_LONGLONG == 8
#define TO_SIMD_SFX(X) X##_u8
-#line 434
+#line 444
#elif NPY_BITSOF_LONGLONG == 16
#define TO_SIMD_SFX(X) X##_u16
-#line 434
+#line 444
#elif NPY_BITSOF_LONGLONG == 32
#define TO_SIMD_SFX(X) X##_u32
-#line 434
+#line 444
#elif NPY_BITSOF_LONGLONG == 64
#define TO_SIMD_SFX(X) X##_u64
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index e07bb79808..d056046e05 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -36,12 +36,20 @@
* q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign);
********************************************************************************/
+#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON)
+ // Due to integer 128-bit multiplication emulation, SIMD 64-bit division
+ // may not perform well on both neon and up to VSX3 compared to scalar
+ // division.
+ #define SIMD_DISABLE_DIV64_OPT
+#endif
+
#if NPY_SIMD
/**begin repeat
* Signed types
* #sfx = s8, s16, s32, s64#
* #len = 8, 16, 32, 64#
*/
+#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
@@ -101,6 +109,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
/**end repeat**/
/**begin repeat
@@ -108,6 +117,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
* #sfx = u8, u16, u32, u64#
* #len = 8, 16, 32, 64#
*/
+#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
static inline void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
@@ -129,6 +139,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
}
npyv_cleanup();
}
+#endif
/**end repeat**/
#if defined(NPY_HAVE_VSX4)
@@ -335,8 +346,7 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
#define TO_SIMD_SFX(X) X##_s@len@
/**end repeat1**/
#endif
-
-#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_@TYPE@ == 64 && defined(SIMD_DISABLE_DIV64_OPT)
#undef TO_SIMD_SFX
#endif
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c
index 5e9827a14c..8f446c3a8d 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c
@@ -134,18 +134,6 @@ fma_blend(__m256 x, __m256 y, __m256 ymask)
}
NPY_FINLINE __m256
-fma_invert_mask_ps(__m256 ymask)
-{
- return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
-}
-
-NPY_FINLINE __m256i
-fma_invert_mask_pd(__m256i ymask)
-{
- return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
-}
-
-NPY_FINLINE __m256
fma_get_exponent(__m256 x)
{
/*
@@ -321,18 +309,6 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
return _mm512_mask_mov_ps(x, ymask, y);
}
-NPY_FINLINE __mmask16
-avx512_invert_mask_ps(__mmask16 ymask)
-{
- return _mm512_knot(ymask);
-}
-
-NPY_FINLINE __mmask8
-avx512_invert_mask_pd(__mmask8 ymask)
-{
- return _mm512_knot(ymask);
-}
-
NPY_FINLINE __m512
avx512_get_exponent(__m512 x)
{
@@ -384,7 +360,7 @@ avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
/********************************************************************************
** Defining the SIMD kernels
********************************************************************************/
-#line 396
+#line 372
#ifdef SIMD_AVX2_FMA3
/*
* Vectorized Cody-Waite range reduction technique
@@ -683,7 +659,7 @@ simd_log_FLOAT(npy_float * op,
}
#endif // SIMD_AVX2_FMA3
-#line 396
+#line 372
#ifdef SIMD_AVX512F
/*
* Vectorized Cody-Waite range reduction technique
@@ -984,7 +960,7 @@ simd_log_FLOAT(npy_float * op,
#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
-#line 700
+#line 676
static void
simd_exp_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
@@ -1015,7 +991,7 @@ simd_exp_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
npyv_cleanup();
}
-#line 700
+#line 676
static void
simd_log_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
@@ -1298,49 +1274,49 @@ AVX512F_log_DOUBLE(npy_double * op,
__m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);
/* Load lookup table data */
- #line 985
+ #line 961
__m512d mLUT_TOP_0 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*0]));
__m512d mLUT_TAIL_0 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*0]));
-#line 985
+#line 961
__m512d mLUT_TOP_1 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*1]));
__m512d mLUT_TAIL_1 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*1]));
-#line 985
+#line 961
__m512d mLUT_TOP_2 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*2]));
__m512d mLUT_TAIL_2 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*2]));
-#line 985
+#line 961
__m512d mLUT_TOP_3 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*3]));
__m512d mLUT_TAIL_3 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*3]));
-#line 985
+#line 961
__m512d mLUT_TOP_4 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*4]));
__m512d mLUT_TAIL_4 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*4]));
-#line 985
+#line 961
__m512d mLUT_TOP_5 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*5]));
__m512d mLUT_TAIL_5 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*5]));
-#line 985
+#line 961
__m512d mLUT_TOP_6 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*6]));
__m512d mLUT_TAIL_6 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*6]));
-#line 985
+#line 961
__m512d mLUT_TOP_7 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*7]));
__m512d mLUT_TAIL_7 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*7]));
@@ -1487,7 +1463,7 @@ AVX512F_log_DOUBLE(npy_double * op,
#endif // NPY_CAN_LINK_SVML
#ifdef SIMD_AVX512_SKX
-#line 1149
+#line 1125
static inline void
AVX512_SKX_ldexp_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
@@ -1634,7 +1610,7 @@ AVX512_SKX_frexp_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *
}
}
-#line 1149
+#line 1125
static inline void
AVX512_SKX_ldexp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
@@ -1787,7 +1763,7 @@ AVX512_SKX_frexp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const
/********************************************************************************
** Defining ufunc inner functions
********************************************************************************/
-#line 1305
+#line 1281
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_exp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -1816,7 +1792,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_exp)
#endif
}
-#line 1305
+#line 1281
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -1846,7 +1822,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log)
}
-#line 1338
+#line 1314
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_exp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -1879,7 +1855,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_exp)
}
-#line 1338
+#line 1314
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -1913,7 +1889,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log)
-#line 1378
+#line 1354
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_frexp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -1945,7 +1921,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_ldexp)
}
}
-#line 1378
+#line 1354
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_frexp)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
index 1fac3c150c..85dac9c20d 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
@@ -124,18 +124,6 @@ fma_blend(__m256 x, __m256 y, __m256 ymask)
}
NPY_FINLINE __m256
-fma_invert_mask_ps(__m256 ymask)
-{
- return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
-}
-
-NPY_FINLINE __m256i
-fma_invert_mask_pd(__m256i ymask)
-{
- return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
-}
-
-NPY_FINLINE __m256
fma_get_exponent(__m256 x)
{
/*
@@ -311,18 +299,6 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
return _mm512_mask_mov_ps(x, ymask, y);
}
-NPY_FINLINE __mmask16
-avx512_invert_mask_ps(__mmask16 ymask)
-{
- return _mm512_knot(ymask);
-}
-
-NPY_FINLINE __mmask8
-avx512_invert_mask_pd(__mmask8 ymask)
-{
- return _mm512_knot(ymask);
-}
-
NPY_FINLINE __m512
avx512_get_exponent(__m512 x)
{
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c
index ad8c1ef397..97a78b0e12 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c
@@ -320,7 +320,8 @@ simd_binary_ccc_max_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
const npyv_lanetype_s8 *ip2, npy_intp sip2,
@@ -483,7 +484,8 @@ simd_binary_ccc_min_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
const npyv_lanetype_s8 *ip2, npy_intp sip2,
@@ -646,7 +648,8 @@ simd_binary_ccc_maxp_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
const npyv_lanetype_s8 *ip2, npy_intp sip2,
@@ -809,7 +812,8 @@ simd_binary_ccc_minp_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
const npyv_lanetype_s8 *ip2, npy_intp sip2,
@@ -974,7 +978,8 @@ simd_binary_ccc_max_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
const npyv_lanetype_u8 *ip2, npy_intp sip2,
@@ -1137,7 +1142,8 @@ simd_binary_ccc_min_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
const npyv_lanetype_u8 *ip2, npy_intp sip2,
@@ -1300,7 +1306,8 @@ simd_binary_ccc_maxp_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
const npyv_lanetype_u8 *ip2, npy_intp sip2,
@@ -1463,7 +1470,8 @@ simd_binary_ccc_minp_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
const npyv_lanetype_u8 *ip2, npy_intp sip2,
@@ -1628,7 +1636,8 @@ simd_binary_ccc_max_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
const npyv_lanetype_s16 *ip2, npy_intp sip2,
@@ -1791,7 +1800,8 @@ simd_binary_ccc_min_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
const npyv_lanetype_s16 *ip2, npy_intp sip2,
@@ -1954,7 +1964,8 @@ simd_binary_ccc_maxp_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
const npyv_lanetype_s16 *ip2, npy_intp sip2,
@@ -2117,7 +2128,8 @@ simd_binary_ccc_minp_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
const npyv_lanetype_s16 *ip2, npy_intp sip2,
@@ -2282,7 +2294,8 @@ simd_binary_ccc_max_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
const npyv_lanetype_u16 *ip2, npy_intp sip2,
@@ -2445,7 +2458,8 @@ simd_binary_ccc_min_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
const npyv_lanetype_u16 *ip2, npy_intp sip2,
@@ -2608,7 +2622,8 @@ simd_binary_ccc_maxp_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
const npyv_lanetype_u16 *ip2, npy_intp sip2,
@@ -2771,7 +2786,8 @@ simd_binary_ccc_minp_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
const npyv_lanetype_u16 *ip2, npy_intp sip2,
@@ -2936,7 +2952,8 @@ simd_binary_ccc_max_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
const npyv_lanetype_s32 *ip2, npy_intp sip2,
@@ -3099,7 +3116,8 @@ simd_binary_ccc_min_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
const npyv_lanetype_s32 *ip2, npy_intp sip2,
@@ -3262,7 +3280,8 @@ simd_binary_ccc_maxp_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
const npyv_lanetype_s32 *ip2, npy_intp sip2,
@@ -3425,7 +3444,8 @@ simd_binary_ccc_minp_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
const npyv_lanetype_s32 *ip2, npy_intp sip2,
@@ -3590,7 +3610,8 @@ simd_binary_ccc_max_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
const npyv_lanetype_u32 *ip2, npy_intp sip2,
@@ -3753,7 +3774,8 @@ simd_binary_ccc_min_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
const npyv_lanetype_u32 *ip2, npy_intp sip2,
@@ -3916,7 +3938,8 @@ simd_binary_ccc_maxp_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
const npyv_lanetype_u32 *ip2, npy_intp sip2,
@@ -4079,7 +4102,8 @@ simd_binary_ccc_minp_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
const npyv_lanetype_u32 *ip2, npy_intp sip2,
@@ -4244,7 +4268,8 @@ simd_binary_ccc_max_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
const npyv_lanetype_s64 *ip2, npy_intp sip2,
@@ -4407,7 +4432,8 @@ simd_binary_ccc_min_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
const npyv_lanetype_s64 *ip2, npy_intp sip2,
@@ -4570,7 +4596,8 @@ simd_binary_ccc_maxp_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
const npyv_lanetype_s64 *ip2, npy_intp sip2,
@@ -4733,7 +4760,8 @@ simd_binary_ccc_minp_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
const npyv_lanetype_s64 *ip2, npy_intp sip2,
@@ -4898,7 +4926,8 @@ simd_binary_ccc_max_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
const npyv_lanetype_u64 *ip2, npy_intp sip2,
@@ -5061,7 +5090,8 @@ simd_binary_ccc_min_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
const npyv_lanetype_u64 *ip2, npy_intp sip2,
@@ -5224,7 +5254,8 @@ simd_binary_ccc_maxp_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
const npyv_lanetype_u64 *ip2, npy_intp sip2,
@@ -5387,7 +5418,8 @@ simd_binary_ccc_minp_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 0
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
const npyv_lanetype_u64 *ip2, npy_intp sip2,
@@ -5552,7 +5584,8 @@ simd_binary_ccc_max_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
const npyv_lanetype_f32 *ip2, npy_intp sip2,
@@ -5715,7 +5748,8 @@ simd_binary_ccc_min_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
const npyv_lanetype_f32 *ip2, npy_intp sip2,
@@ -5878,7 +5912,8 @@ simd_binary_ccc_maxp_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
const npyv_lanetype_f32 *ip2, npy_intp sip2,
@@ -6041,7 +6076,8 @@ simd_binary_ccc_minp_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
const npyv_lanetype_f32 *ip2, npy_intp sip2,
@@ -6206,7 +6242,8 @@ simd_binary_ccc_max_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_max_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
const npyv_lanetype_f64 *ip2, npy_intp sip2,
@@ -6369,7 +6406,8 @@ simd_binary_ccc_min_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *i
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_min_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
const npyv_lanetype_f64 *ip2, npy_intp sip2,
@@ -6532,7 +6570,8 @@ simd_binary_ccc_maxp_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_maxp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
const npyv_lanetype_f64 *ip2, npy_intp sip2,
@@ -6695,7 +6734,8 @@ simd_binary_ccc_minp_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *
}
}
// non-contiguous for float 32/64-bit memory access
-#if 1
+#if 1 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
const npyv_lanetype_f64 *ip2, npy_intp sip2,
@@ -6744,10 +6784,10 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
/*******************************************************************************
** Defining ufunc inner functions
******************************************************************************/
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -6763,7 +6803,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -6779,7 +6819,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -6795,7 +6835,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -6813,7 +6853,7 @@ simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -6921,22 +6961,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
*((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
*((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
*((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
*((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -6988,7 +7028,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -7096,22 +7136,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
*((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
*((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
*((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
*((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -7163,7 +7203,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -7271,22 +7311,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
*((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
*((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
*((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
*((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -7338,7 +7378,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -7446,22 +7486,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
*((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
*((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
*((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
*((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -7514,10 +7554,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -7533,7 +7573,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -7549,7 +7589,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -7565,7 +7605,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -7583,7 +7623,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -7691,22 +7731,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
*((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
*((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
*((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
*((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -7758,7 +7798,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -7866,22 +7906,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
*((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
*((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
*((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
*((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -7933,7 +7973,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -8041,22 +8081,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
*((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
*((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
*((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
*((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -8108,7 +8148,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -8216,22 +8256,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
*((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
*((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
*((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
*((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -8284,10 +8324,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -8303,7 +8343,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -8319,7 +8359,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -8335,7 +8375,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -8353,7 +8393,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -8461,22 +8501,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
*((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
*((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
*((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
*((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -8528,7 +8568,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -8636,22 +8676,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
*((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
*((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
*((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
*((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -8703,7 +8743,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -8811,22 +8851,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
*((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
*((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
*((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
*((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -8878,7 +8918,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -8986,22 +9026,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
*((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
*((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
*((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
*((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -9054,10 +9094,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -9073,7 +9113,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -9089,7 +9129,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -9105,7 +9145,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -9123,7 +9163,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -9231,22 +9271,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
*((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
*((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
*((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
*((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -9298,7 +9338,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -9406,22 +9446,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
*((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
*((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
*((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
*((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -9473,7 +9513,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -9581,22 +9621,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
*((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
*((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
*((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
*((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -9648,7 +9688,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -9756,22 +9796,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
*((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
*((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
*((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
*((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -9824,10 +9864,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -9843,7 +9883,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -9859,7 +9899,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -9875,7 +9915,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -9893,7 +9933,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -10001,22 +10041,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
*((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
*((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
*((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
*((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -10068,7 +10108,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -10176,22 +10216,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
*((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
*((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
*((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
*((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -10243,7 +10283,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -10351,22 +10391,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
*((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
*((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
*((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
*((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -10418,7 +10458,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -10526,22 +10566,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
*((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
*((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
*((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
*((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -10594,10 +10634,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -10613,7 +10653,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -10629,7 +10669,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -10645,7 +10685,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -10663,7 +10703,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -10771,22 +10811,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
*((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
*((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
*((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
*((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -10838,7 +10878,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -10946,22 +10986,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
*((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
*((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
*((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
*((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11013,7 +11053,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -11121,22 +11161,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
*((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
*((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
*((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
*((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11188,7 +11228,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -11296,22 +11336,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
*((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
*((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
*((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
*((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11364,10 +11404,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -11383,7 +11423,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -11399,7 +11439,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -11415,7 +11455,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -11433,7 +11473,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -11541,22 +11581,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
*((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
*((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
*((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
*((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11608,7 +11648,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -11716,22 +11756,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
*((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
*((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
*((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
*((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11783,7 +11823,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -11891,22 +11931,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
*((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
*((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
*((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
*((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -11958,7 +11998,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -12066,22 +12106,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
*((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
*((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
*((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
*((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -12134,10 +12174,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -12153,7 +12193,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -12169,7 +12209,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -12185,7 +12225,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_INT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -12203,7 +12243,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -12311,22 +12351,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
*((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
*((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
*((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
*((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -12378,7 +12418,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -12486,22 +12526,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
*((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
*((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
*((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
*((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -12553,7 +12593,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -12661,22 +12701,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
*((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
*((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
*((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
*((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -12728,7 +12768,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -12836,22 +12876,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
*((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
*((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
*((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
*((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -12904,10 +12944,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -12923,7 +12963,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -12939,7 +12979,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -12955,7 +12995,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -12973,7 +13013,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -13081,22 +13121,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
*((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
*((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
*((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
*((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -13148,7 +13188,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -13256,22 +13296,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
*((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
*((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
*((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
*((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -13323,7 +13363,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -13431,22 +13471,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
*((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
*((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
*((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
*((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -13498,7 +13538,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -13606,22 +13646,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
*((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
*((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
*((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
*((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -13674,10 +13714,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -13693,7 +13733,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -13709,7 +13749,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -13725,7 +13765,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -13743,7 +13783,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_max_i
@@ -13851,22 +13891,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
*((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
*((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
*((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
*((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -13918,7 +13958,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (0 && 0)
#define SCALAR_OP scalar_min_i
@@ -14026,22 +14066,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
*((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
*((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
*((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
*((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -14093,7 +14133,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_maxp_i
@@ -14201,22 +14241,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
*((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
*((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
*((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
*((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -14268,7 +14308,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (0 && 1)
#define SCALAR_OP scalar_minp_i
@@ -14376,22 +14416,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
*((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
*((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
*((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
*((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -14444,10 +14484,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -14463,7 +14503,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -14479,7 +14519,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -14495,7 +14535,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -14513,7 +14553,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_max_f
@@ -14621,22 +14661,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
*((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
*((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
*((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
*((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -14688,7 +14728,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_min_f
@@ -14796,22 +14836,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
*((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
*((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
*((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
*((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -14863,7 +14903,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_maxp_f
@@ -14971,22 +15011,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
*((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
*((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
*((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
*((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15038,7 +15078,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_minp_f
@@ -15146,22 +15186,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
*((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
*((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
*((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
*((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15214,10 +15254,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -15233,7 +15273,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -15249,7 +15289,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -15265,7 +15305,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -15283,7 +15323,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_max_d
@@ -15391,22 +15431,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
*((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
*((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
*((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
*((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15458,7 +15498,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_min_d
@@ -15566,22 +15606,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
*((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
*((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
*((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
*((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15633,7 +15673,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_maxp_d
@@ -15741,22 +15781,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
*((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
*((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
*((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
*((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15808,7 +15848,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_minp_d
@@ -15916,22 +15956,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
*((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
*((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
*((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
*((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -15984,10 +16024,10 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 293
+#line 294
#undef TO_SIMD_SFX
#if 0
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -16003,7 +16043,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -16019,7 +16059,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -16035,7 +16075,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 298
+#line 299
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -16053,7 +16093,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
#endif
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_max_l
@@ -16161,22 +16201,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_maximum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
*((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
*((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
*((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
*((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -16228,7 +16268,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_maximum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !0 || (1 && 0)
#define SCALAR_OP scalar_min_l
@@ -16336,22 +16376,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_minimum)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
*((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
*((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
*((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
*((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -16403,7 +16443,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_minimum_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_maxp_l
@@ -16511,22 +16551,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmax)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
*((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
*((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
*((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
*((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
@@ -16578,7 +16618,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmax_indexed)
#endif // !fp_only || (is_fp && fp_only)
-#line 320
+#line 321
#if !1 || (1 && 1)
#define SCALAR_OP scalar_minp_l
@@ -16686,22 +16726,22 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmin)
* result of iteration 1.
*/
- #line 430
+ #line 431
npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
*((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
-#line 430
+#line 431
npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
*((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
-#line 430
+#line 431
npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
*((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
-#line 430
+#line 431
npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
*((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src
index 236e2e2eb7..319072c01f 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -225,7 +225,8 @@ simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanety
}
}
// non-contiguous for float 32/64-bit memory access
-#if @is_fp@
+#if @is_fp@ && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
static inline void
simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1,
const npyv_lanetype_@sfx@ *ip2, npy_intp sip2,
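The `!defined(NPY_HAVE_NEON)` guard above compiles the non-contiguous float kernel out on Arm, where strided vector loads and stores lose to plain unrolled scalar code; with that kernel gone, the both-strided cases fall back to the 4x-unrolled scalar body visible in the generated hunks earlier in this diff. Below is a minimal plain-C sketch of that fallback, assuming byte pointers and byte strides as in the generated loops; the real scalar_max_f also propagates NaNs, which is omitted here.

#include <stddef.h>

/* plain max; NaN handling of the real scalar_max_f is intentionally left out */
static inline float scalar_max_f(float a, float b) { return a > b ? a : b; }

/* ip1/ip2/op1 are byte pointers and is1/is2/os1 byte strides, matching the
 * generated ufunc inner loops; four independent scalar operations per
 * iteration keep the pipeline busy without any strided vector load/store */
static void strided_maximum_unrolled(const char *ip1, ptrdiff_t is1,
                                     const char *ip2, ptrdiff_t is2,
                                     char *op1, ptrdiff_t os1, ptrdiff_t len)
{
    ptrdiff_t i = 0;
    for (; i + 4 <= len; i += 4) {
        float v0 = *(const float *)(ip1 + (i + 0) * is1);
        float u0 = *(const float *)(ip2 + (i + 0) * is2);
        *(float *)(op1 + (i + 0) * os1) = scalar_max_f(v0, u0);

        float v1 = *(const float *)(ip1 + (i + 1) * is1);
        float u1 = *(const float *)(ip2 + (i + 1) * is2);
        *(float *)(op1 + (i + 1) * os1) = scalar_max_f(v1, u1);

        float v2 = *(const float *)(ip1 + (i + 2) * is1);
        float u2 = *(const float *)(ip2 + (i + 2) * is2);
        *(float *)(op1 + (i + 2) * os1) = scalar_max_f(v2, u2);

        float v3 = *(const float *)(ip1 + (i + 3) * is1);
        float u3 = *(const float *)(ip2 + (i + 3) * is2);
        *(float *)(op1 + (i + 3) * os1) = scalar_max_f(v3, u3);
    }
    for (; i < len; ++i) {  /* scalar tail */
        float v = *(const float *)(ip1 + i * is1);
        float u = *(const float *)(ip2 + i * is2);
        *(float *)(op1 + i * os1) = scalar_max_f(v, u);
    }
}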
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c
index 9d9bc64a16..30ce938d66 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c
@@ -26,8 +26,8 @@
* when there's no native FUSED support instead of fallback to libc
*/
#if NPY_SIMD_FMA3 // native support
-#line 23
-#if NPY_SIMD_F64
+#line 24
+#if NPY_SIMD_F64 && 0
/*
* Vectorized Cody-Waite range reduction technique
* Performs the reduction step x* = x - y*C in three steps:
@@ -46,8 +46,8 @@ simd_range_reduction_f64(npyv_f64 x, npyv_f64 y, npyv_f64 c1, npyv_f64 c2, npyv_
}
#endif
-#line 23
-#if NPY_SIMD_F32
+#line 24
+#if NPY_SIMD_F32 && 1
/*
* Vectorized Cody-Waite range reduction technique
* Performs the reduction step x* = x - y*C in three steps:
@@ -66,9 +66,11 @@ simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_
}
#endif
-
-#if NPY_SIMD_F64
-#line 47
+/* Disable SIMD code and revert to libm: see
+ * https://mail.python.org/archives/list/numpy-discussion@python.org/thread/C6EYZZSR4EWGVKHAZXLE7IBILRMNVK7L/
+ * for detailed discussion on this*/
+#if 0 // NPY_SIMD_F64
+#line 50
#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
NPY_FINLINE npyv_f64
#else
@@ -90,7 +92,7 @@ simd_cos_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits)
return npyv_loada_f64(out_copy);
}
-#line 47
+#line 50
#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
NPY_FINLINE npyv_f64
#else
@@ -208,7 +210,7 @@ simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd));
}
-#line 167
+#line 170
NPY_FINLINE void
simd_cos_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
{
@@ -254,7 +256,7 @@ simd_cos_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_i
npyv_cleanup();
}
-#line 167
+#line 170
NPY_FINLINE void
simd_sin_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
{
@@ -473,7 +475,7 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
#endif // NPY_SIMD_FP32
#endif // NYP_SIMD_FMA3
-#line 388
+#line 391
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_cos)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -507,7 +509,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_cos)
#endif
}
-#line 388
+#line 391
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -542,7 +544,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sin)
}
-#line 426
+#line 429
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -572,7 +574,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin)
#endif
}
-#line 426
+#line 429
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_cos)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
index f07cb70f39..31de906098 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -19,8 +19,9 @@
/**begin repeat
* #check = F64, F32#
* #sfx = f64, f32#
+ * #enable = 0, 1#
*/
-#if NPY_SIMD_@check@
+#if NPY_SIMD_@check@ && @enable@
/*
* Vectorized Cody-Waite range reduction technique
* Performs the reduction step x* = x - y*C in three steps:
@@ -39,8 +40,10 @@ simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@
}
#endif
/**end repeat**/
-
-#if NPY_SIMD_F64
+/* Disable SIMD code and revert to libm: see
+ * https://mail.python.org/archives/list/numpy-discussion@python.org/thread/C6EYZZSR4EWGVKHAZXLE7IBILRMNVK7L/
+ * for detailed discussion on this*/
+#if 0 // NPY_SIMD_F64
/**begin repeat
* #op = cos, sin#
*/
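Forcing the f64 branch to `#if 0` removes the vectorized Cody-Waite sin/cos for doubles, so DOUBLE_sin and DOUBLE_cos take the plain libm branch of their dispatch functions, trading throughput for the accuracy concerns raised in the linked numpy-discussion thread. A minimal sketch of what that fallback amounts to is below; the function name is illustrative and ptrdiff_t stands in for npy_intp.

#include <math.h>
#include <stddef.h>

/* illustrative stand-in for the libm branch of DOUBLE_sin: walk the byte
 * strides of the ufunc arguments and call libm's sin() per element */
void double_sin_libm(char **args, const ptrdiff_t *dimensions, const ptrdiff_t *steps)
{
    const char *ip = args[0];
    char *op = args[1];
    const ptrdiff_t n = dimensions[0];
    const ptrdiff_t istep = steps[0], ostep = steps[1];   /* byte strides */

    for (ptrdiff_t i = 0; i < n; ++i, ip += istep, op += ostep) {
        *(double *)op = sin(*(const double *)ip);
    }
}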
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c
index 3ea2747d9e..b2d3b0976a 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c
@@ -604,6 +604,8 @@ simd_unary_nc_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
npyv_lanetype_s8 *op, npy_intp ostride,
@@ -614,112 +616,112 @@ simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_s8 v_0 = npyv_loadn_s8(ip + 0 * vstep * istride, istride);
npyv_s8 r_0 = npyv_negative_s8(v_0);
npyv_storen_s8(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_s8 v_1 = npyv_loadn_s8(ip + 1 * vstep * istride, istride);
npyv_s8 r_1 = npyv_negative_s8(v_1);
npyv_storen_s8(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_s8 v_2 = npyv_loadn_s8(ip + 2 * vstep * istride, istride);
npyv_s8 r_2 = npyv_negative_s8(v_2);
npyv_storen_s8(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_s8 v_3 = npyv_loadn_s8(ip + 3 * vstep * istride, istride);
npyv_s8 r_3 = npyv_negative_s8(v_3);
npyv_storen_s8(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_s8 v_4 = npyv_loadn_s8(ip + 4 * vstep * istride, istride);
npyv_s8 r_4 = npyv_negative_s8(v_4);
npyv_storen_s8(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_s8 v_5 = npyv_loadn_s8(ip + 5 * vstep * istride, istride);
npyv_s8 r_5 = npyv_negative_s8(v_5);
npyv_storen_s8(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_s8 v_6 = npyv_loadn_s8(ip + 6 * vstep * istride, istride);
npyv_s8 r_6 = npyv_negative_s8(v_6);
npyv_storen_s8(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_s8 v_7 = npyv_loadn_s8(ip + 7 * vstep * istride, istride);
npyv_s8 r_7 = npyv_negative_s8(v_7);
npyv_storen_s8(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_s8 v_8 = npyv_loadn_s8(ip + 8 * vstep * istride, istride);
npyv_s8 r_8 = npyv_negative_s8(v_8);
npyv_storen_s8(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_s8 v_9 = npyv_loadn_s8(ip + 9 * vstep * istride, istride);
npyv_s8 r_9 = npyv_negative_s8(v_9);
npyv_storen_s8(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_s8 v_10 = npyv_loadn_s8(ip + 10 * vstep * istride, istride);
npyv_s8 r_10 = npyv_negative_s8(v_10);
npyv_storen_s8(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_s8 v_11 = npyv_loadn_s8(ip + 11 * vstep * istride, istride);
npyv_s8 r_11 = npyv_negative_s8(v_11);
npyv_storen_s8(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_s8 v_12 = npyv_loadn_s8(ip + 12 * vstep * istride, istride);
npyv_s8 r_12 = npyv_negative_s8(v_12);
npyv_storen_s8(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_s8 v_13 = npyv_loadn_s8(ip + 13 * vstep * istride, istride);
npyv_s8 r_13 = npyv_negative_s8(v_13);
npyv_storen_s8(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_s8 v_14 = npyv_loadn_s8(ip + 14 * vstep * istride, istride);
npyv_s8 r_14 = npyv_negative_s8(v_14);
npyv_storen_s8(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_s8 v_15 = npyv_loadn_s8(ip + 15 * vstep * istride, istride);
npyv_s8 r_15 = npyv_negative_s8(v_15);
@@ -738,6 +740,7 @@ simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
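The new `#ifndef NPY_HAVE_SSE2` guard removes simd_unary_nn_negative_s8 (and its siblings further down) on baseline x86, where npyv_loadn/npyv_storen have no hardware gather/scatter behind them and effectively assemble lanes one element at a time. Below is a rough plain-C picture of one step of that kernel, assuming a hypothetical 4-lane integer vector; the helper names are illustrative, not the real universal-intrinsics API.

#include <stddef.h>

typedef struct { int lane[4]; } vec4i;

/* strided load: without a gather instruction this is one element at a time */
static vec4i loadn_4i(const int *p, ptrdiff_t stride)
{
    vec4i v;
    for (int k = 0; k < 4; ++k) {
        v.lane[k] = p[k * stride];
    }
    return v;
}

/* lane-wise negation, the cheap part */
static vec4i negative_4i(vec4i v)
{
    for (int k = 0; k < 4; ++k) {
        v.lane[k] = -v.lane[k];
    }
    return v;
}

/* strided store, again element by element when there is no scatter */
static void storen_4i(int *p, ptrdiff_t stride, vec4i v)
{
    for (int k = 0; k < 4; ++k) {
        p[k * stride] = v.lane[k];
    }
}

/* the non-contiguous kernel with the unrolling stripped away; when the loads
 * and stores degrade to per-lane moves, this gains nothing over scalar code */
void negative_nn_4i(const int *ip, ptrdiff_t istride,
                    int *op, ptrdiff_t ostride, ptrdiff_t len)
{
    ptrdiff_t i = 0;
    for (; i + 4 <= len; i += 4) {
        storen_4i(op + i * ostride, ostride,
                  negative_4i(loadn_4i(ip + i * istride, istride)));
    }
    for (; i < len; ++i) {          /* scalar tail */
        op[i * ostride] = -ip[i * istride];
    }
}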
@@ -1167,6 +1170,8 @@ simd_unary_nc_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,
npyv_lanetype_u8 *op, npy_intp ostride,
@@ -1177,112 +1182,112 @@ simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_u8 v_0 = npyv_loadn_u8(ip + 0 * vstep * istride, istride);
npyv_u8 r_0 = npyv_negative_u8(v_0);
npyv_storen_u8(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_u8 v_1 = npyv_loadn_u8(ip + 1 * vstep * istride, istride);
npyv_u8 r_1 = npyv_negative_u8(v_1);
npyv_storen_u8(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_u8 v_2 = npyv_loadn_u8(ip + 2 * vstep * istride, istride);
npyv_u8 r_2 = npyv_negative_u8(v_2);
npyv_storen_u8(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_u8 v_3 = npyv_loadn_u8(ip + 3 * vstep * istride, istride);
npyv_u8 r_3 = npyv_negative_u8(v_3);
npyv_storen_u8(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_u8 v_4 = npyv_loadn_u8(ip + 4 * vstep * istride, istride);
npyv_u8 r_4 = npyv_negative_u8(v_4);
npyv_storen_u8(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_u8 v_5 = npyv_loadn_u8(ip + 5 * vstep * istride, istride);
npyv_u8 r_5 = npyv_negative_u8(v_5);
npyv_storen_u8(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_u8 v_6 = npyv_loadn_u8(ip + 6 * vstep * istride, istride);
npyv_u8 r_6 = npyv_negative_u8(v_6);
npyv_storen_u8(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_u8 v_7 = npyv_loadn_u8(ip + 7 * vstep * istride, istride);
npyv_u8 r_7 = npyv_negative_u8(v_7);
npyv_storen_u8(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_u8 v_8 = npyv_loadn_u8(ip + 8 * vstep * istride, istride);
npyv_u8 r_8 = npyv_negative_u8(v_8);
npyv_storen_u8(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_u8 v_9 = npyv_loadn_u8(ip + 9 * vstep * istride, istride);
npyv_u8 r_9 = npyv_negative_u8(v_9);
npyv_storen_u8(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_u8 v_10 = npyv_loadn_u8(ip + 10 * vstep * istride, istride);
npyv_u8 r_10 = npyv_negative_u8(v_10);
npyv_storen_u8(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_u8 v_11 = npyv_loadn_u8(ip + 11 * vstep * istride, istride);
npyv_u8 r_11 = npyv_negative_u8(v_11);
npyv_storen_u8(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_u8 v_12 = npyv_loadn_u8(ip + 12 * vstep * istride, istride);
npyv_u8 r_12 = npyv_negative_u8(v_12);
npyv_storen_u8(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_u8 v_13 = npyv_loadn_u8(ip + 13 * vstep * istride, istride);
npyv_u8 r_13 = npyv_negative_u8(v_13);
npyv_storen_u8(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_u8 v_14 = npyv_loadn_u8(ip + 14 * vstep * istride, istride);
npyv_u8 r_14 = npyv_negative_u8(v_14);
npyv_storen_u8(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_u8 v_15 = npyv_loadn_u8(ip + 15 * vstep * istride, istride);
npyv_u8 r_15 = npyv_negative_u8(v_15);
@@ -1301,6 +1306,7 @@ simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
@@ -1730,6 +1736,8 @@ simd_unary_nc_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
npyv_lanetype_s16 *op, npy_intp ostride,
@@ -1740,112 +1748,112 @@ simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_s16 v_0 = npyv_loadn_s16(ip + 0 * vstep * istride, istride);
npyv_s16 r_0 = npyv_negative_s16(v_0);
npyv_storen_s16(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_s16 v_1 = npyv_loadn_s16(ip + 1 * vstep * istride, istride);
npyv_s16 r_1 = npyv_negative_s16(v_1);
npyv_storen_s16(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_s16 v_2 = npyv_loadn_s16(ip + 2 * vstep * istride, istride);
npyv_s16 r_2 = npyv_negative_s16(v_2);
npyv_storen_s16(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_s16 v_3 = npyv_loadn_s16(ip + 3 * vstep * istride, istride);
npyv_s16 r_3 = npyv_negative_s16(v_3);
npyv_storen_s16(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_s16 v_4 = npyv_loadn_s16(ip + 4 * vstep * istride, istride);
npyv_s16 r_4 = npyv_negative_s16(v_4);
npyv_storen_s16(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_s16 v_5 = npyv_loadn_s16(ip + 5 * vstep * istride, istride);
npyv_s16 r_5 = npyv_negative_s16(v_5);
npyv_storen_s16(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_s16 v_6 = npyv_loadn_s16(ip + 6 * vstep * istride, istride);
npyv_s16 r_6 = npyv_negative_s16(v_6);
npyv_storen_s16(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_s16 v_7 = npyv_loadn_s16(ip + 7 * vstep * istride, istride);
npyv_s16 r_7 = npyv_negative_s16(v_7);
npyv_storen_s16(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_s16 v_8 = npyv_loadn_s16(ip + 8 * vstep * istride, istride);
npyv_s16 r_8 = npyv_negative_s16(v_8);
npyv_storen_s16(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_s16 v_9 = npyv_loadn_s16(ip + 9 * vstep * istride, istride);
npyv_s16 r_9 = npyv_negative_s16(v_9);
npyv_storen_s16(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_s16 v_10 = npyv_loadn_s16(ip + 10 * vstep * istride, istride);
npyv_s16 r_10 = npyv_negative_s16(v_10);
npyv_storen_s16(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_s16 v_11 = npyv_loadn_s16(ip + 11 * vstep * istride, istride);
npyv_s16 r_11 = npyv_negative_s16(v_11);
npyv_storen_s16(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_s16 v_12 = npyv_loadn_s16(ip + 12 * vstep * istride, istride);
npyv_s16 r_12 = npyv_negative_s16(v_12);
npyv_storen_s16(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_s16 v_13 = npyv_loadn_s16(ip + 13 * vstep * istride, istride);
npyv_s16 r_13 = npyv_negative_s16(v_13);
npyv_storen_s16(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_s16 v_14 = npyv_loadn_s16(ip + 14 * vstep * istride, istride);
npyv_s16 r_14 = npyv_negative_s16(v_14);
npyv_storen_s16(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_s16 v_15 = npyv_loadn_s16(ip + 15 * vstep * istride, istride);
npyv_s16 r_15 = npyv_negative_s16(v_15);
@@ -1864,6 +1872,7 @@ simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
@@ -2293,6 +2302,8 @@ simd_unary_nc_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
npyv_lanetype_u16 *op, npy_intp ostride,
@@ -2303,112 +2314,112 @@ simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_u16 v_0 = npyv_loadn_u16(ip + 0 * vstep * istride, istride);
npyv_u16 r_0 = npyv_negative_u16(v_0);
npyv_storen_u16(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_u16 v_1 = npyv_loadn_u16(ip + 1 * vstep * istride, istride);
npyv_u16 r_1 = npyv_negative_u16(v_1);
npyv_storen_u16(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_u16 v_2 = npyv_loadn_u16(ip + 2 * vstep * istride, istride);
npyv_u16 r_2 = npyv_negative_u16(v_2);
npyv_storen_u16(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_u16 v_3 = npyv_loadn_u16(ip + 3 * vstep * istride, istride);
npyv_u16 r_3 = npyv_negative_u16(v_3);
npyv_storen_u16(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_u16 v_4 = npyv_loadn_u16(ip + 4 * vstep * istride, istride);
npyv_u16 r_4 = npyv_negative_u16(v_4);
npyv_storen_u16(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_u16 v_5 = npyv_loadn_u16(ip + 5 * vstep * istride, istride);
npyv_u16 r_5 = npyv_negative_u16(v_5);
npyv_storen_u16(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_u16 v_6 = npyv_loadn_u16(ip + 6 * vstep * istride, istride);
npyv_u16 r_6 = npyv_negative_u16(v_6);
npyv_storen_u16(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_u16 v_7 = npyv_loadn_u16(ip + 7 * vstep * istride, istride);
npyv_u16 r_7 = npyv_negative_u16(v_7);
npyv_storen_u16(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_u16 v_8 = npyv_loadn_u16(ip + 8 * vstep * istride, istride);
npyv_u16 r_8 = npyv_negative_u16(v_8);
npyv_storen_u16(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_u16 v_9 = npyv_loadn_u16(ip + 9 * vstep * istride, istride);
npyv_u16 r_9 = npyv_negative_u16(v_9);
npyv_storen_u16(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_u16 v_10 = npyv_loadn_u16(ip + 10 * vstep * istride, istride);
npyv_u16 r_10 = npyv_negative_u16(v_10);
npyv_storen_u16(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_u16 v_11 = npyv_loadn_u16(ip + 11 * vstep * istride, istride);
npyv_u16 r_11 = npyv_negative_u16(v_11);
npyv_storen_u16(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_u16 v_12 = npyv_loadn_u16(ip + 12 * vstep * istride, istride);
npyv_u16 r_12 = npyv_negative_u16(v_12);
npyv_storen_u16(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_u16 v_13 = npyv_loadn_u16(ip + 13 * vstep * istride, istride);
npyv_u16 r_13 = npyv_negative_u16(v_13);
npyv_storen_u16(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_u16 v_14 = npyv_loadn_u16(ip + 14 * vstep * istride, istride);
npyv_u16 r_14 = npyv_negative_u16(v_14);
npyv_storen_u16(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_u16 v_15 = npyv_loadn_u16(ip + 15 * vstep * istride, istride);
npyv_u16 r_15 = npyv_negative_u16(v_15);
@@ -2427,6 +2438,7 @@ simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 0
#undef UNROLL
#endif // NPY_SIMD
@@ -2856,6 +2868,8 @@ simd_unary_nc_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride,
npyv_lanetype_s32 *op, npy_intp ostride,
@@ -2866,112 +2880,112 @@ simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_s32 v_0 = npyv_loadn_s32(ip + 0 * vstep * istride, istride);
npyv_s32 r_0 = npyv_negative_s32(v_0);
npyv_storen_s32(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_s32 v_1 = npyv_loadn_s32(ip + 1 * vstep * istride, istride);
npyv_s32 r_1 = npyv_negative_s32(v_1);
npyv_storen_s32(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_s32 v_2 = npyv_loadn_s32(ip + 2 * vstep * istride, istride);
npyv_s32 r_2 = npyv_negative_s32(v_2);
npyv_storen_s32(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_s32 v_3 = npyv_loadn_s32(ip + 3 * vstep * istride, istride);
npyv_s32 r_3 = npyv_negative_s32(v_3);
npyv_storen_s32(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_s32 v_4 = npyv_loadn_s32(ip + 4 * vstep * istride, istride);
npyv_s32 r_4 = npyv_negative_s32(v_4);
npyv_storen_s32(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_s32 v_5 = npyv_loadn_s32(ip + 5 * vstep * istride, istride);
npyv_s32 r_5 = npyv_negative_s32(v_5);
npyv_storen_s32(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_s32 v_6 = npyv_loadn_s32(ip + 6 * vstep * istride, istride);
npyv_s32 r_6 = npyv_negative_s32(v_6);
npyv_storen_s32(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_s32 v_7 = npyv_loadn_s32(ip + 7 * vstep * istride, istride);
npyv_s32 r_7 = npyv_negative_s32(v_7);
npyv_storen_s32(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_s32 v_8 = npyv_loadn_s32(ip + 8 * vstep * istride, istride);
npyv_s32 r_8 = npyv_negative_s32(v_8);
npyv_storen_s32(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_s32 v_9 = npyv_loadn_s32(ip + 9 * vstep * istride, istride);
npyv_s32 r_9 = npyv_negative_s32(v_9);
npyv_storen_s32(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_s32 v_10 = npyv_loadn_s32(ip + 10 * vstep * istride, istride);
npyv_s32 r_10 = npyv_negative_s32(v_10);
npyv_storen_s32(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_s32 v_11 = npyv_loadn_s32(ip + 11 * vstep * istride, istride);
npyv_s32 r_11 = npyv_negative_s32(v_11);
npyv_storen_s32(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_s32 v_12 = npyv_loadn_s32(ip + 12 * vstep * istride, istride);
npyv_s32 r_12 = npyv_negative_s32(v_12);
npyv_storen_s32(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_s32 v_13 = npyv_loadn_s32(ip + 13 * vstep * istride, istride);
npyv_s32 r_13 = npyv_negative_s32(v_13);
npyv_storen_s32(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_s32 v_14 = npyv_loadn_s32(ip + 14 * vstep * istride, istride);
npyv_s32 r_14 = npyv_negative_s32(v_14);
npyv_storen_s32(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_s32 v_15 = npyv_loadn_s32(ip + 15 * vstep * istride, istride);
npyv_s32 r_15 = npyv_negative_s32(v_15);
@@ -2990,6 +3004,7 @@ simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
@@ -3419,6 +3434,8 @@ simd_unary_nc_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride,
npyv_lanetype_u32 *op, npy_intp ostride,
@@ -3429,112 +3446,112 @@ simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_u32 v_0 = npyv_loadn_u32(ip + 0 * vstep * istride, istride);
npyv_u32 r_0 = npyv_negative_u32(v_0);
npyv_storen_u32(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_u32 v_1 = npyv_loadn_u32(ip + 1 * vstep * istride, istride);
npyv_u32 r_1 = npyv_negative_u32(v_1);
npyv_storen_u32(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_u32 v_2 = npyv_loadn_u32(ip + 2 * vstep * istride, istride);
npyv_u32 r_2 = npyv_negative_u32(v_2);
npyv_storen_u32(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_u32 v_3 = npyv_loadn_u32(ip + 3 * vstep * istride, istride);
npyv_u32 r_3 = npyv_negative_u32(v_3);
npyv_storen_u32(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_u32 v_4 = npyv_loadn_u32(ip + 4 * vstep * istride, istride);
npyv_u32 r_4 = npyv_negative_u32(v_4);
npyv_storen_u32(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_u32 v_5 = npyv_loadn_u32(ip + 5 * vstep * istride, istride);
npyv_u32 r_5 = npyv_negative_u32(v_5);
npyv_storen_u32(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_u32 v_6 = npyv_loadn_u32(ip + 6 * vstep * istride, istride);
npyv_u32 r_6 = npyv_negative_u32(v_6);
npyv_storen_u32(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_u32 v_7 = npyv_loadn_u32(ip + 7 * vstep * istride, istride);
npyv_u32 r_7 = npyv_negative_u32(v_7);
npyv_storen_u32(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_u32 v_8 = npyv_loadn_u32(ip + 8 * vstep * istride, istride);
npyv_u32 r_8 = npyv_negative_u32(v_8);
npyv_storen_u32(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_u32 v_9 = npyv_loadn_u32(ip + 9 * vstep * istride, istride);
npyv_u32 r_9 = npyv_negative_u32(v_9);
npyv_storen_u32(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_u32 v_10 = npyv_loadn_u32(ip + 10 * vstep * istride, istride);
npyv_u32 r_10 = npyv_negative_u32(v_10);
npyv_storen_u32(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_u32 v_11 = npyv_loadn_u32(ip + 11 * vstep * istride, istride);
npyv_u32 r_11 = npyv_negative_u32(v_11);
npyv_storen_u32(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_u32 v_12 = npyv_loadn_u32(ip + 12 * vstep * istride, istride);
npyv_u32 r_12 = npyv_negative_u32(v_12);
npyv_storen_u32(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_u32 v_13 = npyv_loadn_u32(ip + 13 * vstep * istride, istride);
npyv_u32 r_13 = npyv_negative_u32(v_13);
npyv_storen_u32(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_u32 v_14 = npyv_loadn_u32(ip + 14 * vstep * istride, istride);
npyv_u32 r_14 = npyv_negative_u32(v_14);
npyv_storen_u32(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_u32 v_15 = npyv_loadn_u32(ip + 15 * vstep * istride, istride);
npyv_u32 r_15 = npyv_negative_u32(v_15);
@@ -3553,6 +3570,7 @@ simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
@@ -3982,6 +4000,8 @@ simd_unary_nc_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
npyv_lanetype_s64 *op, npy_intp ostride,
@@ -3992,112 +4012,112 @@ simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_s64 v_0 = npyv_loadn_s64(ip + 0 * vstep * istride, istride);
npyv_s64 r_0 = npyv_negative_s64(v_0);
npyv_storen_s64(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_s64 v_1 = npyv_loadn_s64(ip + 1 * vstep * istride, istride);
npyv_s64 r_1 = npyv_negative_s64(v_1);
npyv_storen_s64(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_s64 v_2 = npyv_loadn_s64(ip + 2 * vstep * istride, istride);
npyv_s64 r_2 = npyv_negative_s64(v_2);
npyv_storen_s64(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_s64 v_3 = npyv_loadn_s64(ip + 3 * vstep * istride, istride);
npyv_s64 r_3 = npyv_negative_s64(v_3);
npyv_storen_s64(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_s64 v_4 = npyv_loadn_s64(ip + 4 * vstep * istride, istride);
npyv_s64 r_4 = npyv_negative_s64(v_4);
npyv_storen_s64(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_s64 v_5 = npyv_loadn_s64(ip + 5 * vstep * istride, istride);
npyv_s64 r_5 = npyv_negative_s64(v_5);
npyv_storen_s64(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_s64 v_6 = npyv_loadn_s64(ip + 6 * vstep * istride, istride);
npyv_s64 r_6 = npyv_negative_s64(v_6);
npyv_storen_s64(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_s64 v_7 = npyv_loadn_s64(ip + 7 * vstep * istride, istride);
npyv_s64 r_7 = npyv_negative_s64(v_7);
npyv_storen_s64(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_s64 v_8 = npyv_loadn_s64(ip + 8 * vstep * istride, istride);
npyv_s64 r_8 = npyv_negative_s64(v_8);
npyv_storen_s64(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_s64 v_9 = npyv_loadn_s64(ip + 9 * vstep * istride, istride);
npyv_s64 r_9 = npyv_negative_s64(v_9);
npyv_storen_s64(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_s64 v_10 = npyv_loadn_s64(ip + 10 * vstep * istride, istride);
npyv_s64 r_10 = npyv_negative_s64(v_10);
npyv_storen_s64(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_s64 v_11 = npyv_loadn_s64(ip + 11 * vstep * istride, istride);
npyv_s64 r_11 = npyv_negative_s64(v_11);
npyv_storen_s64(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_s64 v_12 = npyv_loadn_s64(ip + 12 * vstep * istride, istride);
npyv_s64 r_12 = npyv_negative_s64(v_12);
npyv_storen_s64(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_s64 v_13 = npyv_loadn_s64(ip + 13 * vstep * istride, istride);
npyv_s64 r_13 = npyv_negative_s64(v_13);
npyv_storen_s64(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_s64 v_14 = npyv_loadn_s64(ip + 14 * vstep * istride, istride);
npyv_s64 r_14 = npyv_negative_s64(v_14);
npyv_storen_s64(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_s64 v_15 = npyv_loadn_s64(ip + 15 * vstep * istride, istride);
npyv_s64 r_15 = npyv_negative_s64(v_15);
@@ -4116,6 +4136,7 @@ simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
@@ -4545,6 +4566,8 @@ simd_unary_nc_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
npyv_lanetype_u64 *op, npy_intp ostride,
@@ -4555,112 +4578,112 @@ simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_u64 v_0 = npyv_loadn_u64(ip + 0 * vstep * istride, istride);
npyv_u64 r_0 = npyv_negative_u64(v_0);
npyv_storen_u64(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_u64 v_1 = npyv_loadn_u64(ip + 1 * vstep * istride, istride);
npyv_u64 r_1 = npyv_negative_u64(v_1);
npyv_storen_u64(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_u64 v_2 = npyv_loadn_u64(ip + 2 * vstep * istride, istride);
npyv_u64 r_2 = npyv_negative_u64(v_2);
npyv_storen_u64(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_u64 v_3 = npyv_loadn_u64(ip + 3 * vstep * istride, istride);
npyv_u64 r_3 = npyv_negative_u64(v_3);
npyv_storen_u64(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_u64 v_4 = npyv_loadn_u64(ip + 4 * vstep * istride, istride);
npyv_u64 r_4 = npyv_negative_u64(v_4);
npyv_storen_u64(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_u64 v_5 = npyv_loadn_u64(ip + 5 * vstep * istride, istride);
npyv_u64 r_5 = npyv_negative_u64(v_5);
npyv_storen_u64(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_u64 v_6 = npyv_loadn_u64(ip + 6 * vstep * istride, istride);
npyv_u64 r_6 = npyv_negative_u64(v_6);
npyv_storen_u64(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_u64 v_7 = npyv_loadn_u64(ip + 7 * vstep * istride, istride);
npyv_u64 r_7 = npyv_negative_u64(v_7);
npyv_storen_u64(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_u64 v_8 = npyv_loadn_u64(ip + 8 * vstep * istride, istride);
npyv_u64 r_8 = npyv_negative_u64(v_8);
npyv_storen_u64(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_u64 v_9 = npyv_loadn_u64(ip + 9 * vstep * istride, istride);
npyv_u64 r_9 = npyv_negative_u64(v_9);
npyv_storen_u64(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_u64 v_10 = npyv_loadn_u64(ip + 10 * vstep * istride, istride);
npyv_u64 r_10 = npyv_negative_u64(v_10);
npyv_storen_u64(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_u64 v_11 = npyv_loadn_u64(ip + 11 * vstep * istride, istride);
npyv_u64 r_11 = npyv_negative_u64(v_11);
npyv_storen_u64(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_u64 v_12 = npyv_loadn_u64(ip + 12 * vstep * istride, istride);
npyv_u64 r_12 = npyv_negative_u64(v_12);
npyv_storen_u64(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_u64 v_13 = npyv_loadn_u64(ip + 13 * vstep * istride, istride);
npyv_u64 r_13 = npyv_negative_u64(v_13);
npyv_storen_u64(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_u64 v_14 = npyv_loadn_u64(ip + 14 * vstep * istride, istride);
npyv_u64 r_14 = npyv_negative_u64(v_14);
npyv_storen_u64(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_u64 v_15 = npyv_loadn_u64(ip + 15 * vstep * istride, istride);
npyv_u64 r_15 = npyv_negative_u64(v_15);
@@ -4679,6 +4702,7 @@ simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD
@@ -5108,6 +5132,8 @@ simd_unary_nc_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
npyv_lanetype_f32 *op, npy_intp ostride,
@@ -5118,112 +5144,112 @@ simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_f32 v_0 = npyv_loadn_f32(ip + 0 * vstep * istride, istride);
npyv_f32 r_0 = npyv_negative_f32(v_0);
npyv_storen_f32(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_f32 v_1 = npyv_loadn_f32(ip + 1 * vstep * istride, istride);
npyv_f32 r_1 = npyv_negative_f32(v_1);
npyv_storen_f32(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_f32 v_2 = npyv_loadn_f32(ip + 2 * vstep * istride, istride);
npyv_f32 r_2 = npyv_negative_f32(v_2);
npyv_storen_f32(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_f32 v_3 = npyv_loadn_f32(ip + 3 * vstep * istride, istride);
npyv_f32 r_3 = npyv_negative_f32(v_3);
npyv_storen_f32(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_f32 v_4 = npyv_loadn_f32(ip + 4 * vstep * istride, istride);
npyv_f32 r_4 = npyv_negative_f32(v_4);
npyv_storen_f32(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_f32 v_5 = npyv_loadn_f32(ip + 5 * vstep * istride, istride);
npyv_f32 r_5 = npyv_negative_f32(v_5);
npyv_storen_f32(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_f32 v_6 = npyv_loadn_f32(ip + 6 * vstep * istride, istride);
npyv_f32 r_6 = npyv_negative_f32(v_6);
npyv_storen_f32(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_f32 v_7 = npyv_loadn_f32(ip + 7 * vstep * istride, istride);
npyv_f32 r_7 = npyv_negative_f32(v_7);
npyv_storen_f32(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_f32 v_8 = npyv_loadn_f32(ip + 8 * vstep * istride, istride);
npyv_f32 r_8 = npyv_negative_f32(v_8);
npyv_storen_f32(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_f32 v_9 = npyv_loadn_f32(ip + 9 * vstep * istride, istride);
npyv_f32 r_9 = npyv_negative_f32(v_9);
npyv_storen_f32(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_f32 v_10 = npyv_loadn_f32(ip + 10 * vstep * istride, istride);
npyv_f32 r_10 = npyv_negative_f32(v_10);
npyv_storen_f32(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_f32 v_11 = npyv_loadn_f32(ip + 11 * vstep * istride, istride);
npyv_f32 r_11 = npyv_negative_f32(v_11);
npyv_storen_f32(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_f32 v_12 = npyv_loadn_f32(ip + 12 * vstep * istride, istride);
npyv_f32 r_12 = npyv_negative_f32(v_12);
npyv_storen_f32(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_f32 v_13 = npyv_loadn_f32(ip + 13 * vstep * istride, istride);
npyv_f32 r_13 = npyv_negative_f32(v_13);
npyv_storen_f32(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_f32 v_14 = npyv_loadn_f32(ip + 14 * vstep * istride, istride);
npyv_f32 r_14 = npyv_negative_f32(v_14);
npyv_storen_f32(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_f32 v_15 = npyv_loadn_f32(ip + 15 * vstep * istride, istride);
npyv_f32 r_15 = npyv_negative_f32(v_15);
@@ -5242,6 +5268,7 @@ simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD_F32
@@ -5671,6 +5698,8 @@ simd_unary_nc_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
npyv_lanetype_f64 *op, npy_intp ostride,
@@ -5681,112 +5710,112 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
// unrolled vector loop
for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
- #line 211
+ #line 213
#if UNROLL > 0
npyv_f64 v_0 = npyv_loadn_f64(ip + 0 * vstep * istride, istride);
npyv_f64 r_0 = npyv_negative_f64(v_0);
npyv_storen_f64(op + 0 * vstep * ostride, ostride, r_0);
#endif
-#line 211
+#line 213
#if UNROLL > 1
npyv_f64 v_1 = npyv_loadn_f64(ip + 1 * vstep * istride, istride);
npyv_f64 r_1 = npyv_negative_f64(v_1);
npyv_storen_f64(op + 1 * vstep * ostride, ostride, r_1);
#endif
-#line 211
+#line 213
#if UNROLL > 2
npyv_f64 v_2 = npyv_loadn_f64(ip + 2 * vstep * istride, istride);
npyv_f64 r_2 = npyv_negative_f64(v_2);
npyv_storen_f64(op + 2 * vstep * ostride, ostride, r_2);
#endif
-#line 211
+#line 213
#if UNROLL > 3
npyv_f64 v_3 = npyv_loadn_f64(ip + 3 * vstep * istride, istride);
npyv_f64 r_3 = npyv_negative_f64(v_3);
npyv_storen_f64(op + 3 * vstep * ostride, ostride, r_3);
#endif
-#line 211
+#line 213
#if UNROLL > 4
npyv_f64 v_4 = npyv_loadn_f64(ip + 4 * vstep * istride, istride);
npyv_f64 r_4 = npyv_negative_f64(v_4);
npyv_storen_f64(op + 4 * vstep * ostride, ostride, r_4);
#endif
-#line 211
+#line 213
#if UNROLL > 5
npyv_f64 v_5 = npyv_loadn_f64(ip + 5 * vstep * istride, istride);
npyv_f64 r_5 = npyv_negative_f64(v_5);
npyv_storen_f64(op + 5 * vstep * ostride, ostride, r_5);
#endif
-#line 211
+#line 213
#if UNROLL > 6
npyv_f64 v_6 = npyv_loadn_f64(ip + 6 * vstep * istride, istride);
npyv_f64 r_6 = npyv_negative_f64(v_6);
npyv_storen_f64(op + 6 * vstep * ostride, ostride, r_6);
#endif
-#line 211
+#line 213
#if UNROLL > 7
npyv_f64 v_7 = npyv_loadn_f64(ip + 7 * vstep * istride, istride);
npyv_f64 r_7 = npyv_negative_f64(v_7);
npyv_storen_f64(op + 7 * vstep * ostride, ostride, r_7);
#endif
-#line 211
+#line 213
#if UNROLL > 8
npyv_f64 v_8 = npyv_loadn_f64(ip + 8 * vstep * istride, istride);
npyv_f64 r_8 = npyv_negative_f64(v_8);
npyv_storen_f64(op + 8 * vstep * ostride, ostride, r_8);
#endif
-#line 211
+#line 213
#if UNROLL > 9
npyv_f64 v_9 = npyv_loadn_f64(ip + 9 * vstep * istride, istride);
npyv_f64 r_9 = npyv_negative_f64(v_9);
npyv_storen_f64(op + 9 * vstep * ostride, ostride, r_9);
#endif
-#line 211
+#line 213
#if UNROLL > 10
npyv_f64 v_10 = npyv_loadn_f64(ip + 10 * vstep * istride, istride);
npyv_f64 r_10 = npyv_negative_f64(v_10);
npyv_storen_f64(op + 10 * vstep * ostride, ostride, r_10);
#endif
-#line 211
+#line 213
#if UNROLL > 11
npyv_f64 v_11 = npyv_loadn_f64(ip + 11 * vstep * istride, istride);
npyv_f64 r_11 = npyv_negative_f64(v_11);
npyv_storen_f64(op + 11 * vstep * ostride, ostride, r_11);
#endif
-#line 211
+#line 213
#if UNROLL > 12
npyv_f64 v_12 = npyv_loadn_f64(ip + 12 * vstep * istride, istride);
npyv_f64 r_12 = npyv_negative_f64(v_12);
npyv_storen_f64(op + 12 * vstep * ostride, ostride, r_12);
#endif
-#line 211
+#line 213
#if UNROLL > 13
npyv_f64 v_13 = npyv_loadn_f64(ip + 13 * vstep * istride, istride);
npyv_f64 r_13 = npyv_negative_f64(v_13);
npyv_storen_f64(op + 13 * vstep * ostride, ostride, r_13);
#endif
-#line 211
+#line 213
#if UNROLL > 14
npyv_f64 v_14 = npyv_loadn_f64(ip + 14 * vstep * istride, istride);
npyv_f64 r_14 = npyv_negative_f64(v_14);
npyv_storen_f64(op + 14 * vstep * ostride, ostride, r_14);
#endif
-#line 211
+#line 213
#if UNROLL > 15
npyv_f64 v_15 = npyv_loadn_f64(ip + 15 * vstep * istride, istride);
npyv_f64 r_15 = npyv_negative_f64(v_15);
@@ -5805,6 +5834,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
*op = scalar_negative(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // 1
#undef UNROLL
#endif // NPY_SIMD_F64
@@ -5814,10 +5844,10 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
/********************************************************************************
** Defining ufunc inner functions
********************************************************************************/
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -5833,7 +5863,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -5849,7 +5879,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -5865,7 +5895,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -5883,7 +5913,7 @@ simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -5921,8 +5951,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -5945,97 +5975,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_ubyte in_0 = *((const npy_ubyte *)(ip + 0 * istep));
*((npy_ubyte *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_ubyte in_1 = *((const npy_ubyte *)(ip + 1 * istep));
*((npy_ubyte *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_ubyte in_2 = *((const npy_ubyte *)(ip + 2 * istep));
*((npy_ubyte *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_ubyte in_3 = *((const npy_ubyte *)(ip + 3 * istep));
*((npy_ubyte *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_ubyte in_4 = *((const npy_ubyte *)(ip + 4 * istep));
*((npy_ubyte *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_ubyte in_5 = *((const npy_ubyte *)(ip + 5 * istep));
*((npy_ubyte *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_ubyte in_6 = *((const npy_ubyte *)(ip + 6 * istep));
*((npy_ubyte *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_ubyte in_7 = *((const npy_ubyte *)(ip + 7 * istep));
*((npy_ubyte *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_ubyte in_8 = *((const npy_ubyte *)(ip + 8 * istep));
*((npy_ubyte *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_ubyte in_9 = *((const npy_ubyte *)(ip + 9 * istep));
*((npy_ubyte *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_ubyte in_10 = *((const npy_ubyte *)(ip + 10 * istep));
*((npy_ubyte *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_ubyte in_11 = *((const npy_ubyte *)(ip + 11 * istep));
*((npy_ubyte *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_ubyte in_12 = *((const npy_ubyte *)(ip + 12 * istep));
*((npy_ubyte *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_ubyte in_13 = *((const npy_ubyte *)(ip + 13 * istep));
*((npy_ubyte *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_ubyte in_14 = *((const npy_ubyte *)(ip + 14 * istep));
*((npy_ubyte *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_ubyte in_15 = *((const npy_ubyte *)(ip + 15 * istep));
*((npy_ubyte *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -6055,10 +6085,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -6074,7 +6104,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -6090,7 +6120,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -6106,7 +6136,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -6124,7 +6154,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -6162,8 +6192,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -6186,97 +6216,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_ushort in_0 = *((const npy_ushort *)(ip + 0 * istep));
*((npy_ushort *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_ushort in_1 = *((const npy_ushort *)(ip + 1 * istep));
*((npy_ushort *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_ushort in_2 = *((const npy_ushort *)(ip + 2 * istep));
*((npy_ushort *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_ushort in_3 = *((const npy_ushort *)(ip + 3 * istep));
*((npy_ushort *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_ushort in_4 = *((const npy_ushort *)(ip + 4 * istep));
*((npy_ushort *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_ushort in_5 = *((const npy_ushort *)(ip + 5 * istep));
*((npy_ushort *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_ushort in_6 = *((const npy_ushort *)(ip + 6 * istep));
*((npy_ushort *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_ushort in_7 = *((const npy_ushort *)(ip + 7 * istep));
*((npy_ushort *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_ushort in_8 = *((const npy_ushort *)(ip + 8 * istep));
*((npy_ushort *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_ushort in_9 = *((const npy_ushort *)(ip + 9 * istep));
*((npy_ushort *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_ushort in_10 = *((const npy_ushort *)(ip + 10 * istep));
*((npy_ushort *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_ushort in_11 = *((const npy_ushort *)(ip + 11 * istep));
*((npy_ushort *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_ushort in_12 = *((const npy_ushort *)(ip + 12 * istep));
*((npy_ushort *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_ushort in_13 = *((const npy_ushort *)(ip + 13 * istep));
*((npy_ushort *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_ushort in_14 = *((const npy_ushort *)(ip + 14 * istep));
*((npy_ushort *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_ushort in_15 = *((const npy_ushort *)(ip + 15 * istep));
*((npy_ushort *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -6296,10 +6326,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -6315,7 +6345,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -6331,7 +6361,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -6347,7 +6377,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -6365,7 +6395,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -6403,8 +6433,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -6427,97 +6457,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_uint in_0 = *((const npy_uint *)(ip + 0 * istep));
*((npy_uint *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_uint in_1 = *((const npy_uint *)(ip + 1 * istep));
*((npy_uint *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_uint in_2 = *((const npy_uint *)(ip + 2 * istep));
*((npy_uint *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_uint in_3 = *((const npy_uint *)(ip + 3 * istep));
*((npy_uint *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_uint in_4 = *((const npy_uint *)(ip + 4 * istep));
*((npy_uint *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_uint in_5 = *((const npy_uint *)(ip + 5 * istep));
*((npy_uint *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_uint in_6 = *((const npy_uint *)(ip + 6 * istep));
*((npy_uint *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_uint in_7 = *((const npy_uint *)(ip + 7 * istep));
*((npy_uint *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_uint in_8 = *((const npy_uint *)(ip + 8 * istep));
*((npy_uint *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_uint in_9 = *((const npy_uint *)(ip + 9 * istep));
*((npy_uint *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_uint in_10 = *((const npy_uint *)(ip + 10 * istep));
*((npy_uint *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_uint in_11 = *((const npy_uint *)(ip + 11 * istep));
*((npy_uint *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_uint in_12 = *((const npy_uint *)(ip + 12 * istep));
*((npy_uint *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_uint in_13 = *((const npy_uint *)(ip + 13 * istep));
*((npy_uint *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_uint in_14 = *((const npy_uint *)(ip + 14 * istep));
*((npy_uint *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_uint in_15 = *((const npy_uint *)(ip + 15 * istep));
*((npy_uint *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -6537,10 +6567,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -6556,7 +6586,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -6572,7 +6602,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -6588,7 +6618,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -6606,7 +6636,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -6644,8 +6674,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -6668,97 +6698,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_ulong in_0 = *((const npy_ulong *)(ip + 0 * istep));
*((npy_ulong *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_ulong in_1 = *((const npy_ulong *)(ip + 1 * istep));
*((npy_ulong *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_ulong in_2 = *((const npy_ulong *)(ip + 2 * istep));
*((npy_ulong *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_ulong in_3 = *((const npy_ulong *)(ip + 3 * istep));
*((npy_ulong *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_ulong in_4 = *((const npy_ulong *)(ip + 4 * istep));
*((npy_ulong *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_ulong in_5 = *((const npy_ulong *)(ip + 5 * istep));
*((npy_ulong *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_ulong in_6 = *((const npy_ulong *)(ip + 6 * istep));
*((npy_ulong *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_ulong in_7 = *((const npy_ulong *)(ip + 7 * istep));
*((npy_ulong *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_ulong in_8 = *((const npy_ulong *)(ip + 8 * istep));
*((npy_ulong *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_ulong in_9 = *((const npy_ulong *)(ip + 9 * istep));
*((npy_ulong *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_ulong in_10 = *((const npy_ulong *)(ip + 10 * istep));
*((npy_ulong *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_ulong in_11 = *((const npy_ulong *)(ip + 11 * istep));
*((npy_ulong *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_ulong in_12 = *((const npy_ulong *)(ip + 12 * istep));
*((npy_ulong *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_ulong in_13 = *((const npy_ulong *)(ip + 13 * istep));
*((npy_ulong *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_ulong in_14 = *((const npy_ulong *)(ip + 14 * istep));
*((npy_ulong *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_ulong in_15 = *((const npy_ulong *)(ip + 15 * istep));
*((npy_ulong *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -6778,10 +6808,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -6797,7 +6827,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -6813,7 +6843,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -6829,7 +6859,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -6847,7 +6877,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -6885,8 +6915,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -6909,97 +6939,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_ulonglong in_0 = *((const npy_ulonglong *)(ip + 0 * istep));
*((npy_ulonglong *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_ulonglong in_1 = *((const npy_ulonglong *)(ip + 1 * istep));
*((npy_ulonglong *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_ulonglong in_2 = *((const npy_ulonglong *)(ip + 2 * istep));
*((npy_ulonglong *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_ulonglong in_3 = *((const npy_ulonglong *)(ip + 3 * istep));
*((npy_ulonglong *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_ulonglong in_4 = *((const npy_ulonglong *)(ip + 4 * istep));
*((npy_ulonglong *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_ulonglong in_5 = *((const npy_ulonglong *)(ip + 5 * istep));
*((npy_ulonglong *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_ulonglong in_6 = *((const npy_ulonglong *)(ip + 6 * istep));
*((npy_ulonglong *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_ulonglong in_7 = *((const npy_ulonglong *)(ip + 7 * istep));
*((npy_ulonglong *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_ulonglong in_8 = *((const npy_ulonglong *)(ip + 8 * istep));
*((npy_ulonglong *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_ulonglong in_9 = *((const npy_ulonglong *)(ip + 9 * istep));
*((npy_ulonglong *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_ulonglong in_10 = *((const npy_ulonglong *)(ip + 10 * istep));
*((npy_ulonglong *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_ulonglong in_11 = *((const npy_ulonglong *)(ip + 11 * istep));
*((npy_ulonglong *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_ulonglong in_12 = *((const npy_ulonglong *)(ip + 12 * istep));
*((npy_ulonglong *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_ulonglong in_13 = *((const npy_ulonglong *)(ip + 13 * istep));
*((npy_ulonglong *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_ulonglong in_14 = *((const npy_ulonglong *)(ip + 14 * istep));
*((npy_ulonglong *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_ulonglong in_15 = *((const npy_ulonglong *)(ip + 15 * istep));
*((npy_ulonglong *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -7019,10 +7049,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -7038,7 +7068,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -7054,7 +7084,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -7070,7 +7100,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -7088,7 +7118,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -7126,8 +7156,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -7150,97 +7180,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_byte in_0 = *((const npy_byte *)(ip + 0 * istep));
*((npy_byte *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_byte in_1 = *((const npy_byte *)(ip + 1 * istep));
*((npy_byte *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_byte in_2 = *((const npy_byte *)(ip + 2 * istep));
*((npy_byte *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_byte in_3 = *((const npy_byte *)(ip + 3 * istep));
*((npy_byte *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_byte in_4 = *((const npy_byte *)(ip + 4 * istep));
*((npy_byte *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_byte in_5 = *((const npy_byte *)(ip + 5 * istep));
*((npy_byte *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_byte in_6 = *((const npy_byte *)(ip + 6 * istep));
*((npy_byte *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_byte in_7 = *((const npy_byte *)(ip + 7 * istep));
*((npy_byte *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_byte in_8 = *((const npy_byte *)(ip + 8 * istep));
*((npy_byte *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_byte in_9 = *((const npy_byte *)(ip + 9 * istep));
*((npy_byte *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_byte in_10 = *((const npy_byte *)(ip + 10 * istep));
*((npy_byte *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_byte in_11 = *((const npy_byte *)(ip + 11 * istep));
*((npy_byte *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_byte in_12 = *((const npy_byte *)(ip + 12 * istep));
*((npy_byte *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_byte in_13 = *((const npy_byte *)(ip + 13 * istep));
*((npy_byte *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_byte in_14 = *((const npy_byte *)(ip + 14 * istep));
*((npy_byte *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_byte in_15 = *((const npy_byte *)(ip + 15 * istep));
*((npy_byte *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -7260,10 +7290,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -7279,7 +7309,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -7295,7 +7325,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -7311,7 +7341,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -7329,7 +7359,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -7367,8 +7397,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -7391,97 +7421,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_short in_0 = *((const npy_short *)(ip + 0 * istep));
*((npy_short *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_short in_1 = *((const npy_short *)(ip + 1 * istep));
*((npy_short *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_short in_2 = *((const npy_short *)(ip + 2 * istep));
*((npy_short *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_short in_3 = *((const npy_short *)(ip + 3 * istep));
*((npy_short *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_short in_4 = *((const npy_short *)(ip + 4 * istep));
*((npy_short *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_short in_5 = *((const npy_short *)(ip + 5 * istep));
*((npy_short *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_short in_6 = *((const npy_short *)(ip + 6 * istep));
*((npy_short *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_short in_7 = *((const npy_short *)(ip + 7 * istep));
*((npy_short *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_short in_8 = *((const npy_short *)(ip + 8 * istep));
*((npy_short *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_short in_9 = *((const npy_short *)(ip + 9 * istep));
*((npy_short *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_short in_10 = *((const npy_short *)(ip + 10 * istep));
*((npy_short *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_short in_11 = *((const npy_short *)(ip + 11 * istep));
*((npy_short *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_short in_12 = *((const npy_short *)(ip + 12 * istep));
*((npy_short *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_short in_13 = *((const npy_short *)(ip + 13 * istep));
*((npy_short *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_short in_14 = *((const npy_short *)(ip + 14 * istep));
*((npy_short *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_short in_15 = *((const npy_short *)(ip + 15 * istep));
*((npy_short *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -7501,10 +7531,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -7520,7 +7550,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -7536,7 +7566,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -7552,7 +7582,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_INT == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -7570,7 +7600,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -7608,8 +7638,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -7632,97 +7662,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_int in_0 = *((const npy_int *)(ip + 0 * istep));
*((npy_int *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_int in_1 = *((const npy_int *)(ip + 1 * istep));
*((npy_int *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_int in_2 = *((const npy_int *)(ip + 2 * istep));
*((npy_int *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_int in_3 = *((const npy_int *)(ip + 3 * istep));
*((npy_int *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_int in_4 = *((const npy_int *)(ip + 4 * istep));
*((npy_int *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_int in_5 = *((const npy_int *)(ip + 5 * istep));
*((npy_int *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_int in_6 = *((const npy_int *)(ip + 6 * istep));
*((npy_int *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_int in_7 = *((const npy_int *)(ip + 7 * istep));
*((npy_int *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_int in_8 = *((const npy_int *)(ip + 8 * istep));
*((npy_int *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_int in_9 = *((const npy_int *)(ip + 9 * istep));
*((npy_int *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_int in_10 = *((const npy_int *)(ip + 10 * istep));
*((npy_int *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_int in_11 = *((const npy_int *)(ip + 11 * istep));
*((npy_int *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_int in_12 = *((const npy_int *)(ip + 12 * istep));
*((npy_int *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_int in_13 = *((const npy_int *)(ip + 13 * istep));
*((npy_int *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_int in_14 = *((const npy_int *)(ip + 14 * istep));
*((npy_int *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_int in_15 = *((const npy_int *)(ip + 15 * istep));
*((npy_int *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -7742,10 +7772,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -7761,7 +7791,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -7777,7 +7807,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -7793,7 +7823,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -7811,7 +7841,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -7849,8 +7879,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -7873,97 +7903,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_long in_0 = *((const npy_long *)(ip + 0 * istep));
*((npy_long *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_long in_1 = *((const npy_long *)(ip + 1 * istep));
*((npy_long *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_long in_2 = *((const npy_long *)(ip + 2 * istep));
*((npy_long *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_long in_3 = *((const npy_long *)(ip + 3 * istep));
*((npy_long *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_long in_4 = *((const npy_long *)(ip + 4 * istep));
*((npy_long *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_long in_5 = *((const npy_long *)(ip + 5 * istep));
*((npy_long *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_long in_6 = *((const npy_long *)(ip + 6 * istep));
*((npy_long *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_long in_7 = *((const npy_long *)(ip + 7 * istep));
*((npy_long *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_long in_8 = *((const npy_long *)(ip + 8 * istep));
*((npy_long *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_long in_9 = *((const npy_long *)(ip + 9 * istep));
*((npy_long *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_long in_10 = *((const npy_long *)(ip + 10 * istep));
*((npy_long *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_long in_11 = *((const npy_long *)(ip + 11 * istep));
*((npy_long *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_long in_12 = *((const npy_long *)(ip + 12 * istep));
*((npy_long *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_long in_13 = *((const npy_long *)(ip + 13 * istep));
*((npy_long *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_long in_14 = *((const npy_long *)(ip + 14 * istep));
*((npy_long *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_long in_15 = *((const npy_long *)(ip + 15 * istep));
*((npy_long *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -7983,10 +8013,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
#if 0
#define TO_SIMD_SFX(X) X##_f8
@@ -8002,7 +8032,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
#if 0
#define TO_SIMD_SFX(X) X##_f16
@@ -8018,7 +8048,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
#if 0
#define TO_SIMD_SFX(X) X##_f32
@@ -8034,7 +8064,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
#if 0
#define TO_SIMD_SFX(X) X##_f64
@@ -8052,7 +8082,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -8090,8 +8120,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -8114,97 +8144,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_longlong in_0 = *((const npy_longlong *)(ip + 0 * istep));
*((npy_longlong *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_longlong in_1 = *((const npy_longlong *)(ip + 1 * istep));
*((npy_longlong *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_longlong in_2 = *((const npy_longlong *)(ip + 2 * istep));
*((npy_longlong *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_longlong in_3 = *((const npy_longlong *)(ip + 3 * istep));
*((npy_longlong *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_longlong in_4 = *((const npy_longlong *)(ip + 4 * istep));
*((npy_longlong *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_longlong in_5 = *((const npy_longlong *)(ip + 5 * istep));
*((npy_longlong *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_longlong in_6 = *((const npy_longlong *)(ip + 6 * istep));
*((npy_longlong *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_longlong in_7 = *((const npy_longlong *)(ip + 7 * istep));
*((npy_longlong *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_longlong in_8 = *((const npy_longlong *)(ip + 8 * istep));
*((npy_longlong *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_longlong in_9 = *((const npy_longlong *)(ip + 9 * istep));
*((npy_longlong *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_longlong in_10 = *((const npy_longlong *)(ip + 10 * istep));
*((npy_longlong *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_longlong in_11 = *((const npy_longlong *)(ip + 11 * istep));
*((npy_longlong *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_longlong in_12 = *((const npy_longlong *)(ip + 12 * istep));
*((npy_longlong *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_longlong in_13 = *((const npy_longlong *)(ip + 13 * istep));
*((npy_longlong *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_longlong in_14 = *((const npy_longlong *)(ip + 14 * istep));
*((npy_longlong *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_longlong in_15 = *((const npy_longlong *)(ip + 15 * istep));
*((npy_longlong *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -8224,10 +8254,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -8243,7 +8273,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -8259,7 +8289,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -8275,7 +8305,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_FLOAT == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -8293,7 +8323,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -8331,8 +8361,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -8355,97 +8385,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_float in_0 = *((const npy_float *)(ip + 0 * istep));
*((npy_float *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_float in_1 = *((const npy_float *)(ip + 1 * istep));
*((npy_float *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_float in_2 = *((const npy_float *)(ip + 2 * istep));
*((npy_float *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_float in_3 = *((const npy_float *)(ip + 3 * istep));
*((npy_float *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_float in_4 = *((const npy_float *)(ip + 4 * istep));
*((npy_float *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_float in_5 = *((const npy_float *)(ip + 5 * istep));
*((npy_float *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_float in_6 = *((const npy_float *)(ip + 6 * istep));
*((npy_float *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_float in_7 = *((const npy_float *)(ip + 7 * istep));
*((npy_float *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_float in_8 = *((const npy_float *)(ip + 8 * istep));
*((npy_float *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_float in_9 = *((const npy_float *)(ip + 9 * istep));
*((npy_float *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_float in_10 = *((const npy_float *)(ip + 10 * istep));
*((npy_float *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_float in_11 = *((const npy_float *)(ip + 11 * istep));
*((npy_float *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_float in_12 = *((const npy_float *)(ip + 12 * istep));
*((npy_float *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_float in_13 = *((const npy_float *)(ip + 13 * istep));
*((npy_float *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_float in_14 = *((const npy_float *)(ip + 14 * istep));
*((npy_float *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_float in_15 = *((const npy_float *)(ip + 15 * istep));
*((npy_float *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -8465,10 +8495,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -8484,7 +8514,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -8500,7 +8530,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -8516,7 +8546,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -8534,7 +8564,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -8572,8 +8602,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -8596,97 +8626,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_double in_0 = *((const npy_double *)(ip + 0 * istep));
*((npy_double *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_double in_1 = *((const npy_double *)(ip + 1 * istep));
*((npy_double *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_double in_2 = *((const npy_double *)(ip + 2 * istep));
*((npy_double *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_double in_3 = *((const npy_double *)(ip + 3 * istep));
*((npy_double *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_double in_4 = *((const npy_double *)(ip + 4 * istep));
*((npy_double *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_double in_5 = *((const npy_double *)(ip + 5 * istep));
*((npy_double *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_double in_6 = *((const npy_double *)(ip + 6 * istep));
*((npy_double *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_double in_7 = *((const npy_double *)(ip + 7 * istep));
*((npy_double *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_double in_8 = *((const npy_double *)(ip + 8 * istep));
*((npy_double *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_double in_9 = *((const npy_double *)(ip + 9 * istep));
*((npy_double *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_double in_10 = *((const npy_double *)(ip + 10 * istep));
*((npy_double *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_double in_11 = *((const npy_double *)(ip + 11 * istep));
*((npy_double *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_double in_12 = *((const npy_double *)(ip + 12 * istep));
*((npy_double *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_double in_13 = *((const npy_double *)(ip + 13 * istep));
*((npy_double *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_double in_14 = *((const npy_double *)(ip + 14 * istep));
*((npy_double *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_double in_15 = *((const npy_double *)(ip + 15 * istep));
*((npy_double *)(op + 15 * ostep)) = scalar_negative(in_15);
@@ -8706,10 +8736,10 @@ clear:
#endif
}
-#line 254
+#line 257
#undef TO_SIMD_SFX
#if 0
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 8
#if 1
#define TO_SIMD_SFX(X) X##_f8
@@ -8725,7 +8755,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s8
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 16
#if 1
#define TO_SIMD_SFX(X) X##_f16
@@ -8741,7 +8771,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s16
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 32
#if 1
#define TO_SIMD_SFX(X) X##_f32
@@ -8757,7 +8787,7 @@ clear:
#define TO_SIMD_SFX(X) X##_s32
#endif
-#line 259
+#line 262
#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 64
#if 1
#define TO_SIMD_SFX(X) X##_f64
@@ -8775,7 +8805,7 @@ clear:
#endif
-#line 280
+#line 283
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -8813,8 +8843,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_negative)(
@@ -8837,97 +8867,97 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative)
*/
#define UNROLL 8
for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
- #line 344
+ #line 347
#if UNROLL > 0
const npy_longdouble in_0 = *((const npy_longdouble *)(ip + 0 * istep));
*((npy_longdouble *)(op + 0 * ostep)) = scalar_negative(in_0);
#endif
-#line 344
+#line 347
#if UNROLL > 1
const npy_longdouble in_1 = *((const npy_longdouble *)(ip + 1 * istep));
*((npy_longdouble *)(op + 1 * ostep)) = scalar_negative(in_1);
#endif
-#line 344
+#line 347
#if UNROLL > 2
const npy_longdouble in_2 = *((const npy_longdouble *)(ip + 2 * istep));
*((npy_longdouble *)(op + 2 * ostep)) = scalar_negative(in_2);
#endif
-#line 344
+#line 347
#if UNROLL > 3
const npy_longdouble in_3 = *((const npy_longdouble *)(ip + 3 * istep));
*((npy_longdouble *)(op + 3 * ostep)) = scalar_negative(in_3);
#endif
-#line 344
+#line 347
#if UNROLL > 4
const npy_longdouble in_4 = *((const npy_longdouble *)(ip + 4 * istep));
*((npy_longdouble *)(op + 4 * ostep)) = scalar_negative(in_4);
#endif
-#line 344
+#line 347
#if UNROLL > 5
const npy_longdouble in_5 = *((const npy_longdouble *)(ip + 5 * istep));
*((npy_longdouble *)(op + 5 * ostep)) = scalar_negative(in_5);
#endif
-#line 344
+#line 347
#if UNROLL > 6
const npy_longdouble in_6 = *((const npy_longdouble *)(ip + 6 * istep));
*((npy_longdouble *)(op + 6 * ostep)) = scalar_negative(in_6);
#endif
-#line 344
+#line 347
#if UNROLL > 7
const npy_longdouble in_7 = *((const npy_longdouble *)(ip + 7 * istep));
*((npy_longdouble *)(op + 7 * ostep)) = scalar_negative(in_7);
#endif
-#line 344
+#line 347
#if UNROLL > 8
const npy_longdouble in_8 = *((const npy_longdouble *)(ip + 8 * istep));
*((npy_longdouble *)(op + 8 * ostep)) = scalar_negative(in_8);
#endif
-#line 344
+#line 347
#if UNROLL > 9
const npy_longdouble in_9 = *((const npy_longdouble *)(ip + 9 * istep));
*((npy_longdouble *)(op + 9 * ostep)) = scalar_negative(in_9);
#endif
-#line 344
+#line 347
#if UNROLL > 10
const npy_longdouble in_10 = *((const npy_longdouble *)(ip + 10 * istep));
*((npy_longdouble *)(op + 10 * ostep)) = scalar_negative(in_10);
#endif
-#line 344
+#line 347
#if UNROLL > 11
const npy_longdouble in_11 = *((const npy_longdouble *)(ip + 11 * istep));
*((npy_longdouble *)(op + 11 * ostep)) = scalar_negative(in_11);
#endif
-#line 344
+#line 347
#if UNROLL > 12
const npy_longdouble in_12 = *((const npy_longdouble *)(ip + 12 * istep));
*((npy_longdouble *)(op + 12 * ostep)) = scalar_negative(in_12);
#endif
-#line 344
+#line 347
#if UNROLL > 13
const npy_longdouble in_13 = *((const npy_longdouble *)(ip + 13 * istep));
*((npy_longdouble *)(op + 13 * ostep)) = scalar_negative(in_13);
#endif
-#line 344
+#line 347
#if UNROLL > 14
const npy_longdouble in_14 = *((const npy_longdouble *)(ip + 14 * istep));
*((npy_longdouble *)(op + 14 * ostep)) = scalar_negative(in_14);
#endif
-#line 344
+#line 347
#if UNROLL > 15
const npy_longdouble in_15 = *((const npy_longdouble *)(ip + 15 * istep));
*((npy_longdouble *)(op + 15 * ostep)) = scalar_negative(in_15);
diff --git a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src
index 1e2a81d20b..bfe4d892d0 100644
--- a/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src
+++ b/contrib/python/numpy/py3/numpy/core/src/umath/loops_unary.dispatch.c.src
@@ -195,6 +195,8 @@ simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
#undef UNROLL
#define UNROLL 2
#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
static NPY_INLINE void
simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
npyv_lanetype_@sfx@ *op, npy_intp ostride,
@@ -226,6 +228,7 @@ simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
*op = scalar_@intrin@(*ip);
}
}
+#endif // NPY_HAVE_SSE2
#endif // @supports_ncontig@
#undef UNROLL
#endif // @simd_chk@
@@ -314,8 +317,8 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
);
goto clear;
}
- // SSE2 does better with unrolled scalar for heavy non-contiguous
- #if !defined(NPY_HAVE_SSE2)
+ // X86 does better with unrolled scalar for heavy non-contiguous
+ #ifndef NPY_HAVE_SSE2
else if (istride != 1 && ostride != 1) {
// non-contiguous input and output
TO_SIMD_SFX(simd_unary_nn_@intrin@)(
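The template change above wraps the strided helper simd_unary_nn_@intrin@_@sfx@ in `#ifndef NPY_HAVE_SSE2` and switches the call-site guard from `#if !defined(NPY_HAVE_SSE2)` to the equivalent `#ifndef NPY_HAVE_SSE2`, so on x86 builds whose baseline is plain SSE2 the non-contiguous (both input and output strided) SIMD path is compiled out and control falls through to the unrolled scalar loop. The following is a minimal, self-contained C sketch of that dispatch shape only; the stand-in macro HAVE_SSE2_ONLY, the helper names, and the unroll factor are illustrative assumptions, not the generated NumPy code.

/* Sketch of the guard pattern: with HAVE_SSE2_ONLY defined, the strided
 * helper disappears and the unrolled scalar loop handles the
 * non-contiguous case instead. */
#include <stddef.h>
#include <stdio.h>

/* #define HAVE_SSE2_ONLY 1 */   /* toggle to mimic an SSE2-only baseline */

static double scalar_negative(double v) { return -v; }

#ifndef HAVE_SSE2_ONLY
/* Stands in for simd_unary_nn_negative_f64(): input and output both strided. */
static void strided_negative(const double *ip, ptrdiff_t istride,
                             double *op, ptrdiff_t ostride, size_t len)
{
    for (size_t i = 0; i < len; ++i) {
        op[i * ostride] = scalar_negative(ip[i * istride]);
    }
}
#endif

static void negative_kernel(const double *ip, ptrdiff_t istride,
                            double *op, ptrdiff_t ostride, size_t len)
{
#ifndef HAVE_SSE2_ONLY
    if (istride != 1 && ostride != 1) {
        /* non-contiguous input and output: take the (would-be SIMD) path */
        strided_negative(ip, istride, op, ostride, len);
        return;
    }
#endif
    /* unrolled scalar fallback, mirroring the UNROLL loop in the patch */
    #define UNROLL 4
    size_t i = 0;
    for (; i + UNROLL <= len; i += UNROLL) {
        op[(i + 0) * ostride] = scalar_negative(ip[(i + 0) * istride]);
        op[(i + 1) * ostride] = scalar_negative(ip[(i + 1) * istride]);
        op[(i + 2) * ostride] = scalar_negative(ip[(i + 2) * istride]);
        op[(i + 3) * ostride] = scalar_negative(ip[(i + 3) * istride]);
    }
    for (; i < len; ++i) {
        op[i * ostride] = scalar_negative(ip[i * istride]);
    }
    #undef UNROLL
}

int main(void)
{
    double in[8] = {1, -2, 3, -4, 5, -6, 7, -8};
    double out[8] = {0};
    negative_kernel(in, 2, out, 2, 4);   /* stride of 2 on both sides */
    printf("%g %g %g %g\n", out[0], out[2], out[4], out[6]);
    return 0;
}

Either way the kernel computes the same result; the guard only decides which loop body a given build compiles, which is why the generated per-type functions above gain matching `#ifndef NPY_HAVE_SSE2` / `#endif // NPY_HAVE_SSE2` pairs.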
diff --git a/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py b/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py
index 1bbdde1317..a88189e03e 100644
--- a/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py
+++ b/contrib/python/numpy/py3/numpy/core/tests/test_numeric.py
@@ -477,7 +477,14 @@ class TestBoolCmp:
self.signd[self.ed] *= -1.
self.signf[1::6][self.ef[1::6]] = -np.inf
self.signd[1::6][self.ed[1::6]] = -np.inf
- self.signf[3::6][self.ef[3::6]] = -np.nan
+ # On RISC-V, many operations that produce NaNs, such as converting
+ # a -NaN from f64 to f32, return a canonical NaN. The canonical
+ # NaNs are always positive. See section 11.3 NaN Generation and
+ # Propagation of the RISC-V Unprivileged ISA for more details.
+ # We disable the float32 sign test on riscv64 for -np.nan as the sign
+ # of the NaN will be lost when it's converted to a float32.
+ if platform.processor() != 'riscv64':
+ self.signf[3::6][self.ef[3::6]] = -np.nan
self.signd[3::6][self.ed[3::6]] = -np.nan
self.signf[4::6][self.ef[4::6]] = -0.
self.signd[4::6][self.ed[4::6]] = -0.
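
Editor's note: a minimal Python sketch of the guard added in the hunk above, assuming numpy is installed; it only illustrates why the float32 sign check is skipped on riscv64, where casting a negative NaN from f64 to f32 yields a canonical (positive) NaN.

    import platform
    import numpy as np

    signf = np.zeros(8, dtype=np.float32)
    if platform.processor() != 'riscv64':
        # On most targets the sign of -nan survives the cast to float32 ...
        signf[3] = -np.nan
        print(np.signbit(signf[3]))  # expected: True
    # ... but RISC-V produces a canonical NaN with a cleared sign bit, so the
    # assignment (and the later sign assertion) is skipped there.
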
diff --git a/contrib/python/numpy/py3/numpy/f2py/crackfortran.py b/contrib/python/numpy/py3/numpy/f2py/crackfortran.py
index 8d3fc27608..8d3fc27608 100755..100644
--- a/contrib/python/numpy/py3/numpy/f2py/crackfortran.py
+++ b/contrib/python/numpy/py3/numpy/f2py/crackfortran.py
diff --git a/contrib/python/numpy/py3/numpy/f2py/f2py2e.py b/contrib/python/numpy/py3/numpy/f2py/f2py2e.py
index ce22b2d8a9..ce22b2d8a9 100755..100644
--- a/contrib/python/numpy/py3/numpy/f2py/f2py2e.py
+++ b/contrib/python/numpy/py3/numpy/f2py/f2py2e.py
diff --git a/contrib/python/numpy/py3/numpy/f2py/rules.py b/contrib/python/numpy/py3/numpy/f2py/rules.py
index 009365e047..009365e047 100755..100644
--- a/contrib/python/numpy/py3/numpy/f2py/rules.py
+++ b/contrib/python/numpy/py3/numpy/f2py/rules.py
diff --git a/contrib/python/numpy/py3/numpy/f2py/tests/util.py b/contrib/python/numpy/py3/numpy/f2py/tests/util.py
index 75b257cdb8..6ed6c0855f 100644
--- a/contrib/python/numpy/py3/numpy/f2py/tests/util.py
+++ b/contrib/python/numpy/py3/numpy/f2py/tests/util.py
@@ -20,6 +20,7 @@ import contextlib
import numpy
from pathlib import Path
+from numpy.compat import asstr
from numpy._utils import asunicode
from numpy.testing import temppath, IS_WASM
from importlib import import_module
diff --git a/contrib/python/numpy/py3/numpy/lib/function_base.py b/contrib/python/numpy/py3/numpy/lib/function_base.py
index e75aca1e58..a3dab04d33 100644
--- a/contrib/python/numpy/py3/numpy/lib/function_base.py
+++ b/contrib/python/numpy/py3/numpy/lib/function_base.py
@@ -4655,7 +4655,8 @@ def _lerp(a, b, t, out=None):
diff_b_a = subtract(b, a)
# asanyarray is a stop-gap until gh-13105
lerp_interpolation = asanyarray(add(a, diff_b_a * t, out=out))
- subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t >= 0.5)
+ subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t >= 0.5,
+ casting='unsafe', dtype=type(lerp_interpolation.dtype))
if lerp_interpolation.ndim == 0 and out is None:
lerp_interpolation = lerp_interpolation[()] # unpack 0d arrays
return lerp_interpolation
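
Editor's note: the masked subtract above switches _lerp to the backward form for t >= 0.5. A standalone sketch of the same idea (function name and structure are illustrative, not numpy's API):

    import numpy as np

    def lerp(a, b, t):
        # Forward form a + t*(b - a) is accurate for small t; for t >= 0.5 the
        # backward form b - (1 - t)*(b - a) avoids cancellation near b.
        fwd = a + (b - a) * t
        return np.where(np.asarray(t) >= 0.5, b - (b - a) * (1 - t), fwd)

    print(lerp(0.0, 10.0, 0.75))  # 7.5
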
diff --git a/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py b/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py
index 11e44630e7..2bb73b6003 100644
--- a/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py
+++ b/contrib/python/numpy/py3/numpy/lib/tests/test_function_base.py
@@ -3606,6 +3606,10 @@ class TestQuantile:
assert_equal(q, Fraction(7, 2))
assert_equal(type(q), Fraction)
+ q = np.quantile(x, .5)
+ assert_equal(q, 1.75)
+ assert_equal(type(q), np.float64)
+
q = np.quantile(x, Fraction(1, 2))
assert_equal(q, Fraction(7, 4))
assert_equal(type(q), Fraction)
diff --git a/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp b/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp
index 0c0b35e9c0..3b5effe14a 100644
--- a/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp
+++ b/contrib/python/numpy/py3/numpy/linalg/umath_linalg.cpp
@@ -2259,7 +2259,7 @@ process_geev_results(GEEV_PARAMS_t<typ> *params, scalar_trait)
}
}
-
+#if 0
static inline fortran_int
call_geev(GEEV_PARAMS_t<fortran_complex>* params)
{
@@ -2275,6 +2275,8 @@ call_geev(GEEV_PARAMS_t<fortran_complex>* params)
&rv);
return rv;
}
+#endif
+
static inline fortran_int
call_geev(GEEV_PARAMS_t<fortran_doublecomplex>* params)
{
diff --git a/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py b/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py
index c1d4cdff8f..c1d4cdff8f 100755..100644
--- a/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py
+++ b/contrib/python/numpy/py3/numpy/testing/print_coercion_tables.py
diff --git a/contrib/python/numpy/py3/numpy/testing/setup.py b/contrib/python/numpy/py3/numpy/testing/setup.py
index 6f203e8727..6f203e8727 100755..100644
--- a/contrib/python/numpy/py3/numpy/testing/setup.py
+++ b/contrib/python/numpy/py3/numpy/testing/setup.py
diff --git a/contrib/python/numpy/py3/numpy/tests/test_warnings.py b/contrib/python/numpy/py3/numpy/tests/test_warnings.py
index ee5124c5d5..df90fcef8c 100644
--- a/contrib/python/numpy/py3/numpy/tests/test_warnings.py
+++ b/contrib/python/numpy/py3/numpy/tests/test_warnings.py
@@ -5,7 +5,6 @@ all of these occurrences but should catch almost all.
import pytest
from pathlib import Path
-import sys
import ast
import tokenize
import numpy
@@ -33,7 +32,7 @@ class FindFuncs(ast.NodeVisitor):
ast.NodeVisitor.generic_visit(self, node)
if p.ls[-1] == 'simplefilter' or p.ls[-1] == 'filterwarnings':
- if node.args[0].s == "ignore":
+ if node.args[0].value == "ignore":
raise AssertionError(
"warnings should have an appropriate stacklevel; found in "
"{} on line {}".format(self.__filename, node.lineno))
@@ -57,8 +56,6 @@ class FindFuncs(ast.NodeVisitor):
@pytest.mark.slow
-@pytest.mark.skipif(sys.version_info >= (3, 12),
- reason="Deprecation warning in ast")
def test_warning_calls():
# combined "ignore" and stacklevel error
base = Path(numpy.__file__).parent
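
Editor's note: the visitor above now reads node.args[0].value because, since Python 3.8, string literals parse as ast.Constant, and the old ast.Str/.s accessors emit deprecation warnings and are gone in 3.12 (which is why the version skip is dropped). A small self-contained check:

    import ast

    call = ast.parse('warnings.filterwarnings("ignore")').body[0].value
    arg = call.args[0]
    assert isinstance(arg, ast.Constant)
    print(arg.value)  # ignore
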
diff --git a/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py b/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py
index 68c6f5d03f..6f778e5515 100644
--- a/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py
+++ b/contrib/python/numpy/py3/numpy/typing/tests/test_typing.py
@@ -86,8 +86,6 @@ def strip_func(match: re.Match[str]) -> str:
return match.groups()[1]
-@pytest.mark.slow
-@pytest.mark.skipif(NO_MYPY, reason="Mypy is not installed")
@pytest.fixture(scope="module", autouse=True)
def run_mypy() -> None:
"""Clears the cache and run mypy before running any of the typing tests.
diff --git a/contrib/python/numpy/py3/numpy/version.py b/contrib/python/numpy/py3/numpy/version.py
index 692240a486..e96055ea6d 100644
--- a/contrib/python/numpy/py3/numpy/version.py
+++ b/contrib/python/numpy/py3/numpy/version.py
@@ -1,5 +1,5 @@
-version = "1.26.3"
+version = "1.26.4"
__version__ = version
full_version = version
diff --git a/contrib/python/numpy/py3/ya.make b/contrib/python/numpy/py3/ya.make
index 92042220c3..0eb98bef02 100644
--- a/contrib/python/numpy/py3/ya.make
+++ b/contrib/python/numpy/py3/ya.make
@@ -2,7 +2,7 @@ PY3_LIBRARY()
PROVIDES(numpy)
-VERSION(1.26.3)
+VERSION(1.26.4)
LICENSE(BSD-3-Clause)
diff --git a/yt/yt/client/api/rpc_proxy/client_base.cpp b/yt/yt/client/api/rpc_proxy/client_base.cpp
index e3c1283542..3cc5cdf8ea 100644
--- a/yt/yt/client/api/rpc_proxy/client_base.cpp
+++ b/yt/yt/client/api/rpc_proxy/client_base.cpp
@@ -80,7 +80,6 @@ TApiServiceProxy TClientBase::CreateApiServiceProxy(NRpc::IChannelPtr channel)
proxy.SetDefaultTimeout(config->RpcTimeout);
proxy.SetDefaultRequestCodec(config->RequestCodec);
proxy.SetDefaultResponseCodec(config->ResponseCodec);
- proxy.SetDefaultEnableLegacyRpcCodecs(config->EnableLegacyRpcCodecs);
NRpc::TStreamingParameters streamingParameters;
streamingParameters.ReadTimeout = config->DefaultStreamingStallTimeout;
diff --git a/yt/yt/client/api/rpc_proxy/config.cpp b/yt/yt/client/api/rpc_proxy/config.cpp
index 5a278c00bb..ce9c2f9262 100644
--- a/yt/yt/client/api/rpc_proxy/config.cpp
+++ b/yt/yt/client/api/rpc_proxy/config.cpp
@@ -100,9 +100,6 @@ void TConnectionConfig::Register(TRegistrar registrar)
.Default(NCompression::ECodec::None);
registrar.Parameter("response_codec", &TThis::ResponseCodec)
.Default(NCompression::ECodec::None);
- // COMPAT(kiselyovp): legacy RPC codecs
- registrar.Parameter("enable_legacy_rpc_codecs", &TThis::EnableLegacyRpcCodecs)
- .Default(true);
registrar.Parameter("enable_retries", &TThis::EnableRetries)
.Default(false);
diff --git a/yt/yt/client/api/rpc_proxy/config.h b/yt/yt/client/api/rpc_proxy/config.h
index 4e6a49be06..70bc9b00a3 100644
--- a/yt/yt/client/api/rpc_proxy/config.h
+++ b/yt/yt/client/api/rpc_proxy/config.h
@@ -66,8 +66,6 @@ public:
NCompression::ECodec RequestCodec;
NCompression::ECodec ResponseCodec;
- bool EnableLegacyRpcCodecs;
-
bool EnableRetries;
NRpc::TRetryingChannelConfigPtr RetryingChannel;
diff --git a/yt/yt/client/api/rpc_proxy/transaction_impl.cpp b/yt/yt/client/api/rpc_proxy/transaction_impl.cpp
index 7089f413e1..5bacf131a0 100644
--- a/yt/yt/client/api/rpc_proxy/transaction_impl.cpp
+++ b/yt/yt/client/api/rpc_proxy/transaction_impl.cpp
@@ -62,7 +62,6 @@ TTransaction::TTransaction(
Proxy_.SetDefaultTimeout(config->RpcTimeout);
Proxy_.SetDefaultRequestCodec(config->RequestCodec);
Proxy_.SetDefaultResponseCodec(config->ResponseCodec);
- Proxy_.SetDefaultEnableLegacyRpcCodecs(config->EnableLegacyRpcCodecs);
YT_LOG_DEBUG("%v (Type: %v, StartTimestamp: %v, Atomicity: %v, "
"Durability: %v, Timeout: %v, PingAncestors: %v, PingPeriod: %v, Sticky: %v, StickyProxyAddress: %v)",
diff --git a/yt/yt/client/cache/rpc.cpp b/yt/yt/client/cache/rpc.cpp
index 0512c3403b..3ae8626969 100644
--- a/yt/yt/client/cache/rpc.cpp
+++ b/yt/yt/client/cache/rpc.cpp
@@ -72,9 +72,6 @@ NApi::NRpcProxy::TConnectionConfigPtr GetConnectionConfig(const TConfig& config)
connectionConfig->ResponseCodec = GetCompressionCodecFromProto(config.GetResponseCodec());
connectionConfig->EnableRetries = config.GetEnableRetries();
- if (config.HasEnableLegacyRpcCodecs()) {
- connectionConfig->EnableLegacyRpcCodecs = config.GetEnableLegacyRpcCodecs();
- }
if (config.HasEnableSelectQueryTracingTag()) {
connectionConfig->EnableSelectQueryTracingTag = config.GetEnableSelectQueryTracingTag();
}
diff --git a/yt/yt/core/rpc/client-inl.h b/yt/yt/core/rpc/client-inl.h
index 894f21ff68..955014042d 100644
--- a/yt/yt/core/rpc/client-inl.h
+++ b/yt/yt/core/rpc/client-inl.h
@@ -85,15 +85,9 @@ TSharedRefArray TTypedClientRequest<TRequestMessage, TResponse>::SerializeHeader
{
TSharedRefArrayBuilder builder(Attachments().size() + 1);
- // COMPAT(kiselyovp): legacy RPC codecs
- builder.Add(EnableLegacyRpcCodecs_
- ? SerializeProtoToRefWithEnvelope(*this, RequestCodec_, false)
- : SerializeProtoToRefWithCompression(*this, RequestCodec_, false));
-
- auto attachmentCodecId = EnableLegacyRpcCodecs_
- ? NCompression::ECodec::None
- : RequestCodec_;
- auto compressedAttachments = CompressAttachments(Attachments(), attachmentCodecId);
+ builder.Add(SerializeProtoToRefWithCompression(*this, RequestCodec_, false));
+
+ auto compressedAttachments = CompressAttachments(Attachments(), RequestCodec_);
for (auto&& attachment : compressedAttachments) {
builder.Add(std::move(attachment));
}
@@ -132,7 +126,7 @@ bool TTypedClientResponse<TResponseMessage>::TryDeserializeBody(TRef data, std::
return codecId
? TryDeserializeProtoWithCompression(this, data, *codecId)
- // COMPAT(kiselyovp): legacy RPC codecs
+ // COMPAT(danilalexeev): legacy RPC codecs
: TryDeserializeProtoWithEnvelope(this, data);
}
@@ -149,7 +143,6 @@ TIntrusivePtr<T> TProxyBase::CreateRequest(const TMethodDescriptor& methodDescri
request->SetAcknowledgementTimeout(DefaultAcknowledgementTimeout_);
request->SetRequestCodec(DefaultRequestCodec_);
request->SetResponseCodec(DefaultResponseCodec_);
- request->SetEnableLegacyRpcCodecs(DefaultEnableLegacyRpcCodecs_);
request->SetMultiplexingBand(methodDescriptor.MultiplexingBand);
if (methodDescriptor.StreamingEnabled) {
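
Editor's note: with the legacy envelope path removed, SerializeHeaderless compresses the body and every attachment with the single configured request codec. A conceptual Python sketch of that layout (zlib stands in for the YT compression codecs; this is not the actual wire format):

    import zlib

    def serialize_headerless(body: bytes, attachments: list) -> list:
        # One codec for everything: part 0 is the compressed body, the rest are
        # the compressed attachments, mirroring the builder usage above.
        parts = [zlib.compress(body)]
        parts += [zlib.compress(a) for a in attachments]
        return parts

    print(len(serialize_headerless(b"proto-body", [b"a1", b"a2"])))  # 3
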
diff --git a/yt/yt/core/rpc/client.cpp b/yt/yt/core/rpc/client.cpp
index 74d42fc7d4..d01ec795d7 100644
--- a/yt/yt/core/rpc/client.cpp
+++ b/yt/yt/core/rpc/client.cpp
@@ -443,11 +443,8 @@ void TClientRequest::PrepareHeader()
return;
}
- // COMPAT(kiselyovp): legacy RPC codecs
- if (!EnableLegacyRpcCodecs_) {
- Header_.set_request_codec(ToProto<int>(RequestCodec_));
- Header_.set_response_codec(ToProto<int>(ResponseCodec_));
- }
+ Header_.set_request_codec(ToProto<int>(RequestCodec_));
+ Header_.set_response_codec(ToProto<int>(ResponseCodec_));
if (StreamingEnabled_) {
ToProto(Header_.mutable_server_attachments_streaming_parameters(), ServerAttachmentsStreamingParameters_);
@@ -464,11 +461,6 @@ void TClientRequest::PrepareHeader()
HeaderPrepared_.store(true);
}
-bool TClientRequest::IsLegacyRpcCodecsEnabled()
-{
- return EnableLegacyRpcCodecs_;
-}
-
TSharedRefArray TClientRequest::GetHeaderlessMessage() const
{
if (SerializedHeaderlessMessageSet_.load()) {
@@ -608,7 +600,7 @@ void TClientResponse::Deserialize(TSharedRefArray responseMessage)
THROW_ERROR_EXCEPTION(NRpc::EErrorCode::ProtocolError, "Error deserializing response header");
}
- // COMPAT(kiselyovp): legacy RPC codecs
+ // COMPAT(danilalexeev): legacy RPC codecs
std::optional<NCompression::ECodec> bodyCodecId;
NCompression::ECodec attachmentCodecId;
if (Header_.has_codec()) {
diff --git a/yt/yt/core/rpc/client.h b/yt/yt/core/rpc/client.h
index 5a04b9baca..fe9433282e 100644
--- a/yt/yt/core/rpc/client.h
+++ b/yt/yt/core/rpc/client.h
@@ -75,8 +75,6 @@ struct IClientRequest
virtual TMutationId GetMutationId() const = 0;
virtual void SetMutationId(TMutationId id) = 0;
- virtual bool IsLegacyRpcCodecsEnabled() = 0;
-
virtual size_t GetHash() const = 0;
// Extension methods.
@@ -135,7 +133,6 @@ public:
DEFINE_BYVAL_RW_PROPERTY(bool, ResponseHeavy);
DEFINE_BYVAL_RW_PROPERTY(NCompression::ECodec, RequestCodec, NCompression::ECodec::None);
DEFINE_BYVAL_RW_PROPERTY(NCompression::ECodec, ResponseCodec, NCompression::ECodec::None);
- DEFINE_BYVAL_RW_PROPERTY(bool, EnableLegacyRpcCodecs, true);
DEFINE_BYVAL_RW_PROPERTY(bool, GenerateAttachmentChecksums, true);
// Field is used on client side only. So it is never serialized.
DEFINE_BYREF_RW_PROPERTY(NTracing::TTraceContext::TTagList, TracingTags);
@@ -186,8 +183,6 @@ public:
size_t GetHash() const override;
- bool IsLegacyRpcCodecsEnabled() override;
-
EMultiplexingBand GetMultiplexingBand() const;
void SetMultiplexingBand(EMultiplexingBand band);
@@ -467,7 +462,6 @@ public:
DEFINE_BYVAL_RW_PROPERTY(std::optional<TDuration>, DefaultAcknowledgementTimeout);
DEFINE_BYVAL_RW_PROPERTY(NCompression::ECodec, DefaultRequestCodec, NCompression::ECodec::None);
DEFINE_BYVAL_RW_PROPERTY(NCompression::ECodec, DefaultResponseCodec, NCompression::ECodec::None);
- DEFINE_BYVAL_RW_PROPERTY(bool, DefaultEnableLegacyRpcCodecs, true);
DEFINE_BYREF_RW_PROPERTY(TStreamingParameters, DefaultClientAttachmentsStreamingParameters);
DEFINE_BYREF_RW_PROPERTY(TStreamingParameters, DefaultServerAttachmentsStreamingParameters);
diff --git a/yt/yt/core/rpc/grpc/channel.cpp b/yt/yt/core/rpc/grpc/channel.cpp
index 9a704294f1..3d9b0b8519 100644
--- a/yt/yt/core/rpc/grpc/channel.cpp
+++ b/yt/yt/core/rpc/grpc/channel.cpp
@@ -339,13 +339,16 @@ private:
return;
}
+ if (Request_->Header().has_request_codec()) {
+ InitialMetadataBuilder_.Add(RequestCodecKey, ToString(Request_->Header().request_codec()));
+ }
+ if (Request_->Header().has_response_codec()) {
+ InitialMetadataBuilder_.Add(ResponseCodecKey, ToString(Request_->Header().response_codec()));
+ }
+
YT_VERIFY(RequestBody_.Size() >= 2);
TMessageWithAttachments messageWithAttachments;
- if (Request_->IsLegacyRpcCodecsEnabled()) {
- messageWithAttachments.Message = ExtractMessageFromEnvelopedMessage(RequestBody_[1]);
- } else {
- messageWithAttachments.Message = RequestBody_[1];
- }
+ messageWithAttachments.Message = RequestBody_[1];
for (int index = 2; index < std::ssize(RequestBody_); ++index) {
messageWithAttachments.Attachments.push_back(RequestBody_[index]);
@@ -622,6 +625,9 @@ private:
NRpc::NProto::TResponseHeader responseHeader;
ToProto(responseHeader.mutable_request_id(), Request_->GetRequestId());
+ if (Request_->Header().has_response_codec()) {
+ responseHeader.set_codec(Request_->Header().response_codec());
+ }
auto responseMessage = CreateResponseMessage(
responseHeader,
diff --git a/yt/yt/core/rpc/grpc/helpers.cpp b/yt/yt/core/rpc/grpc/helpers.cpp
index 4279c5840d..c35d1f0ed9 100644
--- a/yt/yt/core/rpc/grpc/helpers.cpp
+++ b/yt/yt/core/rpc/grpc/helpers.cpp
@@ -279,32 +279,13 @@ TMessageWithAttachments ByteBufferToMessageWithAttachments(
messageBodySize = bufferSize;
}
- NYT::NProto::TSerializedMessageEnvelope envelope;
- // Codec remains "none".
-
- TEnvelopeFixedHeader fixedHeader;
- fixedHeader.EnvelopeSize = envelope.ByteSize();
- fixedHeader.MessageSize = *messageBodySize;
-
- size_t totalMessageSize =
- sizeof (TEnvelopeFixedHeader) +
- fixedHeader.EnvelopeSize +
- fixedHeader.MessageSize;
-
auto data = TSharedMutableRef::Allocate<TMessageTag>(
- totalMessageSize,
+ *messageBodySize,
{.InitializeStorage = false});
- char* targetFixedHeader = data.Begin();
- char* targetHeader = targetFixedHeader + sizeof (TEnvelopeFixedHeader);
- char* targetMessage = targetHeader + fixedHeader.EnvelopeSize;
-
- memcpy(targetFixedHeader, &fixedHeader, sizeof (fixedHeader));
- YT_VERIFY(envelope.SerializeToArray(targetHeader, fixedHeader.EnvelopeSize));
-
TGrpcByteBufferStream stream(buffer);
- if (stream.Load(targetMessage, *messageBodySize) != *messageBodySize) {
+ if (stream.Load(data.begin(), *messageBodySize) != *messageBodySize) {
THROW_ERROR_EXCEPTION("Unexpected end of stream while reading message body");
}
@@ -389,23 +370,6 @@ TGrpcByteBufferPtr MessageWithAttachmentsToByteBuffer(const TMessageWithAttachme
return TGrpcByteBufferPtr(buffer);
}
-TSharedRef ExtractMessageFromEnvelopedMessage(const TSharedRef& data)
-{
- YT_VERIFY(data.Size() >= sizeof(TEnvelopeFixedHeader));
- const auto* fixedHeader = reinterpret_cast<const TEnvelopeFixedHeader*>(data.Begin());
- const char* sourceHeader = data.Begin() + sizeof(TEnvelopeFixedHeader);
- const char* sourceMessage = sourceHeader + fixedHeader->EnvelopeSize;
-
- NYT::NProto::TSerializedMessageEnvelope envelope;
- YT_VERIFY(envelope.ParseFromArray(sourceHeader, fixedHeader->EnvelopeSize));
-
- auto compressedMessage = data.Slice(sourceMessage, sourceMessage + fixedHeader->MessageSize);
-
- auto codecId = CheckedEnumCast<NCompression::ECodec>(envelope.codec());
- auto* codec = NCompression::GetCodec(codecId);
- return codec->Decompress(compressedMessage);
-}
-
TErrorCode StatusCodeToErrorCode(grpc_status_code statusCode)
{
switch (statusCode) {
diff --git a/yt/yt/core/rpc/grpc/helpers.h b/yt/yt/core/rpc/grpc/helpers.h
index ebcc9601e3..a85879dcb8 100644
--- a/yt/yt/core/rpc/grpc/helpers.h
+++ b/yt/yt/core/rpc/grpc/helpers.h
@@ -277,8 +277,6 @@ TMessageWithAttachments ByteBufferToMessageWithAttachments(
TGrpcByteBufferPtr MessageWithAttachmentsToByteBuffer(
const TMessageWithAttachments& messageWithAttachments);
-TSharedRef ExtractMessageFromEnvelopedMessage(const TSharedRef& data);
-
////////////////////////////////////////////////////////////////////////////////
TErrorCode StatusCodeToErrorCode(grpc_status_code statusCode);
diff --git a/yt/yt/core/rpc/grpc/public.cpp b/yt/yt/core/rpc/grpc/public.cpp
index 132c2de784..c39b99a541 100644
--- a/yt/yt/core/rpc/grpc/public.cpp
+++ b/yt/yt/core/rpc/grpc/public.cpp
@@ -24,6 +24,8 @@ const char* const AuthServiceTicketMetadataKey = "yt-auth-service-ticket";
const char* const ErrorMetadataKey = "yt-error-bin";
const char* const MessageBodySizeMetadataKey = "yt-message-body-size";
const char* const ProtocolVersionMetadataKey = "yt-protocol-version";
+const char* const RequestCodecKey = "yt-request-codec";
+const char* const ResponseCodecKey = "yt-response-codec";
const THashSet<TStringBuf>& GetNativeMetadataKeys()
{
@@ -45,6 +47,8 @@ const THashSet<TStringBuf>& GetNativeMetadataKeys()
ErrorMetadataKey,
MessageBodySizeMetadataKey,
ProtocolVersionMetadataKey,
+ RequestCodecKey,
+ ResponseCodecKey,
};
return result;
}
diff --git a/yt/yt/core/rpc/grpc/public.h b/yt/yt/core/rpc/grpc/public.h
index d3da4f43f7..58773a258b 100644
--- a/yt/yt/core/rpc/grpc/public.h
+++ b/yt/yt/core/rpc/grpc/public.h
@@ -34,6 +34,8 @@ extern const char* const AuthServiceTicketMetadataKey;
extern const char* const ErrorMetadataKey;
extern const char* const MessageBodySizeMetadataKey;
extern const char* const ProtocolVersionMetadataKey;
+extern const char* const RequestCodecKey;
+extern const char* const ResponseCodecKey;
// After adding a new metadata key, do not forget to add it in GetNativeMetadataKeys.
const THashSet<TStringBuf>& GetNativeMetadataKeys();
diff --git a/yt/yt/core/rpc/grpc/server.cpp b/yt/yt/core/rpc/grpc/server.cpp
index d9efaace43..db935778c7 100644
--- a/yt/yt/core/rpc/grpc/server.cpp
+++ b/yt/yt/core/rpc/grpc/server.cpp
@@ -419,6 +419,8 @@ private:
TString ServiceName_;
TString MethodName_;
std::optional<TDuration> Timeout_;
+ NCompression::ECodec RequestCodec_ = NCompression::ECodec::None;
+ NCompression::ECodec ResponseCodec_ = NCompression::ECodec::None;
IServicePtr Service_;
TGrpcMetadataArrayBuilder InitialMetadataBuilder_;
@@ -480,6 +482,8 @@ private:
ParseRpcCredentials();
ParseCustomMetadata();
ParseTimeout();
+ ParseRequestCodec();
+ ParseResponseCodec();
try {
SslCredentialsExt_ = WaitFor(ParseSslCredentials())
@@ -665,6 +669,54 @@ private:
UserAgent_ = TString(userAgentString);
}
+ void ParseRequestCodec()
+ {
+ auto requestCodecString = CallMetadata_.Find(RequestCodecKey);
+ if (!requestCodecString) {
+ return;
+ }
+
+ NCompression::ECodec codecId;
+ int intCodecId;
+ if (!TryFromString(requestCodecString, intCodecId)) {
+ YT_LOG_WARNING("Failed to parse request codec from request metadata (RequestId: %v)",
+ RequestId_);
+ return;
+ }
+ if (!TryEnumCast(intCodecId, &codecId)) {
+ YT_LOG_WARNING("Request codec %v is not supported (RequestId: %v)",
+ intCodecId,
+ RequestId_);
+ return;
+ }
+
+ RequestCodec_ = codecId;
+ }
+
+ void ParseResponseCodec()
+ {
+ auto responseCodecString = CallMetadata_.Find(ResponseCodecKey);
+ if (!responseCodecString) {
+ return;
+ }
+
+ NCompression::ECodec codecId;
+ int intCodecId;
+ if (!TryFromString(responseCodecString, intCodecId)) {
+ YT_LOG_WARNING("Failed to parse response codec from request metadata (RequestId: %v)",
+ RequestId_);
+ return;
+ }
+ if (!TryEnumCast(intCodecId, &codecId)) {
+ YT_LOG_WARNING("Response codec %v is not supported (RequestId: %v)",
+ intCodecId,
+ RequestId_);
+ return;
+ }
+
+ ResponseCodec_ = codecId;
+ }
+
void ParseRpcCredentials()
{
auto tokenString = CallMetadata_.Find(AuthTokenMetadataKey);
@@ -901,6 +953,9 @@ private:
header->set_method(MethodName_);
header->set_protocol_version_major(ProtocolVersion_.Major);
header->set_protocol_version_minor(ProtocolVersion_.Minor);
+ header->set_request_codec(ToProto<int>(RequestCodec_));
+ header->set_response_codec(ToProto<int>(ResponseCodec_));
+
if (Timeout_) {
header->set_timeout(ToProto<i64>(*Timeout_));
}
@@ -1032,7 +1087,7 @@ private:
YT_VERIFY(ResponseMessage_.Size() >= 2);
TMessageWithAttachments messageWithAttachments;
- messageWithAttachments.Message = ExtractMessageFromEnvelopedMessage(ResponseMessage_[1]);
+ messageWithAttachments.Message = ResponseMessage_[1];
for (int index = 2; index < std::ssize(ResponseMessage_); ++index) {
messageWithAttachments.Attachments.push_back(ResponseMessage_[index]);
}
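
Editor's note: ParseRequestCodec/ParseResponseCodec above read the new yt-request-codec / yt-response-codec metadata keys and keep ECodec::None when the value is missing or malformed. A hedged Python sketch of that fallback behaviour (the enum values are made up for illustration):

    from enum import IntEnum

    class Codec(IntEnum):       # hypothetical ids, not YT's real numbering
        NONE = 0
        LZ4 = 1
        ZSTD_1 = 2

    def parse_codec(metadata: dict, key: str) -> Codec:
        raw = metadata.get(key)
        if raw is None:
            return Codec.NONE          # key absent: keep the default
        try:
            return Codec(int(raw))     # malformed or unknown ids also fall back
        except ValueError:
            return Codec.NONE

    print(parse_codec({"yt-request-codec": "2"}, "yt-request-codec"))  # Codec.ZSTD_1
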
diff --git a/yt/yt/core/rpc/message.cpp b/yt/yt/core/rpc/message.cpp
index d76dfa7c1e..6a5b6b7b2b 100644
--- a/yt/yt/core/rpc/message.cpp
+++ b/yt/yt/core/rpc/message.cpp
@@ -51,27 +51,6 @@ void SerializeAndAddProtoWithHeader(
message.SerializeWithCachedSizesToArray(reinterpret_cast<google::protobuf::uint8*>(ref.Begin() + sizeof(fixedHeader)));
}
-size_t GetAllocationSpaceForProtoWithEnvelope(const google::protobuf::MessageLite& message)
-{
- return
- sizeof (TEnvelopeFixedHeader) +
- message.ByteSizeLong();
-}
-
-void SerializeAndAddProtoWithEnvelope(
- TSharedRefArrayBuilder* builder,
- const google::protobuf::MessageLite& message)
-{
- auto ref = builder->AllocateAndAdd(
- sizeof (TEnvelopeFixedHeader) +
- message.GetCachedSize());
- auto* header = static_cast<TEnvelopeFixedHeader*>(static_cast<void*>(ref.Begin()));
- // Empty (default) TSerializedMessageEnvelope.
- header->EnvelopeSize = 0;
- header->MessageSize = message.GetCachedSize();
- message.SerializeWithCachedSizesToArray(reinterpret_cast<google::protobuf::uint8*>(ref.Begin() + sizeof(TEnvelopeFixedHeader)));
-}
-
bool DeserializeFromProtoWithHeader(
google::protobuf::MessageLite* message,
TRef data)
@@ -165,17 +144,17 @@ TSharedRefArray CreateResponseMessage(
const std::vector<TSharedRef>& attachments)
{
NProto::TResponseHeader header;
+ header.set_codec(ToProto<int>(NCompression::ECodec::None));
TSharedRefArrayBuilder builder(
2 + attachments.size(),
- GetAllocationSpaceForProtoWithHeader(header) + GetAllocationSpaceForProtoWithEnvelope(body),
+ GetAllocationSpaceForProtoWithHeader(header) + body.ByteSizeLong(),
GetRefCountedTypeCookie<TSerializedMessageTag>());
SerializeAndAddProtoWithHeader(
&builder,
TFixedMessageHeader{EMessageType::Response},
header);
- SerializeAndAddProtoWithEnvelope(
- &builder,
- body);
+ auto ref = builder.AllocateAndAdd(body.GetCachedSize());
+ body.SerializeWithCachedSizesToArray(reinterpret_cast<google::protobuf::uint8*>(ref.Begin()));
for (auto attachment : attachments) {
builder.Add(std::move(attachment));
}
diff --git a/yt/yt/core/rpc/message_format.cpp b/yt/yt/core/rpc/message_format.cpp
index 01e1dd4d27..8ef05c7b5d 100644
--- a/yt/yt/core/rpc/message_format.cpp
+++ b/yt/yt/core/rpc/message_format.cpp
@@ -71,21 +71,19 @@ public:
TSharedRef ConvertFrom(const TSharedRef& message, const NYson::TProtobufMessageType* messageType, const TYsonString& /*formatOptionsYson*/) override
{
- auto ysonBuffer = PopEnvelope(message);
TString protoBuffer;
{
google::protobuf::io::StringOutputStream output(&protoBuffer);
auto converter = CreateProtobufWriter(&output, messageType);
// NB: formatOptionsYson is ignored, since YSON parser has no user-defined options.
- ParseYsonStringBuffer(TStringBuf(ysonBuffer.Begin(), ysonBuffer.End()), EYsonType::Node, converter.get());
+ ParseYsonStringBuffer(TStringBuf(message.Begin(), message.End()), EYsonType::Node, converter.get());
}
- return PushEnvelope(TSharedRef::FromString(protoBuffer));
+ return TSharedRef::FromString(protoBuffer);
}
TSharedRef ConvertTo(const TSharedRef& message, const NYson::TProtobufMessageType* messageType, const TYsonString& /*formatOptionsYson*/) override
{
- auto protoBuffer = PopEnvelope(message);
- google::protobuf::io::ArrayInputStream stream(protoBuffer.Begin(), protoBuffer.Size());
+ google::protobuf::io::ArrayInputStream stream(message.Begin(), message.Size());
TString ysonBuffer;
{
TStringOutput output(ysonBuffer);
@@ -93,7 +91,7 @@ public:
TYsonWriter writer{&output, EYsonFormat::Text};
ParseProtobuf(&writer, &stream, messageType);
}
- return PushEnvelope(TSharedRef::FromString(ysonBuffer));
+ return TSharedRef::FromString(ysonBuffer);
}
} YsonFormat;
@@ -108,25 +106,23 @@ public:
TSharedRef ConvertFrom(const TSharedRef& message, const NYson::TProtobufMessageType* messageType, const TYsonString& formatOptionsYson) override
{
- auto jsonBuffer = PopEnvelope(message);
TString protoBuffer;
{
google::protobuf::io::StringOutputStream output(&protoBuffer);
auto converter = CreateProtobufWriter(&output, messageType);
- TMemoryInput input{jsonBuffer.Begin(), jsonBuffer.Size()};
+ TMemoryInput input{message.Begin(), message.Size()};
auto formatConfig = New<TJsonFormatConfig>();
if (formatOptionsYson) {
formatConfig->Load(NYTree::ConvertToNode(formatOptionsYson));
}
ParseJson(&input, converter.get(), formatConfig);
}
- return PushEnvelope(TSharedRef::FromString(protoBuffer));
+ return TSharedRef::FromString(protoBuffer);
}
TSharedRef ConvertTo(const TSharedRef& message, const NYson::TProtobufMessageType* messageType, const TYsonString& formatOptionsYson) override
{
- auto protoBuffer = PopEnvelope(message);
- google::protobuf::io::ArrayInputStream stream(protoBuffer.Begin(), protoBuffer.Size());
+ google::protobuf::io::ArrayInputStream stream(message.Begin(), message.Size());
TString ysonBuffer;
{
TStringOutput output(ysonBuffer);
@@ -138,7 +134,7 @@ public:
ParseProtobuf(writer.get(), &stream, messageType);
writer->Flush();
}
- return PushEnvelope(TSharedRef::FromString(ysonBuffer));
+ return TSharedRef::FromString(ysonBuffer);
}
} JsonFormat;
diff --git a/yt/yt/core/rpc/server_detail.cpp b/yt/yt/core/rpc/server_detail.cpp
index 9218725013..dcf3241ef7 100644
--- a/yt/yt/core/rpc/server_detail.cpp
+++ b/yt/yt/core/rpc/server_detail.cpp
@@ -88,15 +88,6 @@ void TServiceContextBase::Reply(const TSharedRefArray& responseMessage)
TResponseHeader header;
YT_VERIFY(TryParseResponseHeader(responseMessage, &header));
- // COMPAT(danilalexeev): legacy RPC codecs
- if (header.has_codec()) {
- YT_VERIFY(TryEnumCast(header.codec(), &ResponseCodec_));
- SetResponseBodySerializedWithCompression();
- }
- if (header.has_format()) {
- RequestHeader_->set_response_format(header.format());
- }
-
if (header.has_error()) {
Error_ = FromProto<TError>(header.error());
}
@@ -106,6 +97,11 @@ void TServiceContextBase::Reply(const TSharedRefArray& responseMessage)
ResponseAttachments_ = std::vector<TSharedRef>(
responseMessage.Begin() + 2,
responseMessage.End());
+
+ YT_VERIFY(header.has_codec() && TryEnumCast(header.codec(), &ResponseCodec_));
+ if (header.has_format()) {
+ RequestHeader_->set_response_format(header.format());
+ }
} else {
ResponseBody_.Reset();
ResponseAttachments_.clear();
@@ -190,10 +186,7 @@ TSharedRefArray TServiceContextBase::BuildResponseMessage()
header.set_format(RequestHeader_->response_format());
}
- // COMPAT(danilalexeev)
- if (IsResponseBodySerializedWithCompression()) {
- header.set_codec(static_cast<int>(ResponseCodec_));
- }
+ header.set_codec(static_cast<int>(ResponseCodec_));
auto message = Error_.IsOK()
? CreateResponseMessage(
@@ -471,16 +464,6 @@ void TServiceContextBase::SetResponseCodec(NCompression::ECodec codec)
ResponseCodec_ = codec;
}
-bool TServiceContextBase::IsResponseBodySerializedWithCompression() const
-{
- return ResponseBodySerializedWithCompression_;
-}
-
-void TServiceContextBase::SetResponseBodySerializedWithCompression()
-{
- ResponseBodySerializedWithCompression_ = true;
-}
-
////////////////////////////////////////////////////////////////////////////////
TServiceContextWrapper::TServiceContextWrapper(IServiceContextPtr underlyingContext)
@@ -745,16 +728,6 @@ void TServiceContextWrapper::SetResponseCodec(NCompression::ECodec codec)
UnderlyingContext_->SetResponseCodec(codec);
}
-bool TServiceContextWrapper::IsResponseBodySerializedWithCompression() const
-{
- return UnderlyingContext_->IsResponseBodySerializedWithCompression();
-}
-
-void TServiceContextWrapper::SetResponseBodySerializedWithCompression()
-{
- UnderlyingContext_->SetResponseBodySerializedWithCompression();
-}
-
const IServiceContextPtr& TServiceContextWrapper::GetUnderlyingContext() const
{
return UnderlyingContext_;
diff --git a/yt/yt/core/rpc/server_detail.h b/yt/yt/core/rpc/server_detail.h
index aceba1f14e..4b6ecc834f 100644
--- a/yt/yt/core/rpc/server_detail.h
+++ b/yt/yt/core/rpc/server_detail.h
@@ -102,9 +102,6 @@ public:
NCompression::ECodec GetResponseCodec() const override;
void SetResponseCodec(NCompression::ECodec codec) override;
- bool IsResponseBodySerializedWithCompression() const override;
- void SetResponseBodySerializedWithCompression() override;
-
protected:
std::unique_ptr<NProto::TRequestHeader> RequestHeader_;
TSharedRefArray RequestMessage_;
@@ -132,8 +129,6 @@ protected:
TCompactVector<TString, 4> ResponseInfos_;
NCompression::ECodec ResponseCodec_ = NCompression::ECodec::None;
- // COMPAT(danilalexeev)
- bool ResponseBodySerializedWithCompression_ = false;
TSingleShotCallbackList<void()> RepliedList_;
@@ -249,9 +244,6 @@ public:
NCompression::ECodec GetResponseCodec() const override;
void SetResponseCodec(NCompression::ECodec codec) override;
- bool IsResponseBodySerializedWithCompression() const override;
- void SetResponseBodySerializedWithCompression() override;
-
const IServiceContextPtr& GetUnderlyingContext() const;
private:
diff --git a/yt/yt/core/rpc/service.h b/yt/yt/core/rpc/service.h
index 1c2e721730..2e2a087e4f 100644
--- a/yt/yt/core/rpc/service.h
+++ b/yt/yt/core/rpc/service.h
@@ -231,11 +231,6 @@ struct IServiceContext
//! Changes the response codec.
virtual void SetResponseCodec(NCompression::ECodec codec) = 0;
- // COPMAT(danilalexeev)
- //! Returnes true if response body has been serialized with compression.
- virtual bool IsResponseBodySerializedWithCompression() const = 0;
- virtual void SetResponseBodySerializedWithCompression() = 0;
-
// Extension methods.
void SetRequestInfo();
diff --git a/yt/yt/core/rpc/service_detail.cpp b/yt/yt/core/rpc/service_detail.cpp
index 003c015241..92e81ea0ed 100644
--- a/yt/yt/core/rpc/service_detail.cpp
+++ b/yt/yt/core/rpc/service_detail.cpp
@@ -697,32 +697,13 @@ private:
MethodPerformanceCounters_->RemoteWaitTimeCounter.Record(now - retryStart);
}
- // COMPAT(kiselyovp): legacy RPC codecs
- if (RequestHeader_->has_request_codec()) {
- int intRequestCodecId = RequestHeader_->request_codec();
- if (!TryEnumCast(intRequestCodecId, &RequestCodec_)) {
- Reply(TError(
- NRpc::EErrorCode::ProtocolError,
- "Request codec %v is not supported",
- intRequestCodecId));
- return;
- }
- } else {
- RequestCodec_ = NCompression::ECodec::None;
- }
-
- if (RequestHeader_->has_response_codec()) {
- int intResponseCodecId = RequestHeader_->response_codec();
- if (!TryEnumCast(intResponseCodecId, &ResponseCodec_)) {
- Reply(TError(
- NRpc::EErrorCode::ProtocolError,
- "Response codec %v is not supported",
- intResponseCodecId));
- return;
- }
- } else {
- ResponseCodec_ = NCompression::ECodec::None;
- }
+ // COMPAT(danilalexeev): legacy RPC codecs
+ RequestCodec_ = RequestHeader_->has_request_codec()
+ ? CheckedEnumCast<NCompression::ECodec>(RequestHeader_->request_codec())
+ : NCompression::ECodec::None;
+ ResponseCodec_ = RequestHeader_->has_response_codec()
+ ? CheckedEnumCast<NCompression::ECodec>(RequestHeader_->response_codec())
+ : NCompression::ECodec::None;
Service_->IncrementActiveRequestCount();
ActiveRequestCountIncremented_ = true;
@@ -786,7 +767,7 @@ private:
GetTotalMessageAttachmentSize(RequestMessage_),
GetMessageAttachmentCount(RequestMessage_));
- // COMPAT(kiselyovp)
+ // COMPAT(danilalexeev): legacy RPC codecs
if (RequestHeader_->has_request_codec() && RequestHeader_->has_response_codec()) {
delimitedBuilder->AppendFormat("RequestCodec: %v, ResponseCodec: %v",
RequestCodec_,
@@ -1928,6 +1909,9 @@ TError TServiceBase::DoCheckRequestCompatibility(const NRpc::NProto::TRequestHea
if (auto error = DoCheckRequestFeatures(header); !error.IsOK()) {
return error;
}
+ if (auto error = DoCheckRequestCodecs(header); !error.IsOK()) {
+ return error;
+ }
return {};
}
@@ -1971,6 +1955,29 @@ TError TServiceBase::DoCheckRequestFeatures(const NRpc::NProto::TRequestHeader&
return {};
}
+TError TServiceBase::DoCheckRequestCodecs(const NRpc::NProto::TRequestHeader& header)
+{
+ if (header.has_request_codec()) {
+ NCompression::ECodec requestCodec;
+ if (!TryEnumCast(header.request_codec(), &requestCodec)) {
+ return TError(
+ NRpc::EErrorCode::ProtocolError,
+ "Request codec %v is not supported",
+ header.request_codec());
+ }
+ }
+ if (header.has_response_codec()) {
+ NCompression::ECodec responseCodec;
+ if (!TryEnumCast(header.response_codec(), &responseCodec)) {
+ return TError(
+ NRpc::EErrorCode::ProtocolError,
+ "Response codec %v is not supported",
+ header.response_codec());
+ }
+ }
+ return {};
+}
+
void TServiceBase::OnRequestTimeout(TRequestId requestId, ERequestProcessingStage stage, bool /*aborted*/)
{
auto context = FindRequest(requestId);
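
Editor's note: DoCheckRequestCodecs above moves codec validation into the generic compatibility check, so an unsupported codec id is rejected with a protocol error before the handler runs. A tiny sketch of the check, with a made-up set of supported ids standing in for TryEnumCast:

    SUPPORTED_CODEC_IDS = {0, 1, 2}   # hypothetical ids for the sketch

    def check_request_codecs(header: dict) -> None:
        # Both fields are optional; only present-but-unknown ids are an error.
        for field in ("request_codec", "response_codec"):
            if field in header and header[field] not in SUPPORTED_CODEC_IDS:
                raise ValueError(f"{field} {header[field]} is not supported")

    check_request_codecs({"request_codec": 1, "response_codec": 0})  # passes silently
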
diff --git a/yt/yt/core/rpc/service_detail.h b/yt/yt/core/rpc/service_detail.h
index 2ce0ca71b2..822b799d50 100644
--- a/yt/yt/core/rpc/service_detail.h
+++ b/yt/yt/core/rpc/service_detail.h
@@ -214,7 +214,7 @@ public:
}
}
- // COMPAT(kiselyovp): legacy RPC codecs
+ // COMPAT(danilalexeev): legacy RPC codecs
std::optional<NCompression::ECodec> bodyCodecId;
NCompression::ECodec attachmentCodecId;
if (requestHeader.has_request_codec()) {
@@ -325,18 +325,8 @@ protected:
const auto& underlyingContext = this->GetUnderlyingContext();
const auto& requestHeader = underlyingContext->GetRequestHeader();
- // COMPAT(kiselyovp): legacy RPC codecs
- NCompression::ECodec attachmentCodecId;
- auto bodyCodecId = underlyingContext->GetResponseCodec();
- TSharedRef serializedBody;
- if (requestHeader.has_response_codec()) {
- serializedBody = SerializeProtoToRefWithCompression(*Response_, bodyCodecId, false);
- attachmentCodecId = bodyCodecId;
- underlyingContext->SetResponseBodySerializedWithCompression();
- } else {
- serializedBody = SerializeProtoToRefWithEnvelope(*Response_, bodyCodecId);
- attachmentCodecId = NCompression::ECodec::None;
- }
+ auto codecId = underlyingContext->GetResponseCodec();
+ auto serializedBody = SerializeProtoToRefWithCompression(*Response_, codecId);
if (requestHeader.has_response_format()) {
int intFormat = requestHeader.response_format();
@@ -362,7 +352,7 @@ protected:
}
}
- auto responseAttachments = CompressAttachments(Response_->Attachments(), attachmentCodecId);
+ auto responseAttachments = CompressAttachments(Response_->Attachments(), codecId);
return TSerializedResponse{
.Body = std::move(serializedBody),
@@ -948,6 +938,7 @@ private:
TError DoCheckRequestCompatibility(const NRpc::NProto::TRequestHeader& header);
TError DoCheckRequestProtocol(const NRpc::NProto::TRequestHeader& header);
TError DoCheckRequestFeatures(const NRpc::NProto::TRequestHeader& header);
+ TError DoCheckRequestCodecs(const NRpc::NProto::TRequestHeader& header);
void OnRequestTimeout(TRequestId requestId, ERequestProcessingStage stage, bool aborted);
void OnReplyBusTerminated(const NYT::NBus::IBusPtr& bus, const TError& error);
diff --git a/yt/yt/core/rpc/unittests/rpc_ut.cpp b/yt/yt/core/rpc/unittests/rpc_ut.cpp
index 3924bd6c20..226cd60f3f 100644
--- a/yt/yt/core/rpc/unittests/rpc_ut.cpp
+++ b/yt/yt/core/rpc/unittests/rpc_ut.cpp
@@ -152,7 +152,6 @@ TYPED_TEST(TNotGrpcTest, StreamingEcho)
TTestProxy proxy(this->CreateChannel());
proxy.SetDefaultRequestCodec(NCompression::ECodec::Lz4);
proxy.SetDefaultResponseCodec(NCompression::ECodec::Zstd_1);
- proxy.SetDefaultEnableLegacyRpcCodecs(false);
const int AttachmentCount = 30;
const ssize_t AttachmentSize = 2_MB;
@@ -540,7 +539,6 @@ TYPED_TEST(TNotGrpcTest, Compression)
TTestProxy proxy(this->CreateChannel());
proxy.SetDefaultRequestCodec(requestCodecId);
proxy.SetDefaultResponseCodec(responseCodecId);
- proxy.SetDefaultEnableLegacyRpcCodecs(false);
auto req = proxy.Compression();
req->set_request_codec(static_cast<int>(requestCodecId));
diff --git a/yt/yt/core/ytree/ypath_client.cpp b/yt/yt/core/ytree/ypath_client.cpp
index cdfa0f63c3..2a40321189 100644
--- a/yt/yt/core/ytree/ypath_client.cpp
+++ b/yt/yt/core/ytree/ypath_client.cpp
@@ -51,6 +51,8 @@ TYPathRequest::TYPathRequest(
{
ToProto(Header_.mutable_service(), std::move(service));
ToProto(Header_.mutable_method(), std::move(method));
+ Header_.set_request_codec(ToProto<int>(NCompression::ECodec::None));
+ Header_.set_response_codec(ToProto<int>(NCompression::ECodec::None));
auto* ypathExt = Header_.MutableExtension(NProto::TYPathHeaderExt::ypath_header_ext);
ypathExt->set_mutating(mutating);
@@ -186,11 +188,6 @@ NConcurrency::IAsyncZeroCopyInputStreamPtr TYPathRequest::GetResponseAttachments
YT_ABORT();
}
-bool TYPathRequest::IsLegacyRpcCodecsEnabled()
-{
- YT_ABORT();
-}
-
TSharedRefArray TYPathRequest::Serialize()
{
auto bodyData = SerializeBody();
diff --git a/yt/yt/core/ytree/ypath_client.h b/yt/yt/core/ytree/ypath_client.h
index 3dfdaed64e..e5d4138568 100644
--- a/yt/yt/core/ytree/ypath_client.h
+++ b/yt/yt/core/ytree/ypath_client.h
@@ -67,8 +67,6 @@ public:
NConcurrency::IAsyncZeroCopyOutputStreamPtr GetRequestAttachmentsStream() const override;
NConcurrency::IAsyncZeroCopyInputStreamPtr GetResponseAttachmentsStream() const override;
- bool IsLegacyRpcCodecsEnabled() override;
-
TSharedRefArray Serialize() override;
protected:
diff --git a/yt/yt/library/tracing/jaeger/tracer.cpp b/yt/yt/library/tracing/jaeger/tracer.cpp
index 43817c119f..50541c7a18 100644
--- a/yt/yt/library/tracing/jaeger/tracer.cpp
+++ b/yt/yt/library/tracing/jaeger/tracer.cpp
@@ -324,7 +324,6 @@ bool TJaegerChannelManager::Push(const std::vector<TSharedRef>& batches, int spa
proxy.SetDefaultTimeout(RpcTimeout_);
auto req = proxy.PostSpans();
- req->SetEnableLegacyRpcCodecs(false);
req->set_batch(MergeRefsToString(batches));
if (TvmService_) {
diff --git a/yt/yt_proto/yt/client/cache/proto/config.proto b/yt/yt_proto/yt/client/cache/proto/config.proto
index 98699e87bb..6350308d37 100644
--- a/yt/yt_proto/yt/client/cache/proto/config.proto
+++ b/yt/yt_proto/yt/client/cache/proto/config.proto
@@ -29,8 +29,6 @@ message TConfig
optional ECompressionCodec RequestCodec = 17 [default = None];
optional ECompressionCodec ResponseCodec = 12 [default = None];
- // Should set EnableLegacyRpcCodecs=False, to enable RequestCodec & ResponseCodec: https://nda.ya.ru/t/iXCfYZjS6yNEwg
- optional bool EnableLegacyRpcCodecs = 20;
optional bool EnableRetries = 13;
optional uint32 RetryBackoffTime = 14;
diff --git a/yt/yt_proto/yt/core/rpc/proto/rpc.proto b/yt/yt_proto/yt/core/rpc/proto/rpc.proto
index 043b2953e2..c9d0ea291f 100644
--- a/yt/yt_proto/yt/core/rpc/proto/rpc.proto
+++ b/yt/yt_proto/yt/core/rpc/proto/rpc.proto
@@ -144,7 +144,6 @@ message TResponseHeader
optional int32 format = 3; // EMessageFormat
- // COMPAT(kiseloyvp): this is missing when legacy mode is used
optional int32 codec = 6; // ECodec
reserved 5;