author     robot-piglet <[email protected]>  2024-06-11 21:56:53 +0300
committer  robot-piglet <[email protected]>  2024-06-12 11:36:46 +0300
commit     f34ee6ebd8f8178f084e7003b7ee7694231ffe0a (patch)
tree       92dc4e972d754bde559cc5f2f01c6da76beab74a /contrib/python/pythran
parent     3ef7f2079326399e4eb328be651e9fff8e4734aa (diff)
Intermediate changes
Diffstat (limited to 'contrib/python/pythran')
-rw-r--r--  contrib/python/pythran/.dist-info/METADATA | 3
-rw-r--r--  contrib/python/pythran/pythran/__init__.py | 2
-rw-r--r--  contrib/python/pythran/pythran/backend.py | 8
-rw-r--r--  contrib/python/pythran/pythran/config.py | 27
-rw-r--r--  contrib/python/pythran/pythran/conversion.py | 2
-rw-r--r--  contrib/python/pythran/pythran/cxxgen.py | 61
-rw-r--r--  contrib/python/pythran/pythran/dist.py | 38
-rw-r--r--  contrib/python/pythran/pythran/errors.py | 50
-rw-r--r--  contrib/python/pythran/pythran/intrinsic.py | 1
-rw-r--r--  contrib/python/pythran/pythran/optimizations/constant_folding.py | 2
-rw-r--r--  contrib/python/pythran/pythran/pythonic/include/types/dict.hpp | 6
-rw-r--r--  contrib/python/pythran/pythran/pythonic/include/types/dynamic_tuple.hpp | 3
-rw-r--r--  contrib/python/pythran/pythran/pythonic/include/types/list.hpp | 43
-rw-r--r--  contrib/python/pythran/pythran/pythonic/include/types/ndarray.hpp | 4
-rw-r--r--  contrib/python/pythran/pythran/pythonic/include/types/set.hpp | 27
-rw-r--r--  contrib/python/pythran/pythran/pythonic/include/utils/allocate.hpp | 91
-rw-r--r--  contrib/python/pythran/pythran/pythonic/include/utils/shared_ref.hpp | 5
-rw-r--r--  contrib/python/pythran/pythran/pythonic/numpy/median.hpp | 39
-rw-r--r--  contrib/python/pythran/pythran/pythonic/numpy/ndarray/sort.hpp | 23
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/bool.hpp | 8
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/dict.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/list.hpp | 5
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/ndarray.hpp | 101
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/numpy_expr.hpp | 5
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/numpy_gexpr.hpp | 5
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/numpy_iexpr.hpp | 52
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/numpy_texpr.hpp | 5
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/numpy_vexpr.hpp | 26
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/raw_array.hpp | 16
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/set.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/pythonic/types/vectorizable_type.hpp | 2
-rw-r--r--  contrib/python/pythran/pythran/pythonic/utils/allocate.hpp | 16
-rw-r--r--  contrib/python/pythran/pythran/pythonic/utils/shared_ref.hpp | 8
-rw-r--r--  contrib/python/pythran/pythran/pythran.cfg | 3
-rw-r--r--  contrib/python/pythran/pythran/run.py | 20
-rw-r--r--  contrib/python/pythran/pythran/spec.py | 24
-rw-r--r--  contrib/python/pythran/pythran/syntax.py | 40
-rw-r--r--  contrib/python/pythran/pythran/tables.py | 78
-rw-r--r--  contrib/python/pythran/pythran/toolchain.py | 62
-rw-r--r--  contrib/python/pythran/pythran/types/conversion.py | 5
-rw-r--r--  contrib/python/pythran/pythran/types/tog.py | 6
-rw-r--r--  contrib/python/pythran/pythran/version.py | 3
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_arithmetic.hpp | 50
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_complex.hpp | 22
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_details.hpp | 114
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_logical.hpp | 36
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_math.hpp | 360
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_memory.hpp | 127
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_rounding.hpp | 12
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_trigo.hpp | 118
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_avx.hpp | 422
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_avx2.hpp | 180
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_avx512bw.hpp | 110
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_avx512f.hpp | 518
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_constants.hpp | 50
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_emulated.hpp | 757
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_fma3_avx.hpp | 16
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_fma3_sse.hpp | 16
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_fma4.hpp | 16
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_generic_fwd.hpp | 16
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_i8mm_neon64.hpp | 17
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_isa.hpp | 8
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_neon.hpp | 836
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_neon64.hpp | 605
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_rvv.hpp | 214
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_scalar.hpp | 414
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_sse2.hpp | 476
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_sse3.hpp | 10
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_sse4_1.hpp | 54
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_sse4_2.hpp | 4
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_ssse3.hpp | 36
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_sve.hpp | 312
-rw-r--r--  contrib/python/pythran/pythran/xsimd/arch/xsimd_wasm.hpp | 358
-rw-r--r--  contrib/python/pythran/pythran/xsimd/config/xsimd_arch.hpp | 55
-rw-r--r--  contrib/python/pythran/pythran/xsimd/config/xsimd_config.hpp | 17
-rw-r--r--  contrib/python/pythran/pythran/xsimd/config/xsimd_cpuid.hpp | 154
-rw-r--r--  contrib/python/pythran/pythran/xsimd/config/xsimd_inline.hpp | 23
-rw-r--r--  contrib/python/pythran/pythran/xsimd/math/xsimd_rem_pio2.hpp | 4
-rw-r--r--  contrib/python/pythran/pythran/xsimd/memory/xsimd_aligned_allocator.hpp | 80
-rw-r--r--  contrib/python/pythran/pythran/xsimd/memory/xsimd_alignment.hpp | 2
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_all_registers.hpp | 6
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_api.hpp | 409
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx2_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx512bw_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx512cd_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx512dq_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx512er_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx512f_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx512ifma_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx512pf_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vbmi_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avx_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_avxvnni_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_batch.hpp | 494
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_batch_constant.hpp | 128
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_emulated_register.hpp | 80
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_avx2_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_avx_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_sse_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_fma4_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_generic_arch.hpp | 7
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_i8mm_neon64_register.hpp | 50
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_neon64_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_neon_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_register.hpp | 2
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_rvv_register.hpp | 31
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_sse2_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_sse3_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_sse4_1_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_sse4_2_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_ssse3_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_sve_register.hpp | 3
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_traits.hpp | 2
-rw-r--r--  contrib/python/pythran/pythran/xsimd/types/xsimd_wasm_register.hpp | 1
-rw-r--r--  contrib/python/pythran/pythran/xsimd/xsimd.hpp | 1
-rw-r--r--  contrib/python/pythran/ya.make | 2
118 files changed, 5280 insertions, 3506 deletions
diff --git a/contrib/python/pythran/.dist-info/METADATA b/contrib/python/pythran/.dist-info/METADATA
index 879b7153ba1..a02accf7144 100644
--- a/contrib/python/pythran/.dist-info/METADATA
+++ b/contrib/python/pythran/.dist-info/METADATA
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: pythran
-Version: 0.15.0
+Version: 0.16.1
Summary: Ahead of Time compiler for numeric kernels
Author-email: Serge Guelton <[email protected]>
License: Copyright (c) 2012, HPC Project and Serge Guelton
@@ -67,6 +67,7 @@ Requires-Dist: ipython ; extra == 'test'
Requires-Dist: nbval ; extra == 'test'
Requires-Dist: cython ; extra == 'test'
Requires-Dist: wheel ; extra == 'test'
+Requires-Dist: packaging ; extra == 'test'
Pythran
#######
diff --git a/contrib/python/pythran/pythran/__init__.py b/contrib/python/pythran/pythran/__init__.py
index b83c7513cba..4117131d475 100644
--- a/contrib/python/pythran/pythran/__init__.py
+++ b/contrib/python/pythran/pythran/__init__.py
@@ -9,7 +9,7 @@ This package provides several entry points
* compile_pythranfile: python (file) to so/cpp, returns output filename
* import_pythrancode: python (str) to module, returns loaded module
* import_pythranfile: python (file) to module, returns loaded module
- * test_compile: passthrough compile test, raises CompileError Exception.
+ * test_compile: passthrough compile test, raises PythranCompileError Exception.
Basic scenario is to turn a Python AST into C++ code:
>>> code = "def foo(x): return x * 2"
diff --git a/contrib/python/pythran/pythran/backend.py b/contrib/python/pythran/pythran/backend.py
index e159ff11f80..c5de8d6461a 100644
--- a/contrib/python/pythran/pythran/backend.py
+++ b/contrib/python/pythran/pythran/backend.py
@@ -14,7 +14,7 @@ from pythran.cxxgen import Statement, Block, AnnotatedStatement, Typedef, Label
from pythran.cxxgen import Value, FunctionDeclaration, EmptyStatement, Nop
from pythran.cxxgen import FunctionBody, Line, ReturnStatement, Struct, Assign
from pythran.cxxgen import For, While, TryExcept, ExceptHandler, If, AutoFor
-from pythran.cxxgen import StatementWithComments
+from pythran.cxxgen import StatementWithComments, InstrumentedStatement
from pythran.openmp import OMPDirective
from pythran.passmanager import Backend
from pythran.syntax import PythranSyntaxError
@@ -227,7 +227,11 @@ class CxxFunction(ast.NodeVisitor):
if isinstance(node, ast.FunctionDef):
head, tail = cxx_node
return head, [StatementWithComments(t, line) for t in tail]
- return StatementWithComments(cxx_node, line)
+ if cfg.get('backend', 'annotation_kind') == 'lineno':
+ return InstrumentedStatement(cxx_node,
+ 'pythran_trace_lineno({});'.format(node.lineno))
+ else:
+ return StatementWithComments(cxx_node, line)
def skip_line_info(self, node, cxx_node):
return cxx_node
diff --git a/contrib/python/pythran/pythran/config.py b/contrib/python/pythran/pythran/config.py
index 7c2d9a45d3c..c8d09640c7e 100644
--- a/contrib/python/pythran/pythran/config.py
+++ b/contrib/python/pythran/pythran/config.py
@@ -56,17 +56,22 @@ def get_paths_cfg(
"pythran-default.cfg")
user_config_path = os.environ.get('PYTHRANRC', None)
- if not user_config_path:
+ if user_config_path is None:
user_config_dir = os.environ.get('XDG_CONFIG_HOME', None)
if not user_config_dir:
user_config_dir = os.environ.get('HOME', None)
if not user_config_dir:
user_config_dir = '~'
- user_config_path = os.path.expanduser(
- os.path.join(user_config_dir, user_file))
- return {"sys": sys_config_path,
- "platform": platform_config_path,
- "user": user_config_path}
+ user_config_path = os.path.expanduser(os.path.join(user_config_dir,
+ user_file))
+
+ paths = {"sys": sys_config_path,
+ "platform": platform_config_path}
+
+ if user_config_path:
+ paths["user"] = user_config_path
+
+ return paths
def init_cfg(sys_file, platform_file, user_file, config_args=None):
@@ -74,7 +79,7 @@ def init_cfg(sys_file, platform_file, user_file, config_args=None):
sys_config_path = paths["sys"]
platform_config_path = paths["platform"]
- user_config_path = paths["user"]
+ user_config_path = paths.get("user")
cfgp = ConfigParser()
for required in (sys_config_path, platform_config_path):
@@ -82,7 +87,9 @@ def init_cfg(sys_file, platform_file, user_file, config_args=None):
import pkgutil
for required in (sys_config_path, platform_config_path):
cfgp.read_string(pkgutil.get_data(__package__, os.path.basename(required)).decode("utf-8"))
- cfgp.read([user_config_path])
+
+ if user_config_path:
+ cfgp.read([user_config_path])
if config_args is not None:
update_cfg(cfgp, config_args)
@@ -156,10 +163,8 @@ def make_extension(python, **extra):
cfg = init_cfg('pythran.cfg',
'pythran-{}.cfg'.format(sys.platform),
'.pythranrc',
- extra.get('config', None))
+ extra.pop('config', None))
- if 'config' in extra:
- extra.pop('config')
def parse_define(define):
index = define.find('=')
diff --git a/contrib/python/pythran/pythran/conversion.py b/contrib/python/pythran/pythran/conversion.py
index 836531db832..44049297a9e 100644
--- a/contrib/python/pythran/pythran/conversion.py
+++ b/contrib/python/pythran/pythran/conversion.py
@@ -137,7 +137,7 @@ def to_ast(value):
if any(value is t for t in (bool, int, float)):
return builtin_folding(value)
elif isinstance(value, np.generic):
- return to_ast(value.item())
+ raise ToNotEval()
elif isinstance(value, (numbers.Number, str, bool, type(None))):
iinfo = np.iinfo(int)
if isinstance(value, int) and not (iinfo.min <= value <= iinfo.max):
diff --git a/contrib/python/pythran/pythran/cxxgen.py b/contrib/python/pythran/pythran/cxxgen.py
index 87094270257..504a9636cfe 100644
--- a/contrib/python/pythran/pythran/cxxgen.py
+++ b/contrib/python/pythran/pythran/cxxgen.py
@@ -355,6 +355,17 @@ class StatementWithComments(object):
yield s
+class InstrumentedStatement(object):
+ def __init__(self, stmt,instrumentation):
+ self.stmt = stmt
+ self.instrumentation = instrumentation
+
+ def generate(self):
+ yield self.instrumentation
+ for s in self.stmt.generate():
+ yield s
+
+
class ReturnStatement(Statement):
def generate(self):
yield "return " + self.text + ";"
@@ -476,6 +487,7 @@ class PythonModule(object):
self.global_vars = []
self.implems = []
self.capsules = []
+ self.ufuncs = {}
self.python_implems = []
self.wrappers = []
self.docstrings = docstrings
@@ -503,6 +515,12 @@ class PythonModule(object):
self.capsules.append((ptrname, sig))
self.implems.append(func)
+ def add_ufunc(self, func, funcname, funcobject, functypes, signature):
+ self.ufuncs.setdefault(funcname, []).append(
+ (funcobject, functypes, signature)
+ )
+ self.implems.append(func)
+
def add_function(self, func, name, types, signature):
self.add_function_to(self.implems, func, name, types, signature)
@@ -630,6 +648,45 @@ class PythonModule(object):
sig=sig)
theextraobjects.append(capsule)
+ for fname, overloads in self.ufuncs.items():
+ fdoc = self.docstring(self.docstrings.get(fname, ''))
+ funcs = []
+ types = []
+ for wrapper_name, wrapper_types, overload in overloads:
+ funcs.append("pythonic::types::ufunc_wrapper<{}, {}>".format(
+ wrapper_name, ", ".join(wrapper_types)))
+ types.extend(overload)
+
+ ufunc = '''
+ {{
+ static PyUFuncGenericFunction funcs [] = {{{funcs}}};
+ static char types[] = {{{types}}};
+ PyModule_AddObject(
+ theModule,
+ "{name}",
+ PyUFunc_FromFuncAndData(
+ funcs,
+ NULL,
+ types,
+ {noverloads}, {ninputs}, {noutputs},
+ PyUFunc_None,
+ "{name}",
+ {doc},
+ 0
+ )
+ );
+ }}
+ '''.format(name=fname,
+ funcs=", ".join(['reinterpret_cast<PyUFuncGenericFunction>(&{})'.format(f) for f in
+ funcs]),
+ types=", ".join(types),
+ noverloads=len(overloads),
+ ninputs=len(types) // len(funcs) - 1,
+ noutputs=1,
+ doc=fdoc
+ )
+ theextraobjects.append(ufunc)
+
methods = dedent('''
static PyMethodDef Methods[] = {{
{methods}
@@ -667,7 +724,8 @@ class PythonModule(object):
;
PyMODINIT_FUNC
PYTHRAN_MODULE_INIT({name})(void) {{
- import_array()
+ import_array();
+ {import_umath}
#if PY_MAJOR_VERSION >= 3
PyObject* theModule = PyModule_Create(&moduledef);
#else
@@ -691,6 +749,7 @@ class PythonModule(object):
PYTHRAN_RETURN;
}}
'''.format(name=self.name,
+ import_umath="import_umath();" if self.ufuncs else "",
extraobjects='\n'.join(theextraobjects),
**self.metadata))
diff --git a/contrib/python/pythran/pythran/dist.py b/contrib/python/pythran/pythran/dist.py
index fc975a9bf95..b254dee7b97 100644
--- a/contrib/python/pythran/pythran/dist.py
+++ b/contrib/python/pythran/pythran/dist.py
@@ -4,6 +4,8 @@ This modules contains a distutils extension mechanism for Pythran
'''
import pythran.config as cfg
+from pythran.tables import blas_requires
+from pythran.utils import cxxid
from collections import defaultdict
try:
@@ -12,6 +14,7 @@ except ImportError:
from collections import Iterable
import os.path
import os
+import re
try:
from distutils.command.build_ext import build_ext as LegacyBuildExt
@@ -140,24 +143,51 @@ class PythranBuildExt(PythranBuildExtMixIn, LegacyBuildExt, metaclass=PythranBui
pass
+blas_requirements = {"/".join(map(cxxid, elem)) for elem in blas_requires}
+
+includes_matcher = re.compile(r'^#include <pythonic/include/(.*)\.hpp>$',
+ re.MULTILINE)
+
+def requires_blas(source):
+ if not os.path.exists(source) or os.path.splitext(source)[1] != ".cpp":
+ return False # conservative
+
+ with open(source) as fd:
+ content = fd.read()
+
+ return not blas_requirements.isdisjoint(includes_matcher.findall(content))
+
+
class PythranExtension(Extension):
'''
Description of a Pythran extension
- Similar to distutils.core.Extension except that the sources are .py files
+ Similar to setuptools.extension.Extension except that the sources are .py files
They must be processable by pythran, of course.
The compilation process ends up in a native Python module.
'''
def __init__(self, name, sources, *args, **kwargs):
+ self._kwargs = kwargs.copy()
+
+ if all(not requires_blas(source) for source in sources):
+ # Inserting at head so that user-specified config in CLI takes
+ # precedence.
+ kwargs['config'] = ['compiler.blas=none'] + kwargs.get('config', [])
+
cfg_ext = cfg.make_extension(python=True, **kwargs)
self.cxx = cfg_ext.pop('cxx', None)
self.cc = cfg_ext.pop('cc', None)
- self._sources = sources
Extension.__init__(self, name, sources, *args, **cfg_ext)
self.__dict__.pop("sources", None)
+ def _update_blas_requirements(self, source):
+ if requires_blas(source):
+ cfg_ext = cfg.make_extension(python=True, **self._kwargs)
+ for k, v in cfg_ext.items():
+ setattr(self, k, v)
+
@property
def sources(self):
import pythran.toolchain as tc
@@ -178,9 +208,13 @@ class PythranExtension(Extension):
module_name = self.name
tc.compile_pythranfile(source, output_file,
module_name, cpponly=True)
+
+ self._update_blas_requirements(output_file)
cxx_sources.append(output_file)
return cxx_sources
@sources.setter
def sources(self, sources):
self._sources = sources
+ for source in sources:
+ self._update_blas_requirements(source)
diff --git a/contrib/python/pythran/pythran/errors.py b/contrib/python/pythran/pythran/errors.py
index ada69f30a1a..02e978da4e7 100644
--- a/contrib/python/pythran/pythran/errors.py
+++ b/contrib/python/pythran/pythran/errors.py
@@ -4,3 +4,53 @@
class PythranInternalError(Exception):
""" Exception raise on Incorrect internal behavior in Pythran. """
+
+
+class PythranCompileError(Exception):
+
+ """ Exception raise on when Pythran fails the compile to binary step. """
+
+
+class PythranSyntaxError(SyntaxError):
+ def __init__(self, msg, node=None):
+ SyntaxError.__init__(self, msg)
+ if node:
+ self.filename = getattr(node, 'filename', None)
+ self.lineno = node.lineno
+ self.offset = node.col_offset
+
+ def __str__(self):
+ loc_info = self.lineno is not None and self.offset is not None
+
+ if self.filename and loc_info:
+ with open(self.filename) as f:
+ for i in range(self.lineno - 1):
+ f.readline() # and drop it
+ extra = '{}\n{}'.format(f.readline().rstrip(),
+ " " * (self.offset) + "^~~~ (o_0)")
+ else:
+ extra = None
+
+ if loc_info:
+ format_header = "{}:{}:{}"
+ format_args = self.lineno, self.offset, self.args[0],
+ else:
+ format_header = "{}:"
+ format_args = self.args[0],
+
+ r = (format_header + " error: {}").format(
+ self.filename or "<unknown>",
+ *format_args)
+
+ if extra is not None:
+ r += "\n----\n"
+ r += extra
+ r += "\n----\n"
+
+ return r
+
+
+class PythranTypeError(PythranSyntaxError):
+ "A new type to distinguish general syntax errors from typing issues"
+
+
diff --git a/contrib/python/pythran/pythran/intrinsic.py b/contrib/python/pythran/pythran/intrinsic.py
index 67d12377a16..c214bb419af 100644
--- a/contrib/python/pythran/pythran/intrinsic.py
+++ b/contrib/python/pythran/pythran/intrinsic.py
@@ -67,6 +67,7 @@ class Intrinsic(object):
lambda call: UNKNOWN_RANGE)
self.return_range_content = kwargs.get("return_range_content",
lambda c: UNKNOWN_RANGE)
+ self.requires_blas = kwargs.get("requires_blas", False)
def isliteral(self):
return False
diff --git a/contrib/python/pythran/pythran/optimizations/constant_folding.py b/contrib/python/pythran/pythran/optimizations/constant_folding.py
index 4de944c2315..726a217fe95 100644
--- a/contrib/python/pythran/pythran/optimizations/constant_folding.py
+++ b/contrib/python/pythran/pythran/optimizations/constant_folding.py
@@ -135,7 +135,7 @@ class ConstEval(ast.NodeVisitor):
else:
raise ValueError("invalid binary op")
elif isinstance(node.target, ast.Subscript):
- subscript = self.visit(node.target.subscript)
+ subscript = self.visit(node.target.slice)
if ty is ast.Add:
self.visit(node.target.value)[subscript] += value
elif ty is ast.Sub:
diff --git a/contrib/python/pythran/pythran/pythonic/include/types/dict.hpp b/contrib/python/pythran/pythran/pythonic/include/types/dict.hpp
index 6e30ce5547a..5f6b826bd94 100644
--- a/contrib/python/pythran/pythran/pythonic/include/types/dict.hpp
+++ b/contrib/python/pythran/pythran/pythonic/include/types/dict.hpp
@@ -5,6 +5,7 @@
#include "pythonic/include/types/empty_iterator.hpp"
#include "pythonic/include/types/tuple.hpp"
+#include "pythonic/include/utils/allocate.hpp"
#include "pythonic/include/utils/iterator.hpp"
#include "pythonic/include/utils/reserve.hpp"
#include "pythonic/include/utils/shared_ref.hpp"
@@ -14,7 +15,6 @@
#include <algorithm>
#include <iterator>
#include <limits>
-#include <memory>
#include <unordered_map>
#include <utility>
@@ -104,7 +104,9 @@ namespace types
typename std::remove_cv<typename std::remove_reference<K>::type>::type;
using _value_type =
typename std::remove_cv<typename std::remove_reference<V>::type>::type;
- using container_type = std::unordered_map<_key_type, _value_type>;
+ using container_type = std::unordered_map<
+ _key_type, _value_type, std::hash<_key_type>, std::equal_to<_key_type>,
+ utils::allocator<std::pair<const _key_type, _value_type>>>;
utils::shared_ref<container_type> data;
template <class Kp, class Vp>
diff --git a/contrib/python/pythran/pythran/pythonic/include/types/dynamic_tuple.hpp b/contrib/python/pythran/pythran/pythonic/include/types/dynamic_tuple.hpp
index 99e47762147..389edbfc1a7 100644
--- a/contrib/python/pythran/pythran/pythonic/include/types/dynamic_tuple.hpp
+++ b/contrib/python/pythran/pythran/pythonic/include/types/dynamic_tuple.hpp
@@ -5,6 +5,7 @@
#include "pythonic/include/types/nditerator.hpp"
#include "pythonic/include/types/traits.hpp"
#include "pythonic/include/types/tuple.hpp"
+#include "pythonic/include/utils/allocate.hpp"
#include "pythonic/include/utils/int_.hpp"
#include "pythonic/include/utils/nested_container.hpp"
#include "pythonic/include/utils/seq.hpp"
@@ -19,7 +20,7 @@ namespace types
template <typename T>
struct dynamic_tuple {
- using container_type = std::vector<T>;
+ using container_type = std::vector<T, utils::allocator<T>>;
utils::shared_ref<container_type> data;
using value_type = T;
diff --git a/contrib/python/pythran/pythran/pythonic/include/types/list.hpp b/contrib/python/pythran/pythran/pythonic/include/types/list.hpp
index 7c3cd20e5c4..15a1db30866 100644
--- a/contrib/python/pythran/pythran/pythonic/include/types/list.hpp
+++ b/contrib/python/pythran/pythran/pythonic/include/types/list.hpp
@@ -4,26 +4,27 @@
#include "pythonic/include/types/assignable.hpp"
#include "pythonic/include/types/empty_iterator.hpp"
#include "pythonic/include/types/nditerator.hpp"
-#include "pythonic/include/utils/shared_ref.hpp"
-#include "pythonic/include/utils/nested_container.hpp"
-#include "pythonic/include/utils/int_.hpp"
-#include "pythonic/include/utils/reserve.hpp"
-#include "pythonic/include/types/tuple.hpp"
#include "pythonic/include/types/slice.hpp"
+#include "pythonic/include/types/tuple.hpp"
#include "pythonic/include/types/vectorizable_type.hpp"
+#include "pythonic/include/utils/allocate.hpp"
+#include "pythonic/include/utils/int_.hpp"
+#include "pythonic/include/utils/nested_container.hpp"
+#include "pythonic/include/utils/reserve.hpp"
+#include "pythonic/include/utils/shared_ref.hpp"
-#include <ostream>
-#include <vector>
-#include <utility>
#include <algorithm>
#include <iterator>
+#include <ostream>
+#include <utility>
+#include <vector>
PYTHONIC_NS_BEGIN
namespace types
{
template <class T>
- using container = std::vector<T>;
+ using container = std::vector<T, utils::allocator<T>>;
static const size_t DEFAULT_LIST_CAPACITY = 16;
@@ -177,6 +178,11 @@ namespace types
bool contains(V const &v) const;
intptr_t id() const;
+ intptr_t baseid() const
+ {
+ return reinterpret_cast<intptr_t>(&(*_data));
+ }
+
long count(T const &x) const;
template <class Tp, class Sp>
friend std::ostream &operator<<(std::ostream &os,
@@ -239,8 +245,7 @@ namespace types
template <class Tp, class S>
list(sliced_list<Tp, S> const &other);
template <class Tp, size_t N>
- list(static_list<Tp, N> const &other)
- : list(other.begin(), other.end())
+ list(static_list<Tp, N> const &other) : list(other.begin(), other.end())
{
}
template <class Tp, size_t N, class... S>
@@ -327,8 +332,14 @@ namespace types
return fast(index);
}
- dtype* data() { return _data->data();}
- const dtype* data() const { return _data->data();}
+ dtype *data()
+ {
+ return _data->data();
+ }
+ const dtype *data() const
+ {
+ return _data->data();
+ }
// modifiers
template <class Tp>
@@ -491,7 +502,7 @@ namespace types
list<T> res(self.begin(), self.end());
return res += other;
}
-}
+} // namespace types
namespace utils
{
@@ -504,7 +515,7 @@ namespace utils
template <class T, class From>
void reserve(types::list<T> &l, From const &f,
typename From::const_iterator *p = nullptr);
-}
+} // namespace utils
template <class T>
struct assignable<types::list<T>> {
@@ -557,7 +568,7 @@ namespace std
struct tuple_element<I, pythonic::types::sliced_list<T, S>> {
typedef typename pythonic::types::sliced_list<T, S>::value_type type;
};
-}
+} // namespace std
/* type inference stuff {*/
#include "pythonic/include/types/combined.hpp"
diff --git a/contrib/python/pythran/pythran/pythonic/include/types/ndarray.hpp b/contrib/python/pythran/pythran/pythonic/include/types/ndarray.hpp
index 217444e15bf..e28d665e1fd 100644
--- a/contrib/python/pythran/pythran/pythonic/include/types/ndarray.hpp
+++ b/contrib/python/pythran/pythran/pythonic/include/types/ndarray.hpp
@@ -825,8 +825,8 @@ namespace builtins
template <>
struct _build_gexpr<1> {
template <class E, class... S>
- types::numpy_gexpr<E, types::normalize_t<S>...>
- operator()(E const &a, S const &...slices);
+ auto
+ operator()(E const &a, S const &...slices) -> decltype(E(a)(slices...));
};
template <class E>
diff --git a/contrib/python/pythran/pythran/pythonic/include/types/set.hpp b/contrib/python/pythran/pythran/pythonic/include/types/set.hpp
index 5a48ed3845b..a15c901f0f1 100644
--- a/contrib/python/pythran/pythran/pythonic/include/types/set.hpp
+++ b/contrib/python/pythran/pythran/pythonic/include/types/set.hpp
@@ -5,18 +5,18 @@
#include "pythonic/include/types/empty_iterator.hpp"
#include "pythonic/include/types/list.hpp"
+#include "pythonic/include/utils/allocate.hpp"
#include "pythonic/include/utils/iterator.hpp"
#include "pythonic/include/utils/reserve.hpp"
#include "pythonic/include/utils/shared_ref.hpp"
#include "pythonic/include/builtins/in.hpp"
-#include <set>
-#include <memory>
-#include <utility>
-#include <limits>
#include <algorithm>
#include <iterator>
+#include <limits>
+#include <set>
+#include <utility>
PYTHONIC_NS_BEGIN
namespace types
@@ -26,7 +26,7 @@ namespace types
template <class T>
class set;
-}
+} // namespace types
PYTHONIC_NS_END
/* type inference stuff {*/
@@ -127,7 +127,8 @@ namespace types
// data holder
using _type =
typename std::remove_cv<typename std::remove_reference<T>::type>::type;
- using container_type = std::set<_type>;
+ using container_type =
+ std::set<_type, std::less<_type>, utils::allocator<_type>>;
utils::shared_ref<container_type> data;
public:
@@ -206,30 +207,30 @@ namespace types
template <typename U, typename... Types>
typename __combined<set<T>, U, Types...>::type
- union_(U &&other, Types &&... others) const;
+ union_(U &&other, Types &&...others) const;
template <typename... Types>
- none_type update(Types &&... others);
+ none_type update(Types &&...others);
set<T> intersection() const;
template <typename U, typename... Types>
typename __combined<set<T>, U, Types...>::type
- intersection(U const &other, Types const &... others) const;
+ intersection(U const &other, Types const &...others) const;
template <typename... Types>
- void intersection_update(Types const &... others);
+ void intersection_update(Types const &...others);
set<T> difference() const;
template <typename U, typename... Types>
- set<T> difference(U const &other, Types const &... others) const;
+ set<T> difference(U const &other, Types const &...others) const;
template <class V>
bool contains(V const &v) const;
template <typename... Types>
- void difference_update(Types const &... others);
+ void difference_update(Types const &...others);
template <typename U>
set<typename __combined<T, U>::type>
@@ -319,7 +320,7 @@ namespace types
return 0;
}
};
-}
+} // namespace types
template <class T>
struct assignable<types::set<T>> {
diff --git a/contrib/python/pythran/pythran/pythonic/include/utils/allocate.hpp b/contrib/python/pythran/pythran/pythonic/include/utils/allocate.hpp
new file mode 100644
index 00000000000..3af878d4685
--- /dev/null
+++ b/contrib/python/pythran/pythran/pythonic/include/utils/allocate.hpp
@@ -0,0 +1,91 @@
+#ifndef PYTHONIC_INCLUDE_UTILS_ALLOCATE_HPP
+#define PYTHONIC_INCLUDE_UTILS_ALLOCATE_HPP
+
+#include <cstdlib>
+#ifdef PYTHRAN_TRACE_ALLOCATION
+#include <cstdio>
+#endif
+
+PYTHONIC_NS_BEGIN
+
+namespace utils
+{
+
+#ifdef PYTHRAN_TRACE_ALLOCATION
+ extern size_t pythran_allocation_site;
+#define pythran_trace_lineno(n) pythonic::utils::pythran_allocation_site = n;
+#define pythran_trace_allocation(n) \
+ do { \
+ fprintf(stderr, ":%d: Allocating %d bytes\n", \
+ pythonic::utils::pythran_allocation_site, n); \
+ } while (0)
+#else
+#define pythran_trace_lineno(s)
+#define pythran_trace_allocation(n)
+#endif
+
+ template <class T>
+ inline T *allocate(size_t nmemb)
+ {
+ pythran_trace_allocation(sizeof(T) * nmemb);
+ return (T *)malloc(sizeof(T) * nmemb);
+ }
+
+ template <class T>
+ inline T *callocate(size_t nmemb)
+ {
+ pythran_trace_allocation(sizeof(T) * nmemb);
+ return (T *)calloc(nmemb, sizeof(T));
+ }
+
+ template <class T>
+ inline T *reallocate(T *prev, size_t nmemb)
+ {
+ pythran_trace_allocation(sizeof(T) * nmemb);
+ return (T *)realloc(prev, sizeof(T) * nmemb);
+ }
+
+ template <class T>
+ inline void deallocate(T *mem)
+ {
+ free(mem);
+ }
+
+ template <class T>
+ struct allocator {
+ typedef T value_type;
+
+ allocator() = default;
+
+ template <class U>
+ constexpr allocator(const allocator<U> &) noexcept
+ {
+ }
+
+ [[nodiscard]] T *allocate(std::size_t n)
+ {
+ return ::pythonic::utils::allocate<T>(n);
+ }
+
+ void deallocate(T *p, std::size_t) noexcept
+ {
+ ::pythonic::utils::deallocate(p);
+ }
+
+ template <class U>
+ constexpr bool operator==(const allocator<U> &) const
+ {
+ return true;
+ }
+
+ template <class U>
+ constexpr bool operator!=(const allocator<U> &) const
+ {
+ return false;
+ }
+ }; // namespace utils
+
+} // namespace utils
+PYTHONIC_NS_END
+
+#endif
diff --git a/contrib/python/pythran/pythran/pythonic/include/utils/shared_ref.hpp b/contrib/python/pythran/pythran/pythonic/include/utils/shared_ref.hpp
index dbb745eff38..d7ce48b0c81 100644
--- a/contrib/python/pythran/pythran/pythonic/include/utils/shared_ref.hpp
+++ b/contrib/python/pythran/pythran/pythonic/include/utils/shared_ref.hpp
@@ -1,7 +1,6 @@
#ifndef PYTHONIC_INCLUDE_UTILS_SHARED_REF_HPP
#define PYTHONIC_INCLUDE_UTILS_SHARED_REF_HPP
-#include <memory>
#include <unordered_map>
#include <utility>
#ifdef _OPENMP
@@ -49,7 +48,7 @@ namespace utils
extern_type foreign;
template <class... Types>
memory(Types &&...args);
- } * mem;
+ } *mem;
template <class Tp>
friend class shared_ref;
@@ -61,7 +60,7 @@ namespace utils
// Uninitialized ctor (rvalue ref)
shared_ref(no_memory &&) noexcept;
- // Ctor allocate T && forward all arguments to T ctor
+ // Ctor allocate T and forward all arguments to T ctor
template <class... Types>
shared_ref(Types &&...args);
diff --git a/contrib/python/pythran/pythran/pythonic/numpy/median.hpp b/contrib/python/pythran/pythran/pythonic/numpy/median.hpp
index cebe3b01102..f25ac303a17 100644
--- a/contrib/python/pythran/pythran/pythonic/numpy/median.hpp
+++ b/contrib/python/pythran/pythran/pythonic/numpy/median.hpp
@@ -3,12 +3,12 @@
#include "pythonic/include/numpy/median.hpp"
-#include "pythonic/utils/functor.hpp"
-#include "pythonic/types/ndarray.hpp"
#include "pythonic/numpy/asarray.hpp"
#include "pythonic/numpy/sort.hpp"
+#include "pythonic/types/ndarray.hpp"
+#include "pythonic/utils/allocate.hpp"
+#include "pythonic/utils/functor.hpp"
#include <algorithm>
-#include <memory>
PYTHONIC_NS_BEGIN
@@ -26,12 +26,12 @@ namespace numpy
std::accumulate(tmp_shape.begin() + axis, tmp_shape.end(), 1L,
std::multiplies<long>());
long const buffer_size = tmp_shape[axis];
- std::unique_ptr<T[]> buffer{new T[buffer_size]};
+ T *buffer = utils::allocate<T>(buffer_size);
const long stepper = step / tmp_shape[axis];
const long n = tmp.flat_size() / tmp_shape[axis] * step;
long ith = 0, nth = 0;
for (long i = 0; i < n; i += step) {
- T *buffer_iter = buffer.get();
+ T *buffer_iter = buffer;
T const *iter = tmp.buffer + ith;
T const *iend = iter + step;
while (iter != iend) {
@@ -39,16 +39,15 @@ namespace numpy
iter += stepper;
}
if (buffer_size % 2 == 1) {
- std::nth_element(buffer.get(), buffer.get() + buffer_size / 2,
- buffer_iter, ndarray::comparator<T>{});
+ std::nth_element(buffer, buffer + buffer_size / 2, buffer_iter,
+ ndarray::comparator<T>{});
*out++ = buffer[buffer_size / 2];
} else {
- std::nth_element(buffer.get(), buffer.get() + buffer_size / 2,
- buffer_iter, ndarray::comparator<T>{});
- auto t0 = buffer[buffer_size / 2];
- std::nth_element(buffer.get(), buffer.get() + buffer_size / 2 - 1,
- buffer.get() + buffer_size / 2,
+ std::nth_element(buffer, buffer + buffer_size / 2, buffer_iter,
ndarray::comparator<T>{});
+ auto t0 = buffer[buffer_size / 2];
+ std::nth_element(buffer, buffer + buffer_size / 2 - 1,
+ buffer + buffer_size / 2, ndarray::comparator<T>{});
auto t1 = buffer[buffer_size / 2 - 1];
*out++ = (t0 + t1) / double(2);
}
@@ -58,25 +57,27 @@ namespace numpy
ith = nth;
}
}
+ utils::deallocate(buffer);
}
- }
+ } // namespace
template <class T, class pS>
decltype(std::declval<T>() + 1.) median(types::ndarray<T, pS> const &arr,
types::none_type)
{
size_t n = arr.flat_size();
- std::unique_ptr<T[]> tmp{new T[n]};
- std::copy(arr.buffer, arr.buffer + n, tmp.get());
- std::nth_element(tmp.get(), tmp.get() + n / 2, tmp.get() + n,
- ndarray::comparator<T>{});
+ T *tmp = utils::allocate<T>(n);
+ std::copy(arr.buffer, arr.buffer + n, tmp);
+ std::nth_element(tmp, tmp + n / 2, tmp + n, ndarray::comparator<T>{});
T t0 = tmp[n / 2];
if (n % 2 == 1) {
+ utils::deallocate(tmp);
return t0;
} else {
- std::nth_element(tmp.get(), tmp.get() + n / 2 - 1, tmp.get() + n / 2,
+ std::nth_element(tmp, tmp + n / 2 - 1, tmp + n / 2,
ndarray::comparator<T>{});
T t1 = tmp[n / 2 - 1];
+ utils::deallocate(tmp);
return (t0 + t1) / 2.;
}
}
@@ -115,7 +116,7 @@ namespace numpy
}
NUMPY_EXPR_TO_NDARRAY0_IMPL(median);
-}
+} // namespace numpy
PYTHONIC_NS_END
#endif
diff --git a/contrib/python/pythran/pythran/pythonic/numpy/ndarray/sort.hpp b/contrib/python/pythran/pythran/pythonic/numpy/ndarray/sort.hpp
index 3505a838d51..a5860780f70 100644
--- a/contrib/python/pythran/pythran/pythonic/numpy/ndarray/sort.hpp
+++ b/contrib/python/pythran/pythran/pythonic/numpy/ndarray/sort.hpp
@@ -4,12 +4,12 @@
#include "pythonic/include/numpy/ndarray/sort.hpp"
#include <algorithm>
-#include <memory>
-#include "pythonic/utils/functor.hpp"
+#include "pythonic/numpy/array.hpp"
#include "pythonic/types/ndarray.hpp"
#include "pythonic/types/str.hpp"
-#include "pythonic/numpy/array.hpp"
+#include "pythonic/utils/allocate.hpp"
+#include "pythonic/utils/functor.hpp"
#include "pythonic/utils/pdqsort.hpp"
PYTHONIC_NS_BEGIN
@@ -22,7 +22,7 @@ namespace numpy
{
struct quicksorter {
template <class... Args>
- void operator()(Args &&... args)
+ void operator()(Args &&...args)
{
pdqsort(std::forward<Args>(args)...);
}
@@ -41,14 +41,14 @@ namespace numpy
};
struct heapsorter {
template <class... Args>
- void operator()(Args &&... args)
+ void operator()(Args &&...args)
{
return std::sort_heap(std::forward<Args>(args)...);
}
};
struct stablesorter {
template <class... Args>
- void operator()(Args &&... args)
+ void operator()(Args &&...args)
{
return std::stable_sort(std::forward<Args>(args)...);
}
@@ -102,11 +102,11 @@ namespace numpy
const long stepper = step / out_shape[axis];
const long n = flat_size / out_shape[axis];
long ith = 0, nth = 0;
- std::unique_ptr<T[]> buffer{new T[buffer_size]};
+ T *buffer = utils::allocate<T>(buffer_size);
for (long i = 0; i < n; i++) {
for (long j = 0; j < buffer_size; ++j)
buffer[j] = out.buffer[ith + j * stepper];
- sorter(buffer.get(), buffer.get() + buffer_size, comparator<T>{});
+ sorter(buffer, buffer + buffer_size, comparator<T>{});
for (long j = 0; j < buffer_size; ++j)
out.buffer[ith + j * stepper] = buffer[j];
@@ -115,9 +115,10 @@ namespace numpy
ith = ++nth;
}
}
+ utils::deallocate(buffer);
}
}
- }
+ } // namespace
template <class E>
types::none_type sort(E &&expr, long axis, types::none_type)
@@ -146,8 +147,8 @@ namespace numpy
_sort(expr, axis, stablesorter());
return {};
}
- }
-}
+ } // namespace ndarray
+} // namespace numpy
PYTHONIC_NS_END
#endif
diff --git a/contrib/python/pythran/pythran/pythonic/types/bool.hpp b/contrib/python/pythran/pythran/pythonic/types/bool.hpp
index 87755d7857c..21456be4109 100644
--- a/contrib/python/pythran/pythran/pythonic/types/bool.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/bool.hpp
@@ -5,6 +5,8 @@
#ifdef ENABLE_PYTHON_MODULE
+#include "numpy/arrayobject.h"
+
PYTHONIC_NS_BEGIN
inline PyObject *to_python<bool>::convert(bool b)
{
@@ -16,11 +18,13 @@ inline PyObject *to_python<bool>::convert(bool b)
inline bool from_python<bool>::is_convertible(PyObject *obj)
{
- return obj == Py_True || obj == Py_False;
+ return obj == Py_True || obj == Py_False || PyObject_TypeCheck(obj, &PyBoolArrType_Type);
}
inline bool from_python<bool>::convert(PyObject *obj)
{
- return obj == Py_True;
+ if(obj == Py_True) return true;
+ else if(obj == Py_False) return false;
+ else return PyInt_AsLong(obj);
}
PYTHONIC_NS_END
diff --git a/contrib/python/pythran/pythran/pythonic/types/dict.hpp b/contrib/python/pythran/pythran/pythonic/types/dict.hpp
index d4fab4bf37a..c6065e87ac6 100644
--- a/contrib/python/pythran/pythran/pythonic/types/dict.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/dict.hpp
@@ -14,7 +14,6 @@
#include <algorithm>
#include <iterator>
#include <limits>
-#include <memory>
#include <utility>
PYTHONIC_NS_BEGIN
diff --git a/contrib/python/pythran/pythran/pythonic/types/list.hpp b/contrib/python/pythran/pythran/pythonic/types/list.hpp
index 1b4b7ede98a..7f1de3c4990 100644
--- a/contrib/python/pythran/pythran/pythonic/types/list.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/list.hpp
@@ -8,6 +8,7 @@
#include "pythonic/types/bool.hpp"
#include "pythonic/types/slice.hpp"
#include "pythonic/types/tuple.hpp"
+#include "pythonic/utils/allocate.hpp"
#include "pythonic/utils/reserve.hpp"
#include "pythonic/utils/shared_ref.hpp"
@@ -556,7 +557,7 @@ namespace types
template <class T>
void list<T>::reserve(size_t n)
{
- if(n > _data->capacity())
+ if (n > _data->capacity())
_data->reserve((n / 2) * 3);
}
template <class T>
@@ -583,7 +584,7 @@ namespace types
template <class T>
void list<T>::clear()
{
- _data->clear();
+ _data->clear();
}
// TODO: have to raise a valueError
diff --git a/contrib/python/pythran/pythran/pythonic/types/ndarray.hpp b/contrib/python/pythran/pythran/pythonic/types/ndarray.hpp
index 1f4c0657cfc..191933ccd61 100644
--- a/contrib/python/pythran/pythran/pythonic/types/ndarray.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/ndarray.hpp
@@ -9,6 +9,7 @@
#include "pythonic/builtins/ValueError.hpp"
+#include "pythonic/utils/allocate.hpp"
#include "pythonic/utils/broadcast_copy.hpp"
#include "pythonic/utils/int_.hpp"
#include "pythonic/utils/nested_container.hpp"
@@ -734,26 +735,27 @@ namespace types
/* extended slice indexing */
template <class T, class pS>
template <class S0, class... S>
- auto
- ndarray<T, pS>::operator()(S0 const &s0, S const &...s) const & -> decltype(
- extended_slice<count_new_axis<S0, S...>::value>{}((*this), s0, s...))
+ auto ndarray<T, pS>::operator()(S0 const &s0, S const &...s) const
+ & -> decltype(extended_slice<count_new_axis<S0, S...>::value>{}((*this),
+ s0, s...))
{
return extended_slice<count_new_axis<S0, S...>::value>{}((*this), s0, s...);
}
template <class T, class pS>
template <class S0, class... S>
- auto ndarray<T, pS>::operator()(S0 const &s0, S const &...s) & -> decltype(
- extended_slice<count_new_axis<S0, S...>::value>{}((*this), s0, s...))
+ auto ndarray<T, pS>::operator()(S0 const &s0, S const &...s)
+ & -> decltype(extended_slice<count_new_axis<S0, S...>::value>{}((*this),
+ s0, s...))
{
return extended_slice<count_new_axis<S0, S...>::value>{}((*this), s0, s...);
}
template <class T, class pS>
template <class S0, class... S>
- auto ndarray<T, pS>::operator()(S0 const &s0, S const &...s) && -> decltype(
- extended_slice<count_new_axis<S0, S...>::value>{}(std::move(*this), s0,
- s...))
+ auto ndarray<T, pS>::operator()(S0 const &s0, S const &...s)
+ && -> decltype(extended_slice<count_new_axis<S0, S...>::value>{}(
+ std::move(*this), s0, s...))
{
return extended_slice<count_new_axis<S0, S...>::value>{}(std::move(*this),
s0, s...);
@@ -770,12 +772,12 @@ namespace types
ndarray<T, pS>::fast(F const &filter) const
{
long sz = filter.template shape<0>();
- long *raw = (long *)malloc(sz * sizeof(long));
+ long *raw = utils::allocate<long>(sz);
long n = 0;
for (long i = 0; i < sz; ++i)
if (filter.fast(i))
raw[n++] = i;
- // realloc(raw, n * sizeof(long));
+ // reallocate(raw, n);
return this->fast(ndarray<long, pshape<long>>(raw, pshape<long>(n),
types::ownership::owned));
}
@@ -1093,8 +1095,9 @@ namespace builtins
}
template <class E, class... S>
- types::numpy_gexpr<E, types::normalize_t<S>...>
+ auto
_build_gexpr<1>::operator()(E const &a, S const &...slices)
+ -> decltype(E(a)(slices...))
{
return E(a)(slices...);
}
@@ -1115,7 +1118,7 @@ namespace builtins
using stype = typename types::is_complex<typename E::dtype>::type;
auto new_shape = sutils::getshape(a);
std::get<E::value - 1>(new_shape) *= 2;
- // this is tricky && dangerous!
+ // this is tricky and dangerous!
auto translated_mem =
reinterpret_cast<utils::shared_ref<types::raw_array<stype>> const &>(
a.mem);
@@ -1124,6 +1127,7 @@ namespace builtins
return _build_gexpr<E::value>{}(
translated, types::slice{0, std::get<E::value - 1>(new_shape), 2});
}
+
template <class Op, class... Args>
auto _make_real(types::numpy_expr<Op, Args...> const &a, utils::int_<1>)
-> decltype(_make_real(
@@ -1138,13 +1142,34 @@ namespace builtins
}
template <class E>
+ auto _make_real(types::numpy_iexpr<E> const &a, utils::int_<1>)
+ -> decltype(_build_gexpr<types::numpy_iexpr<E>::value>{}(
+ std::declval<types::ndarray<typename types::is_complex<typename types::numpy_iexpr<E>::dtype>::type,
+ types::array<long, types::numpy_iexpr<E>::value + 1>>>(),
+ long(), types::slice()))
+ {
+ constexpr size_t value = types::numpy_iexpr<E>::value;
+ using stype = typename types::is_complex<typename types::numpy_iexpr<E>::dtype>::type;
+ auto new_shape = sutils::getshape(a.arg);
+ std::get<value>(new_shape) *= 2;
+ // this is tricky and dangerous!
+ auto translated_mem =
+ reinterpret_cast<utils::shared_ref<types::raw_array<stype>> const &>(
+ a.arg.mem);
+ types::ndarray<stype, types::array<long, value + 1>> translated{
+ translated_mem, new_shape};
+ long offset = (a.buffer - a.arg.buffer) / a.arg.template strides<0>();
+ return _build_gexpr<value>{}(
+ translated, offset, types::slice{0, std::get<value>(new_shape), 2});
+ }
+
+ template <class E>
types::ndarray<typename E::dtype, typename E::shape_t>
_make_imag(E const &a, utils::int_<0>)
{
// cannot use numpy.zero: forward declaration issue
- return {
- (typename E::dtype *)calloc(a.flat_size(), sizeof(typename E::dtype)),
- sutils::getshape(a), types::ownership::owned};
+ return {utils::callocate<typename E::dtype>(a.flat_size()),
+ sutils::getshape(a), types::ownership::owned};
}
template <class Op, class... Args>
@@ -1161,6 +1186,28 @@ namespace builtins
}
template <class E>
+ auto _make_imag(types::numpy_iexpr<E> const &a, utils::int_<1>)
+ -> decltype(_build_gexpr<types::numpy_iexpr<E>::value>{}(
+ std::declval<types::ndarray<typename types::is_complex<typename types::numpy_iexpr<E>::dtype>::type,
+ types::array<long, types::numpy_iexpr<E>::value + 1>>>(),
+ long(), types::slice()))
+ {
+ constexpr size_t value = types::numpy_iexpr<E>::value;
+ using stype = typename types::is_complex<typename types::numpy_iexpr<E>::dtype>::type;
+ auto new_shape = sutils::getshape(a.arg);
+ std::get<types::numpy_iexpr<E>::value>(new_shape) *= 2;
+ // this is tricky and dangerous!
+ auto translated_mem =
+ reinterpret_cast<utils::shared_ref<types::raw_array<stype>> const &>(
+ a.arg.mem);
+ types::ndarray<stype, types::array<long, value + 1>> translated{
+ translated_mem, new_shape};
+ long offset = (a.buffer - a.arg.buffer) / a.arg.template strides<0>();
+ return _build_gexpr<value>{}(
+ translated, offset, types::slice{1, std::get<value>(new_shape), 2});
+ }
+
+ template <class E>
auto _make_imag(E const &a, utils::int_<1>)
-> decltype(_build_gexpr<E::value>{}(
types::ndarray<typename types::is_complex<typename E::dtype>::type,
@@ -1170,7 +1217,7 @@ namespace builtins
using stype = typename types::is_complex<typename E::dtype>::type;
auto new_shape = sutils::getshape(a);
std::get<E::value - 1>(new_shape) *= 2;
- // this is tricky && dangerous!
+ // this is tricky and dangerous!
auto translated_mem =
reinterpret_cast<utils::shared_ref<types::raw_array<stype>> const &>(
a.mem);
@@ -1249,6 +1296,17 @@ namespace builtins
return details::_make_real(a, utils::int_<types::is_complex<T>::value>{});
}
+ template <class E>
+ auto getattr(types::attr::REAL, types::numpy_iexpr<E> const &e)
+ -> decltype(details::_make_real(
+ e, utils::int_<types::is_complex<
+ typename types::numpy_iexpr<E>::dtype>::value>{}))
+ {
+ return details::_make_real(
+ e, utils::int_<types::is_complex<
+ typename types::numpy_iexpr<E>::dtype>::value>{});
+ }
+
template <class Op, class... Args>
auto getattr(types::attr::REAL, types::numpy_expr<Op, Args...> const &a)
-> decltype(details::_make_real(
@@ -1261,8 +1319,9 @@ namespace builtins
}
template <class E>
- auto getattr(types::attr::REAL, types::numpy_texpr<E> const &a) -> decltype(
- types::numpy_texpr<decltype(getattr(types::attr::REAL{}, a.arg))>{
+ auto getattr(types::attr::REAL, types::numpy_texpr<E> const &a)
+ -> decltype(types::numpy_texpr<decltype(getattr(types::attr::REAL{},
+ a.arg))>{
getattr(types::attr::REAL{}, a.arg)})
{
auto ta = getattr(types::attr::REAL{}, a.arg);
@@ -1289,8 +1348,9 @@ namespace builtins
}
template <class E>
- auto getattr(types::attr::IMAG, types::numpy_texpr<E> const &a) -> decltype(
- types::numpy_texpr<decltype(getattr(types::attr::IMAG{}, a.arg))>{
+ auto getattr(types::attr::IMAG, types::numpy_texpr<E> const &a)
+ -> decltype(types::numpy_texpr<decltype(getattr(types::attr::IMAG{},
+ a.arg))>{
getattr(types::attr::IMAG{}, a.arg)})
{
auto ta = getattr(types::attr::IMAG{}, a.arg);
@@ -1471,6 +1531,7 @@ PyObject *to_python<types::numpy_gexpr<Arg, S...>>::convert(
: ::to_python(v.slices);
PyObject *base = ::to_python(v.arg);
PyObject *res = PyObject_GetItem(base, slices);
+ Py_DECREF(slices);
Py_DECREF(base);
if (transpose) {
PyObject *Transposed =
diff --git a/contrib/python/pythran/pythran/pythonic/types/numpy_expr.hpp b/contrib/python/pythran/pythran/pythonic/types/numpy_expr.hpp
index 3d070ea1146..9f139ffe085 100644
--- a/contrib/python/pythran/pythran/pythonic/types/numpy_expr.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/numpy_expr.hpp
@@ -4,6 +4,7 @@
#include "pythonic/include/types/numpy_expr.hpp"
#include "pythonic/types/nditerator.hpp"
+#include "pythonic/utils/allocate.hpp"
#include "pythonic/utils/meta.hpp"
#include "pythonic/builtins/ValueError.hpp"
@@ -314,12 +315,12 @@ namespace types
numpy_expr<Op, Args...>::fast(F const &filter) const
{
long sz = filter.template shape<0>();
- long *raw = (long *)malloc(sz * sizeof(long));
+ long *raw = utils::allocate<long>(sz);
long n = 0;
for (long i = 0; i < sz; ++i)
if (filter.fast(i))
raw[n++] = i;
- // realloc(raw, n * sizeof(long));
+ // reallocate(raw, n);
long shp[1] = {n};
return this->fast(
ndarray<long, pshape<long>>(raw, shp, types::ownership::owned));
diff --git a/contrib/python/pythran/pythran/pythonic/types/numpy_gexpr.hpp b/contrib/python/pythran/pythran/pythonic/types/numpy_gexpr.hpp
index e5edeb36c33..88afc93f7f2 100644
--- a/contrib/python/pythran/pythran/pythonic/types/numpy_gexpr.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/numpy_gexpr.hpp
@@ -12,6 +12,7 @@
#include "pythonic/operator_/isub.hpp"
#include "pythonic/operator_/ixor.hpp"
#include "pythonic/types/numpy_iexpr.hpp"
+#include "pythonic/utils/allocate.hpp"
#include "pythonic/utils/meta.hpp"
PYTHONIC_NS_BEGIN
@@ -799,12 +800,12 @@ namespace types
numpy_gexpr<Arg, S...>::fast(F const &filter) const
{
long sz = filter.template shape<0>();
- long *raw = (long *)malloc(sz * sizeof(long));
+ long *raw = utils::allocate<long>(sz);
long n = 0;
for (long i = 0; i < sz; ++i)
if (filter.fast(i))
raw[n++] = i;
- // realloc(raw, n * sizeof(long));
+ // reallocate(raw, n);
long shp[1] = {n};
return this->fast(
ndarray<long, pshape<long>>(raw, shp, types::ownership::owned));
diff --git a/contrib/python/pythran/pythran/pythonic/types/numpy_iexpr.hpp b/contrib/python/pythran/pythran/pythonic/types/numpy_iexpr.hpp
index 30e95ff9f7a..7489974e8de 100644
--- a/contrib/python/pythran/pythran/pythonic/types/numpy_iexpr.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/numpy_iexpr.hpp
@@ -3,20 +3,21 @@
#include "pythonic/include/types/numpy_iexpr.hpp"
+#include "pythonic/include/types/raw_array.hpp"
+#include "pythonic/types/ndarray.hpp" // we should remove that dep during a refactoring :-)
#include "pythonic/types/nditerator.hpp"
#include "pythonic/types/tuple.hpp"
+#include "pythonic/utils/allocate.hpp"
#include "pythonic/utils/array_helper.hpp"
#include "pythonic/utils/broadcast_copy.hpp"
-#include "pythonic/include/types/raw_array.hpp"
-#include "pythonic/types/ndarray.hpp" // we should remove that dep during a refactoring :-)
#include "pythonic/operator_/iadd.hpp"
#include "pythonic/operator_/iand.hpp"
#include "pythonic/operator_/idiv.hpp"
#include "pythonic/operator_/imul.hpp"
#include "pythonic/operator_/ior.hpp"
-#include "pythonic/operator_/ixor.hpp"
#include "pythonic/operator_/isub.hpp"
+#include "pythonic/operator_/ixor.hpp"
#include <numeric>
@@ -26,8 +27,7 @@ namespace types
{
template <class Arg>
- numpy_iexpr<Arg>::numpy_iexpr()
- : buffer(nullptr)
+ numpy_iexpr<Arg>::numpy_iexpr() : buffer(nullptr)
{
}
@@ -306,12 +306,12 @@ namespace types
numpy_iexpr<Arg>::fast(F const &filter) const
{
long sz = filter.template shape<0>();
- long *raw = (long *)malloc(sz * sizeof(long));
+ long *raw = utils::allocate<long>(sz);
long n = 0;
for (long i = 0; i < sz; ++i)
if (filter.fast(i))
raw[n++] = i;
- // realloc(raw, n * sizeof(long));
+ // reallocate(raw, n);
long shp[1] = {n};
return this->fast(
ndarray<long, pshape<long>>(raw, shp, types::ownership::owned));
@@ -321,7 +321,7 @@ namespace types
template <class Arg>
template <class vectorizer>
typename numpy_iexpr<Arg>::simd_iterator
- numpy_iexpr<Arg>::vbegin(vectorizer) const
+ numpy_iexpr<Arg>::vbegin(vectorizer) const
{
return {buffer};
}
@@ -329,7 +329,7 @@ namespace types
template <class Arg>
template <class vectorizer>
typename numpy_iexpr<Arg>::simd_iterator
- numpy_iexpr<Arg>::vend(vectorizer) const
+ numpy_iexpr<Arg>::vend(vectorizer) const
{
using vector_type = typename xsimd::batch<dtype>;
static const std::size_t vector_size = vector_type::size;
@@ -346,7 +346,7 @@ namespace types
}
template <class Arg>
- auto numpy_iexpr<Arg>::operator[](long i) & -> decltype(this->fast(i))
+ auto numpy_iexpr<Arg>::operator[](long i) & -> decltype(this->fast(i))
{
if (i < 0)
i += size();
@@ -354,8 +354,8 @@ namespace types
}
template <class Arg>
- auto numpy_iexpr<Arg>::operator[](long i) &&
- -> decltype(std::move(*this).fast(i))
+ auto
+ numpy_iexpr<Arg>::operator[](long i) && -> decltype(std::move(*this).fast(i))
{
if (i < 0)
i += size();
@@ -366,8 +366,7 @@ namespace types
template <class Sp>
typename std::enable_if<is_slice<Sp>::value,
numpy_gexpr<numpy_iexpr<Arg>, normalize_t<Sp>>>::type
- numpy_iexpr<Arg>::
- operator[](Sp const &s0) const
+ numpy_iexpr<Arg>::operator[](Sp const &s0) const
{
return make_gexpr(*this, s0);
}
@@ -377,8 +376,7 @@ namespace types
typename std::enable_if<
is_slice<Sp>::value,
numpy_gexpr<numpy_iexpr<Arg>, normalize_t<Sp>, normalize_t<S>...>>::type
- numpy_iexpr<Arg>::
- operator()(Sp const &s0, S const &... s) const
+ numpy_iexpr<Arg>::operator()(Sp const &s0, S const &...s) const
{
return make_gexpr(*this, s0, s...);
}
@@ -388,8 +386,7 @@ namespace types
typename std::enable_if<
is_numexpr_arg<F>::value && std::is_same<bool, typename F::dtype>::value,
numpy_vexpr<numpy_iexpr<Arg>, ndarray<long, pshape<long>>>>::type
- numpy_iexpr<Arg>::
- operator[](F const &filter) const
+ numpy_iexpr<Arg>::operator[](F const &filter) const
{
return fast(filter);
}
@@ -406,18 +403,17 @@ namespace types
T1 const &shape, std::integral_constant<long, I>)
{
return compute_offset(
- offset +
- (std::get<I - 1>(indices) < 0
- ? std::get<I - 1>(indices) + shape.template shape<I>()
- : std::get<I - 1>(indices)) *
- mult,
+ offset + (std::get<I - 1>(indices) < 0
+ ? std::get<I - 1>(indices) + shape.template shape<I>()
+ : std::get<I - 1>(indices)) *
+ mult,
mult * shape.template shape<I>(), indices, shape,
std::integral_constant<long, I - 1>());
}
template <class Arg>
- typename numpy_iexpr<Arg>::dtype const &numpy_iexpr<Arg>::
- operator[](array<long, value> const &indices) const
+ typename numpy_iexpr<Arg>::dtype const &
+ numpy_iexpr<Arg>::operator[](array<long, value> const &indices) const
{
return buffer[compute_offset(indices[value - 1] < 0
? indices[value - 1] +
@@ -428,8 +424,8 @@ namespace types
}
template <class Arg>
- typename numpy_iexpr<Arg>::dtype &numpy_iexpr<Arg>::
- operator[](array<long, value> const &indices)
+ typename numpy_iexpr<Arg>::dtype &
+ numpy_iexpr<Arg>::operator[](array<long, value> const &indices)
{
return const_cast<dtype &>(const_cast<numpy_iexpr const &>(*this)[indices]);
}
@@ -504,7 +500,7 @@ namespace types
{
return e.buffer[i * e.template strides<T::value - 1>()];
}
-}
+} // namespace types
PYTHONIC_NS_END
#endif
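
The fast(filter) overloads touched by this change (numpy_iexpr here, numpy_texpr and numpy_vexpr below) all follow the same pattern: scan a boolean mask once, collect the indices of its true entries into a freshly allocated long buffer that may be larger than strictly needed, and hand that buffer to an ndarray that takes ownership. Only the source of the buffer changes, from a bare malloc to utils::allocate. A standalone sketch of the index-compression step, using plain new[]/delete[] instead of pythran's allocator (all names here are illustrative):

#include <iostream>

// Build a compressed index array from a boolean mask. The caller owns the
// returned buffer; out_size receives the number of selected indices.
long *compress_indices(bool const *mask, long size, long &out_size)
{
  long *raw = new long[size]; // worst case: every entry of the mask is true
  long n = 0;
  for (long i = 0; i < size; ++i)
    if (mask[i])
      raw[n++] = i;           // record the index of each selected element
  out_size = n;               // buffer deliberately not shrunk, as above
  return raw;
}

int main()
{
  bool mask[] = {true, false, true, true, false};
  long n = 0;
  long *idx = compress_indices(mask, 5, n);
  for (long i = 0; i < n; ++i)
    std::cout << idx[i] << ' '; // prints: 0 2 3
  std::cout << '\n';
  delete[] idx;
}
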
diff --git a/contrib/python/pythran/pythran/pythonic/types/numpy_texpr.hpp b/contrib/python/pythran/pythran/pythonic/types/numpy_texpr.hpp
index a037a010399..5de28bf7a52 100644
--- a/contrib/python/pythran/pythran/pythonic/types/numpy_texpr.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/numpy_texpr.hpp
@@ -6,6 +6,7 @@
#include "pythonic/numpy/array.hpp"
#include "pythonic/numpy/transpose.hpp"
#include "pythonic/types/ndarray.hpp"
+#include "pythonic/utils/allocate.hpp"
#include "pythonic/operator_/iadd.hpp"
#include "pythonic/operator_/iand.hpp"
@@ -145,12 +146,12 @@ namespace types
numpy_texpr_2<E>::fast(F const &filter) const
{
long sz = filter.template shape<0>();
- long *raw = (long *)malloc(sz * sizeof(long));
+ long *raw = utils::allocate<long>(sz);
long n = 0;
for (long i = 0; i < sz; ++i)
if (filter.fast(i))
raw[n++] = i;
- // realloc(raw, n * sizeof(long));
+ // reallocate(raw, n);
return this->fast(ndarray<long, pshape<long>>(raw, pshape<long>(n),
types::ownership::owned));
}
diff --git a/contrib/python/pythran/pythran/pythonic/types/numpy_vexpr.hpp b/contrib/python/pythran/pythran/pythonic/types/numpy_vexpr.hpp
index ed2acbcc9df..889420b93c1 100644
--- a/contrib/python/pythran/pythran/pythonic/types/numpy_vexpr.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/numpy_vexpr.hpp
@@ -1,6 +1,8 @@
#ifndef PYTHONIC_TYPES_NUMPY_VEXPR_HPP
#define PYTHONIC_TYPES_NUMPY_VEXPR_HPP
+#include "pythonic/utils/allocate.hpp"
+
PYTHONIC_NS_BEGIN
namespace types
@@ -9,8 +11,7 @@ namespace types
template <class T, class F>
template <class E>
typename std::enable_if<is_iterable<E>::value, numpy_vexpr<T, F> &>::type
- numpy_vexpr<T, F>::
- operator=(E const &expr)
+ numpy_vexpr<T, F>::operator=(E const &expr)
{
// TODO: avoid the tmp copy when no aliasing
typename assignable<E>::type tmp{expr};
@@ -21,8 +22,7 @@ namespace types
template <class T, class F>
template <class E>
typename std::enable_if<!is_iterable<E>::value, numpy_vexpr<T, F> &>::type
- numpy_vexpr<T, F>::
- operator=(E const &expr)
+ numpy_vexpr<T, F>::operator=(E const &expr)
{
for (long i = 0, n = shape<0>(); i < n; ++i)
(*this).fast(i) = expr;
@@ -60,7 +60,7 @@ namespace types
}
template <class T, class F>
template <class... S>
- auto numpy_vexpr<T, F>::operator()(S const &... slices) const
+ auto numpy_vexpr<T, F>::operator()(S const &...slices) const
-> decltype(ndarray<dtype, array<long, value>>{*this}(slices...))
{
return ndarray<dtype, array<long, value>>{*this}(slices...);
@@ -69,7 +69,7 @@ namespace types
template <class T, class F>
template <class vectorizer>
typename numpy_vexpr<T, F>::simd_iterator
- numpy_vexpr<T, F>::vbegin(vectorizer) const
+ numpy_vexpr<T, F>::vbegin(vectorizer) const
{
return {*this, 0};
}
@@ -77,7 +77,7 @@ namespace types
template <class T, class F>
template <class vectorizer>
typename numpy_vexpr<T, F>::simd_iterator
- numpy_vexpr<T, F>::vend(vectorizer) const
+ numpy_vexpr<T, F>::vend(vectorizer) const
{
return {*this, 0};
}
@@ -94,12 +94,12 @@ namespace types
numpy_vexpr<T, F>::fast(E const &filter) const
{
long sz = filter.template shape<0>();
- long *raw = (long *)malloc(sz * sizeof(long));
+ long *raw = utils::allocate<long>(sz);
long n = 0;
for (long i = 0; i < sz; ++i)
if (filter.fast(i))
raw[n++] = i;
- // realloc(raw, n * sizeof(long));
+ // reallocate(raw, n);
long shp[1] = {n};
return this->fast(
ndarray<long, pshape<long>>(raw, shp, types::ownership::owned));
@@ -112,8 +112,7 @@ namespace types
std::is_same<bool, typename E::dtype>::value &&
!is_pod_array<F>::value,
numpy_vexpr<numpy_vexpr<T, F>, ndarray<long, pshape<long>>>>::type
- numpy_vexpr<T, F>::
- operator[](E const &filter) const
+ numpy_vexpr<T, F>::operator[](E const &filter) const
{
return fast(filter);
}
@@ -125,8 +124,7 @@ namespace types
!std::is_same<bool, typename E::dtype>::value &&
!is_pod_array<F>::value,
numpy_vexpr<numpy_vexpr<T, F>, E>>::type
- numpy_vexpr<T, F>::
- operator[](E const &filter) const
+ numpy_vexpr<T, F>::operator[](E const &filter) const
{
return {*this, filter};
}
@@ -208,7 +206,7 @@ namespace types
{
return update_<pythonic::operator_::functor::ixor>(expr);
}
-}
+} // namespace types
PYTHONIC_NS_END
#endif
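
The two operator[] overloads above are selected with enable_if on the filter's dtype: a boolean filter triggers mask selection through fast(filter), while any other integral dtype wraps the expression for fancy (index) selection. A self-contained sketch of that kind of dispatch, written against std::vector purely for illustration:

#include <cstddef>
#include <iostream>
#include <type_traits>
#include <vector>

// Boolean filters select by mask.
template <class E>
typename std::enable_if<std::is_same<typename E::value_type, bool>::value,
                        std::vector<int>>::type
select(std::vector<int> const &data, E const &filter)
{
  std::vector<int> out;
  for (std::size_t i = 0; i < filter.size(); ++i)
    if (filter[i])
      out.push_back(data[i]);
  return out;
}

// Integer filters select by index.
template <class E>
typename std::enable_if<!std::is_same<typename E::value_type, bool>::value,
                        std::vector<int>>::type
select(std::vector<int> const &data, E const &filter)
{
  std::vector<int> out;
  for (auto idx : filter)
    out.push_back(data[idx]);
  return out;
}

int main()
{
  std::vector<int> data{10, 20, 30, 40};
  std::vector<bool> mask{true, false, true, false};
  std::vector<long> indices{3, 0};
  for (int v : select(data, mask))
    std::cout << v << ' ';     // prints: 10 30
  std::cout << '\n';
  for (int v : select(data, indices))
    std::cout << v << ' ';     // prints: 40 10
  std::cout << '\n';
}
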
diff --git a/contrib/python/pythran/pythran/pythonic/types/raw_array.hpp b/contrib/python/pythran/pythran/pythonic/types/raw_array.hpp
index bd352a09aa6..bf79843b3b0 100644
--- a/contrib/python/pythran/pythran/pythonic/types/raw_array.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/raw_array.hpp
@@ -1,10 +1,10 @@
#ifndef PYTHONIC_TYPES_RAW_ARRAY_HPP
#define PYTHONIC_TYPES_RAW_ARRAY_HPP
-#include "pythonic/include/types/raw_array.hpp"
#include "pythonic/builtins/MemoryError.hpp"
+#include "pythonic/include/types/raw_array.hpp"
+#include "pythonic/utils/allocate.hpp"
-#include <cstdlib>
#include <sstream>
PYTHONIC_NS_BEGIN
@@ -16,14 +16,13 @@ namespace types
* for internal use only, meant to be stored in a shared_ptr
*/
template <class T>
- raw_array<T>::raw_array()
- : data(nullptr), external(false)
+ raw_array<T>::raw_array() : data(nullptr), external(false)
{
}
template <class T>
raw_array<T>::raw_array(size_t n)
- : data((T *)malloc(n * sizeof(T))), external(false)
+ : data(utils::allocate<T>(n)), external(false)
{
if (!data) {
std::ostringstream oss;
@@ -39,8 +38,7 @@ namespace types
}
template <class T>
- raw_array<T>::raw_array(raw_array<T> &&d)
- : data(d.data), external(d.external)
+ raw_array<T>::raw_array(raw_array<T> &&d) : data(d.data), external(d.external)
{
d.data = nullptr;
}
@@ -49,7 +47,7 @@ namespace types
raw_array<T>::~raw_array()
{
if (data && !external)
- free(data);
+ utils::deallocate(data);
}
template <class T>
@@ -57,7 +55,7 @@ namespace types
{
external = true;
}
-}
+} // namespace types
PYTHONIC_NS_END
#endif
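
raw_array<T> keeps its ownership contract: the destructor releases the buffer unless the flag set by the member in the last hunk marked the storage as externally owned; only the allocation and release calls move to the utils wrappers, and allocation failure still surfaces as a MemoryError. A minimal stand-in for that contract, using malloc/free directly (type and member names are illustrative, not pythran's):

#include <cstdlib>
#include <new>

// A tiny owning buffer with an "external" escape hatch, mirroring the
// raw_array<T> semantics above: allocate on construction, free on
// destruction unless ownership was handed over to someone else.
template <class T>
struct owning_buffer {
  T *data;
  bool external;

  explicit owning_buffer(std::size_t n)
      : data(static_cast<T *>(std::malloc(n * sizeof(T)))), external(false)
  {
    if (!data)
      throw std::bad_alloc(); // raw_array raises a Python MemoryError instead
  }

  void forget() { external = true; } // storage now belongs to someone else

  ~owning_buffer()
  {
    if (data && !external)
      std::free(data);
  }
};

int main()
{
  owning_buffer<double> buf(16); // freed automatically at end of scope
  buf.data[0] = 1.0;
  return buf.data[0] == 1.0 ? 0 : 1;
}
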
diff --git a/contrib/python/pythran/pythran/pythonic/types/set.hpp b/contrib/python/pythran/pythran/pythonic/types/set.hpp
index 88af04eda85..7bf74bf0fc8 100644
--- a/contrib/python/pythran/pythran/pythonic/types/set.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/set.hpp
@@ -16,7 +16,6 @@
#include <algorithm>
#include <iterator>
#include <limits>
-#include <memory>
#include <set>
#include <utility>
diff --git a/contrib/python/pythran/pythran/pythonic/types/vectorizable_type.hpp b/contrib/python/pythran/pythran/pythonic/types/vectorizable_type.hpp
index c20e78d55a3..81e0d90432e 100644
--- a/contrib/python/pythran/pythran/pythonic/types/vectorizable_type.hpp
+++ b/contrib/python/pythran/pythran/pythonic/types/vectorizable_type.hpp
@@ -104,7 +104,7 @@ namespace types
struct is_vector_op {
// vectorize everything but these ops. They require special handling for
- // vectorization, && SG did not invest enough time in those
+ // vectorization, and SG did not invest enough time in those
static const bool value =
!std::is_same<O, operator_::functor::mod>::value &&
(!std::is_same<O, operator_::functor::div>::value ||
diff --git a/contrib/python/pythran/pythran/pythonic/utils/allocate.hpp b/contrib/python/pythran/pythran/pythonic/utils/allocate.hpp
new file mode 100644
index 00000000000..a09cda14e27
--- /dev/null
+++ b/contrib/python/pythran/pythran/pythonic/utils/allocate.hpp
@@ -0,0 +1,16 @@
+#ifndef PYTHONIC_UTILS_ALLOCATE_HPP
+#define PYTHONIC_UTILS_ALLOCATE_HPP
+
+#include "pythonic/include/utils/allocate.hpp"
+PYTHONIC_NS_BEGIN
+
+namespace utils
+{
+#ifdef PYTHRAN_TRACE_ALLOCATION
+ size_t pythran_allocation_site;
+#endif
+}
+
+PYTHONIC_NS_END
+
+#endif
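
This new translation unit only provides the out-of-line definition of the pythran_allocation_site counter used by the tracing build; the actual allocate/deallocate declarations live in pythonic/include/utils/allocate.hpp, which is not shown in this section. Judging from the call sites above, the interface boils down to a typed malloc-style allocator plus a matching release function. A hedged sketch of that minimal interface (this is an assumption, not pythran's real header):

#include <cstddef>
#include <cstdlib>

namespace utils
{
  // Typed allocation helper: utils::allocate<T>(n) returns storage for n
  // elements of T, utils::deallocate(p) gives it back. The real header also
  // has room for the PYTHRAN_TRACE_ALLOCATION bookkeeping seen above.
  template <class T>
  T *allocate(std::size_t nelems)
  {
    return static_cast<T *>(std::malloc(nelems * sizeof(T)));
  }

  template <class T>
  void deallocate(T *ptr)
  {
    std::free(ptr);
  }
} // namespace utils

int main()
{
  long *raw = utils::allocate<long>(8);
  raw[0] = 42;
  utils::deallocate(raw);
}
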
diff --git a/contrib/python/pythran/pythran/pythonic/utils/shared_ref.hpp b/contrib/python/pythran/pythran/pythonic/utils/shared_ref.hpp
index 34f4a3ea004..44ec82982c2 100644
--- a/contrib/python/pythran/pythran/pythonic/utils/shared_ref.hpp
+++ b/contrib/python/pythran/pythran/pythonic/utils/shared_ref.hpp
@@ -2,8 +2,8 @@
#define PYTHONIC_UTILS_SHARED_REF_HPP
#include "pythonic/include/utils/shared_ref.hpp"
+#include "pythonic/utils/allocate.hpp"
-#include <memory>
#include <unordered_map>
#include <utility>
#ifdef _OPENMP
@@ -39,7 +39,8 @@ namespace utils
template <class T>
template <class... Types>
shared_ref<T>::shared_ref(Types &&...args)
- : mem(new (std::nothrow) memory(std::forward<Types>(args)...))
+ : mem(new(utils::allocate<memory>(1))
+ memory(std::forward<Types>(args)...))
{
}
@@ -143,7 +144,8 @@ namespace utils
Py_DECREF(mem->foreign);
}
#endif
- delete mem;
+ mem->~memory();
+ utils::deallocate(mem);
mem = nullptr;
}
}
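
shared_ref used to create its control block with a single nothrow new expression and destroy it with delete; it now splits both steps, pairing utils::allocate with placement new at construction and an explicit destructor call with utils::deallocate at release, so every heap byte goes through the same allocator. A standalone illustration of that construct/destroy split, with malloc/free standing in for the utils wrappers:

#include <cstdlib>
#include <iostream>
#include <new>

struct widget {
  int value;
  explicit widget(int v) : value(v) { std::cout << "constructed\n"; }
  ~widget() { std::cout << "destroyed\n"; }
};

int main()
{
  void *storage = std::malloc(sizeof(widget)); // like utils::allocate<memory>(1)
  widget *w = new (storage) widget(42);        // placement new: construct only, no allocation
  std::cout << w->value << '\n';
  w->~widget();                                // explicit destructor call, like mem->~memory()
  std::free(storage);                          // release the storage, like utils::deallocate(mem)
}
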
diff --git a/contrib/python/pythran/pythran/pythran.cfg b/contrib/python/pythran/pythran/pythran.cfg
index 6858afce68c..e9671598323 100644
--- a/contrib/python/pythran/pythran/pythran.cfg
+++ b/contrib/python/pythran/pythran/pythran.cfg
@@ -50,3 +50,6 @@ max_heterogeneous_sequence_size = 16
# set to true if you want intermediate C++ code to be annotated with a reference
# to the original python code
annotate = false
+
+# set to 'lineno' if you want to generate line number instead of python extract
+annotation_kind = 'comment'
diff --git a/contrib/python/pythran/pythran/run.py b/contrib/python/pythran/pythran/run.py
index be9bd1f0f84..b5c26f0f3a5 100644
--- a/contrib/python/pythran/pythran/run.py
+++ b/contrib/python/pythran/pythran/run.py
@@ -8,10 +8,8 @@ import os
import sys
import pythran
-import pythran.types.tog
-import setuptools
-from distutils.errors import CompileError
+from pythran.errors import PythranSyntaxError, PythranTypeError, PythranCompileError
logger = logging.getLogger("pythran")
@@ -128,11 +126,21 @@ def run():
action='store_true',
help='report time spent in each optimization/transformation')
+ parser.add_argument('--trace-allocations', dest='trace_allocations',
+ action='store_true',
+ help='instrument execution to trace memory allocations')
+
parser.convert_arg_line_to_args = convert_arg_line_to_args
args, extra = parser.parse_known_args(sys.argv[1:])
args.extra_flags = extra
+ if args.trace_allocations:
+ args.defines.append('PYTHRAN_TRACE_ALLOCATION')
+ args.config.append("backend.annotate=1")
+ args.config.append("backend.annotation_kind=lineno")
+
+
if args.raw_translate_only:
args.translate_only = True
args.undefs.append('ENABLE_PYTHON_MODULE')
@@ -194,15 +202,15 @@ def run():
logger.critical("Chair to keyboard interface error\n"
"E: " + str(e))
sys.exit(1)
- except pythran.types.tog.PythranTypeError as e:
+ except PythranTypeError as e:
logger.critical("You shall not pass!\n"
"E: " + str(e))
sys.exit(1)
- except pythran.syntax.PythranSyntaxError as e:
+ except PythranSyntaxError as e:
logger.critical("I am in trouble. Your input file does not seem "
"to match Pythran's constraints...\n" + str(e))
sys.exit(1)
- except CompileError as e:
+ except PythranCompileError as e:
logger.critical("Cover me Jack. Jack? Jaaaaack!!!!\n"
"E: " + str(e))
sys.exit(1)
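
The new --trace-allocations switch does not instrument anything by itself; it forwards a preprocessor define (PYTHRAN_TRACE_ALLOCATION) and two backend.* configuration entries into the generated C++ build, and the runtime headers react to that define. The sketch below shows the general mechanism of gating a tracing hook behind such a define; it is illustrative only, not pythran's actual instrumentation:

#include <cstddef>
#include <cstdio>
#include <cstdlib>

#ifdef PYTHRAN_TRACE_ALLOCATION
// With tracing enabled, record where the current allocation comes from
// (for example a source line number filled in by annotated code).
static std::size_t current_allocation_site = 0;
#endif

template <class T>
T *traced_allocate(std::size_t n)
{
#ifdef PYTHRAN_TRACE_ALLOCATION
  std::fprintf(stderr, "allocating %zu elements at site %zu\n",
               n, current_allocation_site);
#endif
  return static_cast<T *>(std::malloc(n * sizeof(T)));
}

int main()
{
  double *p = traced_allocate<double>(8); // traced only when the define is set
  std::free(p);
}
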
diff --git a/contrib/python/pythran/pythran/spec.py b/contrib/python/pythran/pythran/spec.py
index 41081a97829..694094083cb 100644
--- a/contrib/python/pythran/pythran/spec.py
+++ b/contrib/python/pythran/pythran/spec.py
@@ -5,7 +5,7 @@ This module provides a dummy parser for pythran annotations.
from pythran.types.conversion import pytype_to_pretty_type
from collections import defaultdict
-from itertools import product
+from itertools import product, chain
import re
import ply.lex as lex
import ply.yacc as yacc
@@ -80,9 +80,10 @@ class Spec(object):
``capsule'' is a mapping from function name to signature
'''
- def __init__(self, functions, capsules=None):
+ def __init__(self, functions, capsules=None, ufuncs=None):
self.functions = dict(functions)
self.capsules = capsules or dict()
+ self.ufuncs = ufuncs or dict()
# normalize function signatures
for fname, signatures in functions.items():
@@ -95,10 +96,11 @@ class Spec(object):
"nothing will be exported")
def keys(self):
- return list(self.functions.keys()) + list(self.capsules.keys())
+ return chain(self.functions.keys(), self.capsules.keys(),
+ self.ufuncs.keys())
def __bool__(self):
- return bool(self.functions or self.capsules)
+ return any((self.functions, self.capsules, self.ufuncs))
__nonzero__ = __bool__
@@ -161,6 +163,7 @@ class SpecParser(object):
'export': 'EXPORT',
'order': 'ORDER',
'capsule': 'CAPSULE',
+ 'ufunc': 'UFUNC',
'or': 'OR',
'list': 'LIST',
'set': 'SET',
@@ -212,13 +215,19 @@ class SpecParser(object):
def p_exports(self, p):
if len(p) > 1:
isnative = len(p) == 6
- target = self.exports if len(p) == 6 else self.native_exports
+ if len(p) == 6:
+ target = self.exports
+ elif p[3] == "capsule":
+ target = self.native_exports
+ else:
+ target = self.ufunc_exports
for key, val in p[len(p)-3]:
target[key] += val
p_exports.__doc__ = '''exports :
| PYTHRAN EXPORT export_list opt_craps exports
- | PYTHRAN EXPORT CAPSULE export_list opt_craps exports'''
+ | PYTHRAN EXPORT CAPSULE export_list opt_craps exports
+ | PYTHRAN EXPORT UFUNC export_list opt_craps exports'''
def p_export_list(self, p):
p[0] = (p[1],) if len(p) == 2 else (p[1] + (p[3],))
@@ -463,6 +472,7 @@ class SpecParser(object):
def __call__(self, text, input_file=None):
self.exports = defaultdict(tuple)
self.native_exports = defaultdict(tuple)
+ self.ufunc_exports = defaultdict(tuple)
self.export_info = defaultdict(tuple)
self.input_text = text
self.input_file = input_file
@@ -516,7 +526,7 @@ class SpecParser(object):
loc = self.export_info[key][i]
raise self.PythranSpecError(msg, loc)
- return Spec(self.exports, self.native_exports)
+ return Spec(self.exports, self.native_exports, self.ufunc_exports)
class ExtraSpecParser(SpecParser):
diff --git a/contrib/python/pythran/pythran/syntax.py b/contrib/python/pythran/pythran/syntax.py
index 988fcb59d86..2137d6ba33e 100644
--- a/contrib/python/pythran/pythran/syntax.py
+++ b/contrib/python/pythran/pythran/syntax.py
@@ -4,6 +4,7 @@ It checks the conformance of the input code to Pythran specific
constraints.
'''
+from pythran.errors import PythranSyntaxError
from pythran.tables import MODULES
from pythran.intrinsic import Class
from pythran.typing import Tuple, List, Set, Dict
@@ -32,45 +33,6 @@ class ExtendedDefUseChains(beniget.DefUseChains):
self.unbounds.setdefault(name, []).append(n)
-class PythranSyntaxError(SyntaxError):
- def __init__(self, msg, node=None):
- SyntaxError.__init__(self, msg)
- if node:
- self.filename = getattr(node, 'filename', None)
- self.lineno = node.lineno
- self.offset = node.col_offset
-
- def __str__(self):
- loc_info = self.lineno is not None and self.offset is not None
-
- if self.filename and loc_info:
- with open(self.filename) as f:
- for i in range(self.lineno - 1):
- f.readline() # and drop it
- extra = '{}\n{}'.format(f.readline().rstrip(),
- " " * (self.offset) + "^~~~ (o_0)")
- else:
- extra = None
-
- if loc_info:
- format_header = "{}:{}:{}"
- format_args = self.lineno, self.offset, self.args[0],
- else:
- format_header = "{}:"
- format_args = self.args[0],
-
- r = (format_header + " error: {}").format(
- self.filename or "<unknown>",
- *format_args)
-
- if extra is not None:
- r += "\n----\n"
- r += extra
- r += "\n----\n"
-
- return r
-
-
class SyntaxChecker(ast.NodeVisitor):
"""
diff --git a/contrib/python/pythran/pythran/tables.py b/contrib/python/pythran/pythran/tables.py
index d1d20278b4f..60fac63ee35 100644
--- a/contrib/python/pythran/pythran/tables.py
+++ b/contrib/python/pythran/pythran/tables.py
@@ -2547,8 +2547,8 @@ _numpy_array_signature = Union[
]
-def expand_numpy_2_args(args, defaults=None):
- if numpy.__version__[0] < '2':
+def expand_numpy_2_args(args, defaults=None, force=False):
+ if force or numpy.__version__[0] < '2':
if defaults is not None:
return {'args': args, 'defaults': defaults}
else:
@@ -2941,7 +2941,9 @@ MODULES = {
),
"alltrue": ConstFunctionIntr(
signature=_numpy_unary_op_bool_axis_signature,
- return_range=interval.bool_values
+ return_range=interval.bool_values,
+ args=("a", "axis"),
+ defaults=(None,)
),
"amax": ConstFunctionIntr(signature=_numpy_unary_op_axis_signature),
"amin": ConstFunctionIntr(signature=_numpy_unary_op_axis_signature),
@@ -3401,6 +3403,7 @@ MODULES = {
REDUCED_BINARY_UFUNC,
signature=_numpy_int_binary_op_signature
),
+ "bool_": ConstFunctionIntr(signature=_int_signature),
"broadcast_to": ConstFunctionIntr(),
"byte": ConstFunctionIntr(signature=_int_signature),
"cbrt": ConstFunctionIntr(
@@ -3583,8 +3586,8 @@ MODULES = {
"complex256": ConstFunctionIntr(signature=_complex_signature),
"conj": ConstMethodIntr(signature=_numpy_unary_op_signature),
"conjugate": ConstMethodIntr(signature=_numpy_unary_op_signature),
- "convolve": ConstMethodIntr(),
- "correlate": ConstMethodIntr(),
+ "convolve": ConstMethodIntr(requires_blas=True),
+ "correlate": ConstMethodIntr(requires_blas=True),
"copy": ConstMethodIntr(signature=_numpy_array_signature),
"copyto": FunctionIntr(
argument_effects=[UpdateEffect(), ReadEffect(),
@@ -3686,7 +3689,9 @@ MODULES = {
signature=_numpy_unary_op_cumsum_axis_signature
),
"cumproduct": ConstFunctionIntr(
- signature=_numpy_unary_op_cumsum_axis_signature
+ signature=_numpy_unary_op_cumsum_axis_signature,
+ args=("a", "axis", "dtype", "out"),
+ defaults=(None, None, None),
),
"cumsum": ConstMethodIntr(
signature=_numpy_unary_op_cumsum_axis_signature
@@ -3704,7 +3709,7 @@ MODULES = {
"diff": ConstFunctionIntr(),
"digitize": ConstFunctionIntr(),
"divide": UFunc(BINARY_UFUNC),
- "dot": ConstMethodIntr(),
+ "dot": ConstMethodIntr(requires_blas=True),
"double": ConstFunctionIntr(signature=_float_signature),
"dtype": ClassWithConstConstructor(CLASSES["dtype"]),
"e": ConstantIntr(),
@@ -3774,7 +3779,7 @@ MODULES = {
"indices": ConstFunctionIntr(),
"inf": ConstantIntr(),
"Inf": ConstantIntr(),
- "inner": ConstFunctionIntr(),
+ "inner": ConstFunctionIntr(requires_blas=True),
"insert": ConstFunctionIntr(),
"interp": ConstFunctionIntr(),
"intersect1d": ConstFunctionIntr(),
@@ -3812,7 +3817,7 @@ MODULES = {
"lexsort": ConstFunctionIntr(),
"linalg": {
"norm": FunctionIntr(),
- "matrix_power": ConstFunctionIntr(),
+ "matrix_power": ConstFunctionIntr(requires_blas=True),
},
"linspace": ConstFunctionIntr(),
"log": ConstFunctionIntr(),
@@ -3882,7 +3887,10 @@ MODULES = {
signature=_numpy_binary_op_signature
),
"prod": ConstMethodIntr(),
- "product": ConstFunctionIntr(),
+ "product": ConstFunctionIntr(
+ args=("a", "axis", "dtype", "out"),
+ defaults=(None, None, None),
+ ),
"ptp": ConstMethodIntr(),
"put": MethodIntr(),
"putmask": FunctionIntr(),
@@ -3953,7 +3961,7 @@ MODULES = {
"rand": FunctionIntr(global_effects=True,
**expand_numpy_2_args(args=())),
"ranf": FunctionIntr(global_effects=True,
- **expand_numpy_2_args(args=('size',))),
+ **expand_numpy_2_args(args=('size',), force=True)),
"randint": FunctionIntr(global_effects=True,
**expand_numpy_2_args(args=("low", "high", "size"),
defaults=(None, None))),
@@ -3969,7 +3977,7 @@ MODULES = {
**expand_numpy_2_args(args=('scale', 'size',),
defaults=(1.0, None,))),
"sample": FunctionIntr(global_effects=True,
- **expand_numpy_2_args(args=('size',))),
+ **expand_numpy_2_args(args=('size',), force=True)),
"seed": FunctionIntr(global_effects=True),
"shuffle": FunctionIntr(global_effects=True),
"standard_exponential": FunctionIntr(global_effects=True,
@@ -4016,7 +4024,10 @@ MODULES = {
"sin": ConstFunctionIntr(signature=_numpy_unary_op_float_signature),
"sinh": ConstFunctionIntr(signature=_numpy_unary_op_float_signature),
"size": ConstFunctionIntr(return_range=interval.positive_values),
- "sometrue": ConstFunctionIntr(),
+ "sometrue": ConstFunctionIntr(
+ args=("a", "axis"),
+ defaults=(None,)
+ ),
"sort": ConstFunctionIntr(),
"sort_complex": ConstFunctionIntr(),
"spacing": ConstFunctionIntr(),
@@ -4059,7 +4070,8 @@ MODULES = {
"unravel_index": ConstFunctionIntr(),
"ushort": ConstFunctionIntr(signature=_int_signature),
"var": ConstMethodIntr(),
- "vdot": ConstMethodIntr(),
+ "vectorize": ConstFunctionIntr(),
+ "vdot": ConstMethodIntr(requires_blas=True),
"vstack": ConstFunctionIntr(),
"where": ConstFunctionIntr(),
"zeros": ConstFunctionIntr(args=('shape', 'dtype'),
@@ -4396,8 +4408,12 @@ MODULES = {
"__lshift__": ConstFunctionIntr(
signature=_numpy_int_binary_op_signature
),
- "matmul": ConstFunctionIntr(signature=_operator_mul_signature),
- "__matmul__": ConstFunctionIntr(signature=_operator_mul_signature),
+ "matmul": ConstFunctionIntr(signature=_operator_mul_signature,
+ requires_blas=True),
+ "__matmul__": ConstFunctionIntr(signature=_operator_mul_signature,
+ requires_blas=True),
+ "imatmul": MethodIntr(update_effects, requires_blas=True),
+ "__imatmul__": MethodIntr(update_effects, requires_blas=True),
"mod": ConstFunctionIntr(signature=_operator_mod_signature),
"__mod__": ConstFunctionIntr(signature=_operator_mod_signature),
"mul": ConstFunctionIntr(signature=_operator_mul_signature),
@@ -4567,6 +4583,8 @@ try:
except ImportError:
pass
+def looks_like_a_forward_function(spec):
+ return not spec.args and spec.varargs == 'args' and spec.varkw == 'kwargs'
# populate argument description through introspection
def save_arguments(module_name, elements):
@@ -4581,7 +4599,7 @@ def save_arguments(module_name, elements):
obj = getattr(themodule, elem)
while hasattr(obj, '__wrapped__'):
obj = obj.__wrapped__
- except (AttributeError, ImportError, TypeError):
+ except (AttributeError, ImportError, TypeError, ValueError):
continue
# first try to gather info through getfullargspec
@@ -4590,8 +4608,19 @@ def save_arguments(module_name, elements):
except:
continue
+ # some function are actually forward function, detect those
+ # and accept to use our description instead.
+ if looks_like_a_forward_function(spec):
+ assert signature.args.args, "{} require an explicit description".format(elem)
+ continue
+
args = [ast.Name(arg, ast.Param(), None, None)
for arg in spec.args]
+
+ # pop 'self' if we have a bound method
+ if inspect.ismethod(obj):
+ args = args[1:]
+
defaults = list(spec.defaults or [])
args += [ast.Name(arg, ast.Param(), None, None)
for arg in spec.kwonlyargs]
@@ -4713,6 +4742,21 @@ def save_attribute(elements, module_path):
save_attribute(MODULES, ())
+blas_requires = set()
+
+def save_blas_requires(elements, module_path):
+ """ Recursively save attributes with module name and signature. """
+ for elem, signature in elements.items():
+ if isinstance(signature, dict): # Submodule case
+ save_blas_requires(signature, module_path + (elem,))
+ elif signature.requires_blas:
+ blas_requires.add(module_path + (elem,))
+ elif isinstance(signature, Class):
+ save_blas_requires(signature.fields, module_path + (elem,))
+
+save_blas_requires(MODULES, ())
+
+
# patch beniget with pythran-specific builtins
import beniget
beniget.beniget.Builtins['builtins'] = __import__('builtins')
diff --git a/contrib/python/pythran/pythran/toolchain.py b/contrib/python/pythran/pythran/toolchain.py
index 879b9f20ad8..fde29026011 100644
--- a/contrib/python/pythran/pythran/toolchain.py
+++ b/contrib/python/pythran/pythran/toolchain.py
@@ -9,10 +9,10 @@ from pythran.cxxgen import PythonModule, Include, Line, Statement
from pythran.cxxgen import FunctionBody, FunctionDeclaration, Value, Block
from pythran.cxxgen import ReturnStatement
from pythran.dist import PythranExtension, PythranBuildExt
+from pythran.errors import PythranCompileError
from pythran.middlend import refine, mark_unexported_functions
from pythran.passmanager import PassManager
from pythran.tables import pythran_ward
-from pythran.types import tog
from pythran.types.type_dependencies import pytype_to_deps
from pythran.types.conversion import pytype_to_ctype
from pythran.spec import load_specfile, Spec
@@ -22,12 +22,8 @@ from pythran.version import __version__
from pythran.utils import cxxid
import pythran.frontend as frontend
-try:
- from distutils.errors import CompileError
- from distutils import sysconfig
-except ImportError:
- from setuptools.errors import CompileError
- from setuptools._distutils import sysconfig
+import sysconfig
+
try:
# `numpy.distutils is deprecated, may not be present, or broken
from numpy.distutils.core import setup
@@ -166,6 +162,7 @@ def generate_cxx(module_name, code, specs=None, optimizations=None,
mod = Generable(content)
def error_checker():
+ from pythran.types import tog
tog.typecheck(ir)
else:
@@ -175,6 +172,7 @@ def generate_cxx(module_name, code, specs=None, optimizations=None,
specs = Spec(specs, {})
def error_checker():
+ from pythran.types import tog
types = tog.typecheck(ir)
check_specs(specs, types)
@@ -286,13 +284,55 @@ def generate_cxx(module_name, code, specs=None, optimizations=None,
docstring
)
+ if specs.ufuncs:
+ mod.add_to_includes(
+ Include("pythonic/include/types/numpy_ufunc.hpp"),
+ )
+
+ for function_name, signatures in specs.ufuncs.items():
+ internal_func_name = cxxid(function_name)
+
+ for signature in signatures:
+ arguments_types = [pytype_to_ctype(t) for t in signature]
+ numpy_types = ['pythonic::c_type_to_numpy_type<{}>::value'.format(t) for t in
+ arguments_types]
+ arguments_names = has_argument(ir, function_name)
+ arguments = [n for n, _ in
+ zip(arguments_names, arguments_types)]
+ name_fmt = pythran_ward + "{0}::{1}::type{2}"
+ args_list = ", ".join(arguments_types)
+ specialized_fname = name_fmt.format(module_name,
+ internal_func_name,
+ "<{0}>".format(args_list)
+ if arguments_names else "")
+ result_type = "typename %s::result_type" % specialized_fname
+ numpy_result_type = 'pythonic::c_type_to_numpy_type<{}>::value'.format(result_type)
+ numpy_types.append(numpy_result_type)
+
+ mod.add_ufunc(
+ FunctionBody(
+ FunctionDeclaration(
+ Value(result_type, function_name),
+ [Value(t, a)
+ for t, a in zip(arguments_types, arguments)]),
+ Block([ReturnStatement("{0}()({1})".format(
+ warded(module_name, internal_func_name),
+ ', '.join(arguments)))])
+ ),
+ function_name,
+ pythran_ward + module_name + "::" + internal_func_name,
+ arguments_types + [result_type],
+ numpy_types
+ )
+
+ return mod, error_checker
return mod, error_checker
def compile_cxxfile(module_name, cxxfile, output_binary=None, **kwargs):
'''c++ file -> native module
Return the filename of the produced shared library
- Raises CompileError on failure
+ Raises PythranCompileError on failure
'''
@@ -317,7 +357,7 @@ def compile_cxxfile(module_name, cxxfile, output_binary=None, **kwargs):
'--build-temp', buildtmp]
)
except SystemExit as e:
- raise CompileError(str(e))
+ raise PythranCompileError(str(e))
def copy(src_file, dest_file):
# not using shutil.copy because it fails to copy stat across devices
@@ -426,7 +466,7 @@ def compile_pythrancode(module_name, pythrancode, specs=None,
str(module),
output_binary=output_file,
**kwargs)
- except CompileError:
+ except PythranCompileError:
logger.warning("Compilation error, "
"trying hard to find its origin...")
error_checker()
@@ -519,7 +559,7 @@ def import_pythranfile(pythranpath, **kwargs):
def test_compile():
'''Simple passthrough compile test.
- May raises CompileError Exception.
+ May raises PythranCompileError Exception.
'''
code = '''
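
For every signature of a ufunc export, the loop above instantiates the compiled functor for the requested C types, asks it for its result_type, and emits a plain forwarding function plus the matching numpy type codes. A self-contained mock of the shape of that generated glue, with a hypothetical module mymod, a hypothetical export add(float64, float64), and the usual __pythran_ ward prefix assumed:

#include <iostream>
#include <utility>

namespace __pythran_mymod
{
  // Stand-in for the functor pythran compiles from "def add(a, b): return a + b".
  struct add {
    template <class T0, class T1>
    struct type {
      using result_type = decltype(std::declval<T0>() + std::declval<T1>());
    };
    template <class T0, class T1>
    typename type<T0, T1>::result_type operator()(T0 x, T1 y) const
    {
      return x + y;
    }
  };
} // namespace __pythran_mymod

// Roughly what the FunctionBody built above renders to for a spec line like
// "#pythran export ufunc add(float64, float64)".
__pythran_mymod::add::type<double, double>::result_type add(double a, double b)
{
  return __pythran_mymod::add()(a, b);
}

int main()
{
  std::cout << add(1.5, 2.25) << '\n'; // prints 3.75
}
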
diff --git a/contrib/python/pythran/pythran/types/conversion.py b/contrib/python/pythran/pythran/types/conversion.py
index bfa1ff92f29..e149b7fb47d 100644
--- a/contrib/python/pythran/pythran/types/conversion.py
+++ b/contrib/python/pythran/pythran/types/conversion.py
@@ -33,6 +33,11 @@ PYTYPE_TO_CTYPE_TABLE = {
complex128: 'std::complex<double>',
complex64: 'std::complex<float>',
}
+try:
+ from numpy import bool_ as npy_bool
+ PYTYPE_TO_CTYPE_TABLE[npy_bool] = 'bool'
+except ImportError:
+ pass
try:
from numpy import float128, complex256
diff --git a/contrib/python/pythran/pythran/types/tog.py b/contrib/python/pythran/pythran/types/tog.py
index b6d7a7839e3..f3a2b0f461c 100644
--- a/contrib/python/pythran/pythran/types/tog.py
+++ b/contrib/python/pythran/pythran/types/tog.py
@@ -9,14 +9,10 @@ from numpy import floating, integer, complexfloating
from pythran.tables import MODULES, attributes
import pythran.typing as typing
-from pythran.syntax import PythranSyntaxError
+from pythran.errors import PythranSyntaxError, PythranTypeError
from pythran.utils import isnum
-class PythranTypeError(PythranSyntaxError):
- "A new type to distinguish general syntax errors from typing issues"
-
-
class InferenceError(Exception):
"Raised if the type inference algorithm cannot infer types successfully"
diff --git a/contrib/python/pythran/pythran/version.py b/contrib/python/pythran/pythran/version.py
index a842d05a77e..14da714e456 100644
--- a/contrib/python/pythran/pythran/version.py
+++ b/contrib/python/pythran/pythran/version.py
@@ -1 +1,2 @@
-__version__ = '0.15.0'
+__version__ = '0.16.1'
+__descr__ = 'Ahead of Time compiler for numeric kernels'
diff --git a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_arithmetic.hpp b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_arithmetic.hpp
index c72e416c6e4..e7916b0d436 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_arithmetic.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_arithmetic.hpp
@@ -28,7 +28,7 @@ namespace xsimd
// bitwise_lshift
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept
{ return x << y; },
@@ -37,7 +37,7 @@ namespace xsimd
// bitwise_rshift
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept
{ return x >> y; },
@@ -46,21 +46,21 @@ namespace xsimd
// decr
template <class A, class T>
- inline batch<T, A> decr(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> decr(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return self - T(1);
}
// decr_if
template <class A, class T, class Mask>
- inline batch<T, A> decr_if(batch<T, A> const& self, Mask const& mask, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, Mask const& mask, requires_arch<generic>) noexcept
{
return select(mask, decr(self), self);
}
// div
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept -> T
{ return x / y; },
@@ -69,13 +69,13 @@ namespace xsimd
// fma
template <class A, class T>
- inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
{
return x * y + z;
}
template <class A, class T>
- inline batch<std::complex<T>, A> fma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> fma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
@@ -84,13 +84,13 @@ namespace xsimd
// fms
template <class A, class T>
- inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
{
return x * y - z;
}
template <class A, class T>
- inline batch<std::complex<T>, A> fms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> fms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
@@ -99,13 +99,13 @@ namespace xsimd
// fnma
template <class A, class T>
- inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
{
return -x * y + z;
}
template <class A, class T>
- inline batch<std::complex<T>, A> fnma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> fnma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
@@ -114,13 +114,13 @@ namespace xsimd
// fnms
template <class A, class T>
- inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
{
return -x * y - z;
}
template <class A, class T>
- inline batch<std::complex<T>, A> fnms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> fnms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
@@ -129,7 +129,7 @@ namespace xsimd
// hadd
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
- inline T hadd(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<generic>) noexcept
{
alignas(A::alignment()) T buffer[batch<T, A>::size];
self.store_aligned(buffer);
@@ -143,21 +143,21 @@ namespace xsimd
// incr
template <class A, class T>
- inline batch<T, A> incr(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> incr(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return self + T(1);
}
// incr_if
template <class A, class T, class Mask>
- inline batch<T, A> incr_if(batch<T, A> const& self, Mask const& mask, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, Mask const& mask, requires_arch<generic>) noexcept
{
return select(mask, incr(self), self);
}
// mul
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
- inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept -> T
{ return x * y; },
@@ -166,7 +166,7 @@ namespace xsimd
// rotl
template <class A, class T, class STy>
- inline batch<T, A> rotl(batch<T, A> const& self, STy other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, STy other, requires_arch<generic>) noexcept
{
constexpr auto N = std::numeric_limits<T>::digits;
return (self << other) | (self >> (N - other));
@@ -174,7 +174,7 @@ namespace xsimd
// rotr
template <class A, class T, class STy>
- inline batch<T, A> rotr(batch<T, A> const& self, STy other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, STy other, requires_arch<generic>) noexcept
{
constexpr auto N = std::numeric_limits<T>::digits;
return (self >> other) | (self << (N - other));
@@ -182,12 +182,12 @@ namespace xsimd
// sadd
template <class A>
- inline batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
{
return add(self, other); // no saturated arithmetic on floating point numbers
}
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
- inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -204,19 +204,19 @@ namespace xsimd
}
}
template <class A>
- inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
{
return add(self, other); // no saturated arithmetic on floating point numbers
}
// ssub
template <class A>
- inline batch<float, A> ssub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> ssub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
{
return sub(self, other); // no saturated arithmetic on floating point numbers
}
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
- inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -229,7 +229,7 @@ namespace xsimd
}
}
template <class A>
- inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
{
return sub(self, other); // no saturated arithmetic on floating point numbers
}
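
Every kernel in this header moves from plain inline to the XSIMD_INLINE macro. The macro's definition lives in xsimd's configuration headers and is not part of this diff; such macros usually force inlining where the compiler supports it and fall back to plain inline otherwise. A sketch with a deliberately different name, to make clear this is an assumption rather than xsimd's actual definition:

#if defined(_MSC_VER)
#define MY_ALWAYS_INLINE __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define MY_ALWAYS_INLINE inline __attribute__((always_inline))
#else
#define MY_ALWAYS_INLINE inline
#endif

// Tiny per-element kernels like the ones above benefit most from being
// inlined unconditionally into the surrounding vector loop.
MY_ALWAYS_INLINE int add_one(int x)
{
  return x + 1;
}

int main()
{
  return add_one(41) == 42 ? 0 : 1;
}
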
diff --git a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_complex.hpp b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_complex.hpp
index 960e4c10c6e..812c592aec0 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_complex.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_complex.hpp
@@ -26,54 +26,54 @@ namespace xsimd
// real
template <class A, class T>
- inline batch<T, A> real(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> real(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return self;
}
template <class A, class T>
- inline batch<T, A> real(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> real(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return self.real();
}
// imag
template <class A, class T>
- inline batch<T, A> imag(batch<T, A> const& /*self*/, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> imag(batch<T, A> const& /*self*/, requires_arch<generic>) noexcept
{
return batch<T, A>(T(0));
}
template <class A, class T>
- inline batch<T, A> imag(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> imag(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return self.imag();
}
// arg
template <class A, class T>
- inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE real_batch_type_t<batch<T, A>> arg(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return atan2(imag(self), real(self));
}
// conj
template <class A, class T>
- inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return { real(self), -imag(self) };
}
// norm
template <class A, class T>
- inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE real_batch_type_t<batch<T, A>> norm(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return { fma(real(self), real(self), imag(self) * imag(self)) };
}
// proj
template <class A, class T>
- inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = complex_batch_type_t<batch<T, A>>;
using real_batch = typename batch_type::real_batch;
@@ -86,19 +86,19 @@ namespace xsimd
}
template <class A, class T>
- inline batch_bool<T, A> isnan(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> isnan(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return batch_bool<T, A>(isnan(self.real()) || isnan(self.imag()));
}
template <class A, class T>
- inline batch_bool<T, A> isinf(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> isinf(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return batch_bool<T, A>(isinf(self.real()) || isinf(self.imag()));
}
template <class A, class T>
- inline batch_bool<T, A> isfinite(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> isfinite(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return batch_bool<T, A>(isfinite(self.real()) && isfinite(self.imag()));
}
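
The complex-valued fma above expands (a+bi)(c+di) + (e+fi) into res_r = ac - bd + e and res_i = ad + bc + f, each part built from nested fma/fms calls so the contractions happen lane-wise on the real and imaginary batches. A scalar sanity check of that decomposition against std::complex:

#include <cmath>
#include <complex>
#include <iostream>

int main()
{
  std::complex<double> x(1.0, 2.0), y(3.0, -1.0), z(0.5, 4.0);

  // res_r = ac - (bd - e), res_i = ad + (bc + f), matching the fma/fms nesting above.
  double res_r = std::fma(x.real(), y.real(), -(x.imag() * y.imag() - z.real()));
  double res_i = std::fma(x.real(), y.imag(), std::fma(x.imag(), y.real(), z.imag()));

  std::cout << res_r << ' ' << res_i << '\n'; // prints: 5.5 9
  std::cout << x * y + z << '\n';             // prints: (5.5,9)
}
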
diff --git a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_details.hpp b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_details.hpp
index 14c62a08920..e676e0a7d23 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_details.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_details.hpp
@@ -23,81 +23,81 @@ namespace xsimd
{
// Forward declaration. Should we put them in a separate file?
template <class T, class A>
- inline batch<T, A> abs(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
template <class T, class A>
- inline bool any(batch_bool<T, A> const& self) noexcept;
+ XSIMD_INLINE bool any(batch_bool<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+ XSIMD_INLINE batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
template <class A, class T_out, class T_in>
- inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
+ XSIMD_INLINE batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
template <class T, class A>
- inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
template <class T_out, class T_in, class A>
- inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& self) noexcept;
+ XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> cos(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> cos(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> cosh(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> cosh(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> exp(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> exp(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
+ XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
template <class T, class A>
- inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
+ XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
template <class T, class A>
- inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
+ XSIMD_INLINE batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
template <class T, class A, uint64_t... Coefs>
- inline batch<T, A> horner(const batch<T, A>& self) noexcept;
+ XSIMD_INLINE batch<T, A> horner(const batch<T, A>& self) noexcept;
template <class T, class A>
- inline batch<T, A> hypot(const batch<T, A>& self) noexcept;
+ XSIMD_INLINE batch<T, A> hypot(const batch<T, A>& self) noexcept;
template <class T, class A>
- inline batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline typename batch<T, A>::batch_bool_type isinf(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE typename batch<T, A>::batch_bool_type isinf(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline typename batch<T, A>::batch_bool_type isfinite(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE typename batch<T, A>::batch_bool_type isfinite(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
+ XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
template <class T, class A>
- inline batch<T, A> log(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> log(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
+ XSIMD_INLINE batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
template <class T, class A>
- inline T reduce_add(batch<T, A> const&) noexcept;
+ XSIMD_INLINE T reduce_add(batch<T, A> const&) noexcept;
template <class T, class A>
- inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
template <class T, class A>
- inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
+ XSIMD_INLINE batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
template <class T, class A>
- inline batch<T, A> sign(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> sign(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> signnz(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> signnz(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> sin(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> sin(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> sinh(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> sinh(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> sqrt(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> tan(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> tan(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
template <class T, class A>
- inline batch<T, A> trunc(batch<T, A> const& self) noexcept;
+ XSIMD_INLINE batch<T, A> trunc(batch<T, A> const& self) noexcept;
namespace kernel
{
@@ -105,7 +105,7 @@ namespace xsimd
namespace detail
{
template <class F, class A, class T, class... Batches>
- inline batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
alignas(A::alignment()) T self_buffer[size];
@@ -120,7 +120,7 @@ namespace xsimd
}
template <class U, class F, class A, class T>
- inline batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
+ XSIMD_INLINE batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
{
static_assert(batch<T, A>::size == batch<U, A>::size,
"Source and destination sizes must match");
@@ -141,42 +141,42 @@ namespace xsimd
namespace detail
{
template <class A>
- inline batch<uint8_t, A> fast_cast(batch<int8_t, A> const& self, batch<uint8_t, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<uint8_t, A> fast_cast(batch<int8_t, A> const& self, batch<uint8_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<uint8_t>(self);
}
template <class A>
- inline batch<uint16_t, A> fast_cast(batch<int16_t, A> const& self, batch<uint16_t, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> fast_cast(batch<int16_t, A> const& self, batch<uint16_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<uint16_t>(self);
}
template <class A>
- inline batch<uint32_t, A> fast_cast(batch<int32_t, A> const& self, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<int32_t, A> const& self, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<uint32_t>(self);
}
template <class A>
- inline batch<uint64_t, A> fast_cast(batch<int64_t, A> const& self, batch<uint64_t, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<int64_t, A> const& self, batch<uint64_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<uint64_t>(self);
}
template <class A>
- inline batch<int8_t, A> fast_cast(batch<uint8_t, A> const& self, batch<int8_t, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<int8_t, A> fast_cast(batch<uint8_t, A> const& self, batch<int8_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<int8_t>(self);
}
template <class A>
- inline batch<int16_t, A> fast_cast(batch<uint16_t, A> const& self, batch<int16_t, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<int16_t, A> fast_cast(batch<uint16_t, A> const& self, batch<int16_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<int16_t>(self);
}
template <class A>
- inline batch<int32_t, A> fast_cast(batch<uint32_t, A> const& self, batch<int32_t, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<uint32_t, A> const& self, batch<int32_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<int32_t>(self);
}
template <class A>
- inline batch<int64_t, A> fast_cast(batch<uint64_t, A> const& self, batch<int64_t, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<int64_t, A> fast_cast(batch<uint64_t, A> const& self, batch<int64_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<int64_t>(self);
}
@@ -184,7 +184,7 @@ namespace xsimd
// Provide a generic uint32_t -> float cast only if we have a
// non-generic int32_t -> float fast_cast
template <class A, class _ = decltype(fast_cast(std::declval<batch<int32_t, A> const&>(), std::declval<batch<float, A> const&>(), A {}))>
- inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<generic>) noexcept
{
// see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
batch<uint32_t, A> msk_lo(0xFFFF);
@@ -201,7 +201,7 @@ namespace xsimd
// Provide a generic float -> uint32_t cast only if we have a
// non-generic float -> int32_t fast_cast
template <class A, class _ = decltype(fast_cast(std::declval<batch<float, A> const&>(), std::declval<batch<int32_t, A> const&>(), A {}))>
- inline batch<uint32_t, A> fast_cast(batch<float, A> const& v, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& v, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
{
auto is_large = v >= batch<float, A>(1u << 31);
auto small = bitwise_cast<float>(batch_cast<int32_t>(v));
@@ -258,25 +258,25 @@ namespace xsimd
* ====================================================
*/
template <class B, uint64_t c>
- inline B coef() noexcept
+ XSIMD_INLINE B coef() noexcept
{
using value_type = typename B::value_type;
return B(bit_cast<value_type>(as_unsigned_integer_t<value_type>(c)));
}
template <class B>
- inline B horner(const B&) noexcept
+ XSIMD_INLINE B horner(const B&) noexcept
{
return B(typename B::value_type(0.));
}
template <class B, uint64_t c0>
- inline B horner(const B&) noexcept
+ XSIMD_INLINE B horner(const B&) noexcept
{
return coef<B, c0>();
}
template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
- inline B horner(const B& self) noexcept
+ XSIMD_INLINE B horner(const B& self) noexcept
{
return fma(self, horner<B, c1, args...>(self), coef<B, c0>());
}
@@ -291,19 +291,19 @@ namespace xsimd
* ====================================================
*/
template <class B>
- inline B horner1(const B&) noexcept
+ XSIMD_INLINE B horner1(const B&) noexcept
{
return B(1.);
}
template <class B, uint64_t c0>
- inline B horner1(const B& x) noexcept
+ XSIMD_INLINE B horner1(const B& x) noexcept
{
return x + detail::coef<B, c0>();
}
template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
- inline B horner1(const B& x) noexcept
+ XSIMD_INLINE B horner1(const B& x) noexcept
{
return fma(x, horner1<B, c1, args...>(x), detail::coef<B, c0>());
}
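
The horner<B, c0, c1, ...> helpers at the end of this file evaluate a polynomial in Horner form, one fused multiply-add per coefficient, with the coefficients supplied as raw uint64_t bit patterns that coef<B, c> reinterprets as the batch's value type. A scalar equivalent of the evaluation scheme, with the bit-pattern packing left out:

#include <cmath>
#include <cstdio>

// Evaluate c[0] + x*(c[1] + x*(c[2] + ...)) innermost first, one fma per step.
double horner(double x, double const *coeffs, int n)
{
  double acc = 0.0;
  for (int i = n - 1; i >= 0; --i)
    acc = std::fma(x, acc, coeffs[i]); // acc = x * acc + coeffs[i]
  return acc;
}

int main()
{
  double const c[] = {1.0, -3.0, 2.0};    // 1 - 3x + 2x^2
  std::printf("%g\n", horner(2.0, c, 3)); // 1 - 6 + 8 = 3
}
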
diff --git a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_logical.hpp b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_logical.hpp
index dd446e83dd7..69d9657e12e 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_logical.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_logical.hpp
@@ -24,7 +24,7 @@ namespace xsimd
// from mask
template <class A, class T>
- inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<generic>) noexcept
{
alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
// This is inefficient but should never be called. It's just a
@@ -36,28 +36,28 @@ namespace xsimd
// ge
template <class A, class T>
- inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return other <= self;
}
// gt
template <class A, class T>
- inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return other < self;
}
// is_even
template <class A, class T>
- inline batch_bool<T, A> is_even(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> is_even(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return is_flint(self * T(0.5));
}
// is_flint
template <class A, class T>
- inline batch_bool<T, A> is_flint(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> is_flint(batch<T, A> const& self, requires_arch<generic>) noexcept
{
auto frac = select(isnan(self - self), constants::nan<batch<T, A>>(), self - trunc(self));
return frac == T(0.);
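Editor's note: is_flint above reports whether a floating-point value is an exact integer; the self - self term maps infinities to NaN so the final comparison rejects them, and is_even simply reuses it on half the value. A scalar sketch of the same logic (illustrative names):

    #include <cmath>
    #include <limits>

    inline bool is_flint_sketch(double x)
    {
        // x - x is NaN for inf/NaN inputs, so they fail the == 0 test below.
        double frac = std::isnan(x - x) ? std::numeric_limits<double>::quiet_NaN()
                                        : x - std::trunc(x);
        return frac == 0.0;
    }

    inline bool is_even_sketch(double x)
    {
        return is_flint_sketch(x * 0.5);
    }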
@@ -65,69 +65,69 @@ namespace xsimd
// is_odd
template <class A, class T>
- inline batch_bool<T, A> is_odd(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> is_odd(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return is_even(self - T(1.));
}
// isinf
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> isinf(batch<T, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> isinf(batch<T, A> const&, requires_arch<generic>) noexcept
{
return batch_bool<T, A>(false);
}
template <class A>
- inline batch_bool<float, A> isinf(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<float, A> isinf(batch<float, A> const& self, requires_arch<generic>) noexcept
{
return abs(self) == std::numeric_limits<float>::infinity();
}
template <class A>
- inline batch_bool<double, A> isinf(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<double, A> isinf(batch<double, A> const& self, requires_arch<generic>) noexcept
{
return abs(self) == std::numeric_limits<double>::infinity();
}
// isfinite
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> isfinite(batch<T, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> isfinite(batch<T, A> const&, requires_arch<generic>) noexcept
{
return batch_bool<T, A>(true);
}
template <class A>
- inline batch_bool<float, A> isfinite(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<float, A> isfinite(batch<float, A> const& self, requires_arch<generic>) noexcept
{
return (self - self) == 0.f;
}
template <class A>
- inline batch_bool<double, A> isfinite(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<double, A> isfinite(batch<double, A> const& self, requires_arch<generic>) noexcept
{
return (self - self) == 0.;
}
// isnan
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> isnan(batch<T, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const&, requires_arch<generic>) noexcept
{
return batch_bool<T, A>(false);
}
// le
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return (self < other) || (self == other);
}
// neq
template <class A, class T>
- inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return !(other == self);
}
// logical_and
template <class A, class T>
- inline batch<T, A> logical_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> logical_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept
{ return x && y; },
@@ -136,7 +136,7 @@ namespace xsimd
// logical_or
template <class A, class T>
- inline batch<T, A> logical_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> logical_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept
{ return x || y; },
@@ -145,7 +145,7 @@ namespace xsimd
// mask
template <class A, class T>
- inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
{
alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
self.store_aligned(buffer);
diff --git a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_math.hpp b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_math.hpp
index 05d27b3d470..f9e7f5782cc 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_math.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -26,8 +26,8 @@ namespace xsimd
using namespace types;
// abs
- template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
- inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept
+ template <class A, class T, class>
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept
{
if (std::is_unsigned<T>::value)
return self;
@@ -40,14 +40,71 @@ namespace xsimd
}
template <class A, class T>
- inline batch<T, A> abs(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
return hypot(z.real(), z.imag());
}
+ // avg
+ namespace detail
+ {
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, std::true_type, std::false_type) noexcept
+ {
+ return (x & y) + ((x ^ y) >> 1);
+ }
+
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, std::true_type, std::true_type) noexcept
+ {
+ // Inspired by
+ // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c
+ auto t = (x & y) + ((x ^ y) >> 1);
+ auto t_u = bitwise_cast<typename std::make_unsigned<T>::type>(t);
+ auto avg = t + (bitwise_cast<T>(t_u >> (8 * sizeof(T) - 1)) & (x ^ y));
+ return avg;
+ }
+
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, std::false_type, std::true_type) noexcept
+ {
+ return (x + y) / 2;
+ }
+ }
+
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, requires_arch<generic>) noexcept
+ {
+ return detail::avg(x, y, typename std::is_integral<T>::type {}, typename std::is_signed<T>::type {});
+ }
+
+ // avgr
+ namespace detail
+ {
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y, std::true_type) noexcept
+ {
+ constexpr unsigned shift = 8 * sizeof(T) - 1;
+ auto adj = std::is_signed<T>::value ? ((x ^ y) & 0x1) : (((x ^ y) << shift) >> shift);
+ return ::xsimd::kernel::avg(x, y, A {}) + adj;
+ }
+
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y, std::false_type) noexcept
+ {
+ return ::xsimd::kernel::avg(x, y, A {});
+ }
+ }
+
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y, requires_arch<generic>) noexcept
+ {
+ return detail::avgr(x, y, typename std::is_integral<T>::type {});
+ }
+
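Editor's note: the new avg relies on the overflow-free identity (x & y) + ((x ^ y) >> 1) for unsigned integers, adds a sign correction for signed ones, and avgr re-adds the bit discarded by the shift to round upwards. A scalar sketch of the unsigned case (illustrative names):

    #include <cstdint>

    // floor((x + y) / 2) without forming x + y: shared bits count fully,
    // differing bits count half.
    inline uint8_t avg_floor_sketch(uint8_t x, uint8_t y)
    {
        return static_cast<uint8_t>((x & y) + ((x ^ y) >> 1));
    }

    // Rounded variant: add back the bit the shift discarded.
    inline uint8_t avgr_sketch(uint8_t x, uint8_t y)
    {
        return static_cast<uint8_t>(avg_floor_sketch(x, y) + ((x ^ y) & 1));
    }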
// batch_cast
template <class A, class T>
- inline batch<T, A> batch_cast(batch<T, A> const& self, batch<T, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> batch_cast(batch<T, A> const& self, batch<T, A> const&, requires_arch<generic>) noexcept
{
return self;
}
@@ -55,12 +112,12 @@ namespace xsimd
namespace detail
{
template <class A, class T_out, class T_in>
- inline batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const& out, requires_arch<generic>, with_fast_conversion) noexcept
+ XSIMD_INLINE batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const& out, requires_arch<generic>, with_fast_conversion) noexcept
{
return fast_cast(self, out, A {});
}
template <class A, class T_out, class T_in>
- inline batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<generic>, with_slow_conversion) noexcept
+ XSIMD_INLINE batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<generic>, with_slow_conversion) noexcept
{
static_assert(!std::is_same<T_in, T_out>::value, "there should be no conversion for this type combination");
using batch_type_in = batch<T_in, A>;
@@ -76,14 +133,14 @@ namespace xsimd
}
template <class A, class T_out, class T_in>
- inline batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const& out, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const& out, requires_arch<generic>) noexcept
{
return detail::batch_cast(self, out, A {}, detail::conversion_type<A, T_in, T_out> {});
}
// bitofsign
template <class A, class T>
- inline batch<T, A> bitofsign(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> bitofsign(batch<T, A> const& self, requires_arch<generic>) noexcept
{
static_assert(std::is_integral<T>::value, "int type implementation");
if (std::is_unsigned<T>::value)
@@ -93,19 +150,19 @@ namespace xsimd
}
template <class A>
- inline batch<float, A> bitofsign(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> bitofsign(batch<float, A> const& self, requires_arch<generic>) noexcept
{
return self & constants::signmask<batch<float, A>>();
}
template <class A>
- inline batch<double, A> bitofsign(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> bitofsign(batch<double, A> const& self, requires_arch<generic>) noexcept
{
return self & constants::signmask<batch<double, A>>();
}
// bitwise_cast
template <class A, class T>
- inline batch<T, A> bitwise_cast(batch<T, A> const& self, batch<T, A> const&, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<T, A> const& self, batch<T, A> const&, requires_arch<generic>) noexcept
{
return self;
}
@@ -121,7 +178,7 @@ namespace xsimd
* ====================================================
*/
template <class A>
- inline batch<float, A> cbrt(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> cbrt(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
batch_type z = abs(self);
@@ -168,7 +225,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> cbrt(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> cbrt(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
batch_type z = abs(self);
@@ -217,14 +274,14 @@ namespace xsimd
// clip
template <class A, class T>
- inline batch<T, A> clip(batch<T, A> const& self, batch<T, A> const& lo, batch<T, A> const& hi, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> clip(batch<T, A> const& self, batch<T, A> const& lo, batch<T, A> const& hi, requires_arch<generic>) noexcept
{
return min(hi, max(self, lo));
}
// copysign
template <class A, class T, class _ = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
- inline batch<T, A> copysign(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> copysign(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return abs(self) | bitofsign(other);
}
@@ -251,7 +308,7 @@ namespace xsimd
using batch_type = batch<float, A>;
// computes erf(a0)/a0
// x is sqr(a0) and 0 <= abs(a0) <= 2/3
- static inline batch_type erf1(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type erf1(const batch_type& x) noexcept
{
return detail::horner<batch_type,
0x3f906eba, // 1.128379154774254e+00
@@ -265,7 +322,7 @@ namespace xsimd
// computes erfc(x)*exp(sqr(x))
// x >= 2/3
- static inline batch_type erfc2(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type erfc2(const batch_type& x) noexcept
{
return detail::horner<batch_type,
0x3f0a0e8b, // 5.392844046572836e-01
@@ -282,7 +339,7 @@ namespace xsimd
>(x);
}
- static inline batch_type erfc3(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type erfc3(const batch_type& x) noexcept
{
return (batch_type(1.) - x) * detail::horner<batch_type,
0x3f7ffffe, // 9.9999988e-01
@@ -304,7 +361,7 @@ namespace xsimd
using batch_type = batch<double, A>;
// computes erf(a0)/a0
// x is sqr(a0) and 0 <= abs(a0) <= 0.65
- static inline batch_type erf1(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type erf1(const batch_type& x) noexcept
{
return detail::horner<batch_type,
0x3ff20dd750429b61ull, // 1.12837916709551
@@ -324,7 +381,7 @@ namespace xsimd
// computes erfc(x)*exp(x*x)
// 0.65 <= abs(x) <= 2.2
- static inline batch_type erfc2(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type erfc2(const batch_type& x) noexcept
{
return detail::horner<batch_type,
0x3feffffffbbb552bull, // 0.999999992049799
@@ -348,7 +405,7 @@ namespace xsimd
// computes erfc(x)*exp(x*x)
// 2.2 <= abs(x) <= 6
- static inline batch_type erfc3(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type erfc3(const batch_type& x) noexcept
{
return detail::horner<batch_type,
0x3fefff5a9e697ae2ull, // 0.99992114009714
@@ -372,7 +429,7 @@ namespace xsimd
// computes erfc(rx)*exp(rx*rx)
// x >= 6 rx = 1/x
- static inline batch_type erfc4(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type erfc4(const batch_type& x) noexcept
{
return detail::horner<batch_type,
0xbc7e4ad1ec7d0000ll, // -2.627435221016534e-17
@@ -404,7 +461,7 @@ namespace xsimd
*/
template <class A>
- inline batch<float, A> erf(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> erf(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
batch_type x = abs(self);
@@ -428,7 +485,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> erf(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> erf(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
batch_type x = abs(self);
@@ -464,7 +521,7 @@ namespace xsimd
// erfc
template <class A>
- inline batch<float, A> erfc(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> erfc(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
batch_type x = abs(self);
@@ -489,7 +546,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> erfc(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> erfc(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
batch_type x = abs(self);
@@ -533,54 +590,54 @@ namespace xsimd
B x;
template <typename... Ts>
- inline B operator()(const Ts&... coefs) noexcept
+ XSIMD_INLINE B operator()(const Ts&... coefs) noexcept
{
return eval(coefs...);
}
private:
- inline B eval(const B& c0) noexcept
+ XSIMD_INLINE B eval(const B& c0) noexcept
{
return c0;
}
- inline B eval(const B& c0, const B& c1) noexcept
+ XSIMD_INLINE B eval(const B& c0, const B& c1) noexcept
{
return fma(x, c1, c0);
}
template <size_t... Is, class Tuple>
- inline B eval(::xsimd::detail::index_sequence<Is...>, const Tuple& tuple)
+ XSIMD_INLINE B eval(::xsimd::detail::index_sequence<Is...>, const Tuple& tuple)
{
return estrin { x * x }(std::get<Is>(tuple)...);
}
template <class... Args>
- inline B eval(const std::tuple<Args...>& tuple) noexcept
+ XSIMD_INLINE B eval(const std::tuple<Args...>& tuple) noexcept
{
return eval(::xsimd::detail::make_index_sequence<sizeof...(Args)>(), tuple);
}
template <class... Args>
- inline B eval(const std::tuple<Args...>& tuple, const B& c0) noexcept
+ XSIMD_INLINE B eval(const std::tuple<Args...>& tuple, const B& c0) noexcept
{
return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0))));
}
template <class... Args>
- inline B eval(const std::tuple<Args...>& tuple, const B& c0, const B& c1) noexcept
+ XSIMD_INLINE B eval(const std::tuple<Args...>& tuple, const B& c0, const B& c1) noexcept
{
return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))));
}
template <class... Args, class... Ts>
- inline B eval(const std::tuple<Args...>& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept
+ XSIMD_INLINE B eval(const std::tuple<Args...>& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept
{
return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...);
}
template <class... Ts>
- inline B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept
+ XSIMD_INLINE B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept
{
return eval(std::make_tuple(eval(c0, c1)), coefs...);
}
@@ -588,7 +645,7 @@ namespace xsimd
}
template <class T, class A, uint64_t... Coefs>
- inline batch<T, A> estrin(const batch<T, A>& self) noexcept
+ XSIMD_INLINE batch<T, A> estrin(const batch<T, A>& self) noexcept
{
using batch_type = batch<T, A>;
return detail::estrin<batch_type> { self }(detail::coef<batch_type, Coefs>()...);
@@ -665,7 +722,7 @@ namespace xsimd
struct exp_reduction<float, A, exp_tag> : exp_reduction_base<batch<float, A>, exp_tag>
{
using batch_type = batch<float, A>;
- static inline batch_type approx(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept
{
batch_type y = detail::horner<batch_type,
0x3f000000, // 5.0000000e-01
@@ -677,7 +734,7 @@ namespace xsimd
return ++fma(y, x * x, x);
}
- static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept
+ static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept
{
batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
x = fnma(k, constants::log_2hi<batch_type>(), a);
@@ -690,7 +747,7 @@ namespace xsimd
struct exp_reduction<float, A, exp10_tag> : exp_reduction_base<batch<float, A>, exp10_tag>
{
using batch_type = batch<float, A>;
- static inline batch_type approx(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept
{
return ++(detail::horner<batch_type,
0x40135d8e, // 2.3025851e+00
@@ -703,7 +760,7 @@ namespace xsimd
* x);
}
- static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept
+ static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept
{
batch_type k = nearbyint(constants::invlog10_2<batch_type>() * a);
x = fnma(k, constants::log10_2hi<batch_type>(), a);
@@ -716,7 +773,7 @@ namespace xsimd
struct exp_reduction<float, A, exp2_tag> : exp_reduction_base<batch<float, A>, exp2_tag>
{
using batch_type = batch<float, A>;
- static inline batch_type approx(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept
{
batch_type y = detail::horner<batch_type,
0x3e75fdf1, // 2.4022652e-01
@@ -728,7 +785,7 @@ namespace xsimd
return ++fma(y, x * x, x * constants::log_2<batch_type>());
}
- static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept
+ static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept
{
batch_type k = nearbyint(a);
x = (a - k);
@@ -740,7 +797,7 @@ namespace xsimd
struct exp_reduction<double, A, exp_tag> : exp_reduction_base<batch<double, A>, exp_tag>
{
using batch_type = batch<double, A>;
- static inline batch_type approx(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept
{
batch_type t = x * x;
return fnma(t,
@@ -753,7 +810,7 @@ namespace xsimd
x);
}
- static inline batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept
+ static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept
{
batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
hi = fnma(k, constants::log_2hi<batch_type>(), a);
@@ -762,7 +819,7 @@ namespace xsimd
return k;
}
- static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) noexcept
+ static XSIMD_INLINE batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) noexcept
{
return batch_type(1.) - (((lo - (x * c) / (batch_type(2.) - c)) - hi));
}
@@ -772,7 +829,7 @@ namespace xsimd
struct exp_reduction<double, A, exp10_tag> : exp_reduction_base<batch<double, A>, exp10_tag>
{
using batch_type = batch<double, A>;
- static inline batch_type approx(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept
{
batch_type xx = x * x;
batch_type px = x * detail::horner<batch_type, 0x40a2b4798e134a01ull, 0x40796b7a050349e4ull, 0x40277d9474c55934ull, 0x3fa4fd75f3062dd4ull>(xx);
@@ -780,7 +837,7 @@ namespace xsimd
return ++(x2 + x2);
}
- static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept
+ static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept
{
batch_type k = nearbyint(constants::invlog10_2<batch_type>() * a);
x = fnma(k, constants::log10_2hi<batch_type>(), a);
@@ -788,7 +845,7 @@ namespace xsimd
return k;
}
- static inline batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) noexcept
+ static XSIMD_INLINE batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) noexcept
{
return c;
}
@@ -798,7 +855,7 @@ namespace xsimd
struct exp_reduction<double, A, exp2_tag> : exp_reduction_base<batch<double, A>, exp2_tag>
{
using batch_type = batch<double, A>;
- static inline batch_type approx(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept
{
batch_type t = x * x;
return fnma(t,
@@ -811,21 +868,21 @@ namespace xsimd
x);
}
- static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept
+ static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept
{
batch_type k = nearbyint(a);
x = (a - k) * constants::log_2<batch_type>();
return k;
}
- static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) noexcept
+ static XSIMD_INLINE batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) noexcept
{
return batch_type(1.) + x + x * c / (batch_type(2.) - c);
}
};
template <exp_reduction_tag Tag, class A>
- inline batch<float, A> exp(batch<float, A> const& self) noexcept
+ XSIMD_INLINE batch<float, A> exp(batch<float, A> const& self) noexcept
{
using batch_type = batch<float, A>;
using reducer_t = exp_reduction<float, A, Tag>;
@@ -838,7 +895,7 @@ namespace xsimd
}
template <exp_reduction_tag Tag, class A>
- inline batch<double, A> exp(batch<double, A> const& self) noexcept
+ XSIMD_INLINE batch<double, A> exp(batch<double, A> const& self) noexcept
{
using batch_type = batch<double, A>;
using reducer_t = exp_reduction<double, A, Tag>;
@@ -853,13 +910,13 @@ namespace xsimd
}
template <class A, class T>
- inline batch<T, A> exp(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> exp(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return detail::exp<detail::exp_tag>(self);
}
template <class A, class T>
- inline batch<std::complex<T>, A> exp(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> exp(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
auto isincos = sincos(self.imag());
@@ -868,14 +925,14 @@ namespace xsimd
// exp10
template <class A, class T>
- inline batch<T, A> exp10(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> exp10(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return detail::exp<detail::exp10_tag>(self);
}
// exp2
template <class A, class T>
- inline batch<T, A> exp2(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> exp2(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return detail::exp<detail::exp2_tag>(self);
}
@@ -893,7 +950,7 @@ namespace xsimd
* ====================================================
*/
template <class A>
- static inline batch<float, A> expm1(const batch<float, A>& a) noexcept
+ static XSIMD_INLINE batch<float, A> expm1(const batch<float, A>& a) noexcept
{
using batch_type = batch<float, A>;
batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
@@ -917,7 +974,7 @@ namespace xsimd
}
template <class A>
- static inline batch<double, A> expm1(const batch<double, A>& a) noexcept
+ static XSIMD_INLINE batch<double, A> expm1(const batch<double, A>& a) noexcept
{
using batch_type = batch<double, A>;
batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
@@ -948,7 +1005,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<T, A> expm1(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> expm1(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
return select(self < constants::logeps<batch_type>(),
@@ -959,7 +1016,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<std::complex<T>, A> expm1(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> expm1(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
@@ -972,7 +1029,7 @@ namespace xsimd
// polar
template <class A, class T>
- inline batch<std::complex<T>, A> polar(const batch<T, A>& r, const batch<T, A>& theta, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> polar(const batch<T, A>& r, const batch<T, A>& theta, requires_arch<generic>) noexcept
{
auto sincosTheta = sincos(theta);
return { r * sincosTheta.second, r * sincosTheta.first };
@@ -980,14 +1037,14 @@ namespace xsimd
// fdim
template <class A, class T>
- inline batch<T, A> fdim(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> fdim(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return fmax(batch<T, A>(0), self - other);
}
// fmod
template <class A, class T>
- inline batch<T, A> fmod(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> fmod(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return fnma(trunc(self / other), other, self);
}
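Editor's note: both helpers above are one-line identities: fdim clamps the difference at zero, and fmod subtracts the truncated quotient times the divisor. A scalar sketch (illustrative names):

    #include <cmath>

    inline double fdim_sketch(double x, double y)
    {
        return std::fmax(0.0, x - y);
    }

    inline double fmod_sketch(double x, double y)
    {
        return x - std::trunc(x / y) * y;
    }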
@@ -1003,7 +1060,7 @@ namespace xsimd
* ====================================================
*/
template <class A, class T>
- inline batch<T, A> frexp(const batch<T, A>& self, batch<as_integer_t<T>, A>& exp, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> frexp(const batch<T, A>& self, batch<as_integer_t<T>, A>& exp, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
using int_type = as_integer_t<T>;
@@ -1018,28 +1075,28 @@ namespace xsimd
// from bool
template <class A, class T>
- inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
{
return batch<T, A>(self.data) & batch<T, A>(1);
}
// horner
template <class T, class A, uint64_t... Coefs>
- inline batch<T, A> horner(const batch<T, A>& self) noexcept
+ XSIMD_INLINE batch<T, A> horner(const batch<T, A>& self) noexcept
{
return detail::horner<batch<T, A>, Coefs...>(self);
}
// hypot
template <class A, class T>
- inline batch<T, A> hypot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> hypot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return sqrt(fma(self, self, other * other));
}
// ipow
template <class A, class T, class ITy>
- inline batch<T, A> ipow(batch<T, A> const& self, ITy other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> ipow(batch<T, A> const& self, ITy other, requires_arch<generic>) noexcept
{
return ::xsimd::detail::ipow(self, other);
}
@@ -1055,7 +1112,7 @@ namespace xsimd
* ====================================================
*/
template <class A, class T>
- inline batch<T, A> ldexp(const batch<T, A>& self, const batch<as_integer_t<T>, A>& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& self, const batch<as_integer_t<T>, A>& other, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
using itype = as_integer_t<batch_type>;
@@ -1066,7 +1123,7 @@ namespace xsimd
// lgamma
template <class A, class T>
- inline batch<T, A> lgamma(batch<T, A> const& self, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> lgamma(batch<T, A> const& self, requires_arch<generic>) noexcept;
namespace detail
{
@@ -1080,7 +1137,7 @@ namespace xsimd
* ====================================================
*/
template <class A>
- static inline batch<float, A> gammalnB(const batch<float, A>& x) noexcept
+ static XSIMD_INLINE batch<float, A> gammalnB(const batch<float, A>& x) noexcept
{
return horner<batch<float, A>,
0x3ed87730, // 4.227843421859038E-001
@@ -1095,7 +1152,7 @@ namespace xsimd
}
template <class A>
- static inline batch<float, A> gammalnC(const batch<float, A>& x) noexcept
+ static XSIMD_INLINE batch<float, A> gammalnC(const batch<float, A>& x) noexcept
{
return horner<batch<float, A>,
0xbf13c468, // -5.772156501719101E-001
@@ -1110,7 +1167,7 @@ namespace xsimd
}
template <class A>
- static inline batch<float, A> gammaln2(const batch<float, A>& x) noexcept
+ static XSIMD_INLINE batch<float, A> gammaln2(const batch<float, A>& x) noexcept
{
return horner<batch<float, A>,
0x3daaaa94, // 8.333316229807355E-002f
@@ -1120,7 +1177,7 @@ namespace xsimd
}
template <class A>
- static inline batch<double, A> gammaln1(const batch<double, A>& x) noexcept
+ static XSIMD_INLINE batch<double, A> gammaln1(const batch<double, A>& x) noexcept
{
return horner<batch<double, A>,
0xc12a0c675418055eull, // -8.53555664245765465627E5
@@ -1142,7 +1199,7 @@ namespace xsimd
}
template <class A>
- static inline batch<double, A> gammalnA(const batch<double, A>& x) noexcept
+ static XSIMD_INLINE batch<double, A> gammalnA(const batch<double, A>& x) noexcept
{
return horner<batch<double, A>,
0x3fb555555555554bull, // 8.33333333333331927722E-2
@@ -1169,7 +1226,7 @@ namespace xsimd
struct lgamma_impl<batch<float, A>>
{
using batch_type = batch<float, A>;
- static inline batch_type compute(const batch_type& a) noexcept
+ static XSIMD_INLINE batch_type compute(const batch_type& a) noexcept
{
auto inf_result = (a <= batch_type(0.)) && is_flint(a);
batch_type x = select(inf_result, constants::nan<batch_type>(), a);
@@ -1191,7 +1248,7 @@ namespace xsimd
}
private:
- static inline batch_type negative(const batch_type& q, const batch_type& w) noexcept
+ static XSIMD_INLINE batch_type negative(const batch_type& q, const batch_type& w) noexcept
{
batch_type p = floor(q);
batch_type z = q - p;
@@ -1201,7 +1258,7 @@ namespace xsimd
return -log(constants::invpi<batch_type>() * abs(z)) - w;
}
- static inline batch_type other(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type other(const batch_type& x) noexcept
{
auto xlt650 = (x < batch_type(6.5));
batch_type r0x = x;
@@ -1290,7 +1347,7 @@ namespace xsimd
{
using batch_type = batch<double, A>;
- static inline batch_type compute(const batch_type& a) noexcept
+ static XSIMD_INLINE batch_type compute(const batch_type& a) noexcept
{
auto inf_result = (a <= batch_type(0.)) && is_flint(a);
batch_type x = select(inf_result, constants::nan<batch_type>(), a);
@@ -1312,6 +1369,8 @@ namespace xsimd
}
private:
+ // FIXME: cannot mark this one as XSIMD_INLINE because there's a
+ // recursive loop on `lgamma'.
static inline batch_type large_negative(const batch_type& q) noexcept
{
batch_type w = lgamma(q);
@@ -1324,7 +1383,7 @@ namespace xsimd
return constants::logpi<batch_type>() - log(z) - w;
}
- static inline batch_type other(const batch_type& xx) noexcept
+ static XSIMD_INLINE batch_type other(const batch_type& xx) noexcept
{
batch_type x = xx;
auto test = (x < batch_type(13.));
@@ -1367,7 +1426,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<T, A> lgamma(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> lgamma(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return detail::lgamma_impl<batch<T, A>>::compute(self);
}
@@ -1383,7 +1442,7 @@ namespace xsimd
* ====================================================
*/
template <class A>
- inline batch<float, A> log(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> log(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
using int_type = as_integer_t<float>;
@@ -1423,7 +1482,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> log(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> log(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
using int_type = as_integer_t<double>;
@@ -1466,14 +1525,14 @@ namespace xsimd
}
template <class A, class T>
- inline batch<std::complex<T>, A> log(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> log(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
return batch<std::complex<T>, A>(log(abs(z)), atan2(z.imag(), z.real()));
}
// log2
template <class A>
- inline batch<float, A> log2(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> log2(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
using int_type = as_integer_t<float>;
@@ -1513,7 +1572,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> log2(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> log2(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
using int_type = as_integer_t<double>;
@@ -1563,7 +1622,7 @@ namespace xsimd
namespace detail
{
template <class T, class A>
- inline batch<T, A> logN_complex_impl(const batch<T, A>& z, typename batch<T, A>::value_type base) noexcept
+ XSIMD_INLINE batch<T, A> logN_complex_impl(const batch<T, A>& z, typename batch<T, A>::value_type base) noexcept
{
using batch_type = batch<T, A>;
using rv_type = typename batch_type::value_type;
@@ -1572,7 +1631,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<std::complex<T>, A> log2(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> log2(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return detail::logN_complex_impl(self, std::log(2));
}
@@ -1590,7 +1649,7 @@ namespace xsimd
* ====================================================
*/
template <class A>
- inline batch<float, A> log10(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> log10(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
const batch_type
@@ -1641,7 +1700,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> log10(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> log10(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
const batch_type
@@ -1695,7 +1754,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<std::complex<T>, A> log10(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> log10(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
return detail::logN_complex_impl(z, std::log(10));
}
@@ -1711,7 +1770,7 @@ namespace xsimd
* ====================================================
*/
template <class A>
- inline batch<float, A> log1p(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> log1p(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
using int_type = as_integer_t<float>;
@@ -1743,7 +1802,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> log1p(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> log1p(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
using int_type = as_integer_t<double>;
@@ -1776,7 +1835,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<std::complex<T>, A> log1p(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> log1p(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
@@ -1791,7 +1850,7 @@ namespace xsimd
// mod
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> mod(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> mod(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept -> T
{ return x % y; },
@@ -1800,14 +1859,14 @@ namespace xsimd
// nearbyint
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> nearbyint(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return self;
}
namespace detail
{
template <class A, class T>
- inline batch<T, A> nearbyintf(batch<T, A> const& self) noexcept
+ XSIMD_INLINE batch<T, A> nearbyintf(batch<T, A> const& self) noexcept
{
using batch_type = batch<T, A>;
batch_type s = bitofsign(self);
@@ -1827,26 +1886,26 @@ namespace xsimd
}
}
template <class A>
- inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<generic>) noexcept
{
return detail::nearbyintf(self);
}
template <class A>
- inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<generic>) noexcept
{
return detail::nearbyintf(self);
}
// nearbyint_as_int
template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> nearbyint_as_int(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> nearbyint_as_int(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return self;
}
// nearbyint_as_int
template <class A>
- inline batch<as_integer_t<float>, A>
+ XSIMD_INLINE batch<as_integer_t<float>, A>
nearbyint_as_int(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using U = as_integer_t<float>;
@@ -1856,7 +1915,7 @@ namespace xsimd
}
template <class A>
- inline batch<as_integer_t<double>, A>
+ XSIMD_INLINE batch<as_integer_t<double>, A>
nearbyint_as_int(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using U = as_integer_t<double>;
@@ -1873,12 +1932,12 @@ namespace xsimd
{
using batch_type = batch<T, A>;
- static inline batch_type next(batch_type const& b) noexcept
+ static XSIMD_INLINE batch_type next(batch_type const& b) noexcept
{
return b;
}
- static inline batch_type prev(batch_type const& b) noexcept
+ static XSIMD_INLINE batch_type prev(batch_type const& b) noexcept
{
return b;
}
@@ -1906,13 +1965,13 @@ namespace xsimd
using int_batch = typename bitwise_cast_batch<T, A>::type;
using int_type = typename int_batch::value_type;
- static inline batch_type next(const batch_type& b) noexcept
+ static XSIMD_INLINE batch_type next(const batch_type& b) noexcept
{
batch_type n = ::xsimd::bitwise_cast<T>(::xsimd::bitwise_cast<int_type>(b) + int_type(1));
return select(b == constants::infinity<batch_type>(), b, n);
}
- static inline batch_type prev(const batch_type& b) noexcept
+ static XSIMD_INLINE batch_type prev(const batch_type& b) noexcept
{
batch_type p = ::xsimd::bitwise_cast<T>(::xsimd::bitwise_cast<int_type>(b) - int_type(1));
return select(b == constants::minusinfinity<batch_type>(), b, p);
@@ -1920,7 +1979,7 @@ namespace xsimd
};
}
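Editor's note: for floating-point types the nextafter kernel above steps to the adjacent representable value by incrementing or decrementing the integer bit pattern, then uses select so that infinities are never stepped past. A scalar sketch of the bit-increment step, restricted to positive finite floats (illustrative only):

    #include <cstdint>
    #include <cstring>

    inline float next_up_sketch(float x)
    {
        std::uint32_t bits;
        std::memcpy(&bits, &x, sizeof bits);
        ++bits; // adjacent representable value for a positive finite float
        std::memcpy(&x, &bits, sizeof x);
        return x;
    }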
template <class A, class T>
- inline batch<T, A> nextafter(batch<T, A> const& from, batch<T, A> const& to, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> nextafter(batch<T, A> const& from, batch<T, A> const& to, requires_arch<generic>) noexcept
{
using kernel = detail::nextafter_kernel<T, A>;
return select(from == to, from,
@@ -1938,23 +1997,22 @@ namespace xsimd
* ====================================================
*/
template <class A, class T>
- inline batch<T, A> pow(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> pow(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
const auto zero = batch_type(0.);
- auto negx = self < zero;
- auto iszero = self == zero;
- constexpr T e = static_cast<T>(2.718281828459045);
- auto adj_self = select(iszero, batch_type(e), abs(self));
+ auto negself = self < zero;
+ auto iszeropowpos = self == zero && other >= zero;
+ auto adj_self = select(iszeropowpos, batch_type(1), abs(self));
batch_type z = exp(other * log(adj_self));
- z = select(iszero, zero, z);
- z = select(is_odd(other) && negx, -z, z);
- auto invalid = negx && !(is_flint(other) || isinf(other));
+ z = select(iszeropowpos, zero, z);
+ z = select(is_odd(other) && negself, -z, z);
+ auto invalid = negself && !(is_flint(other) || isinf(other));
return select(invalid, constants::nan<batch_type>(), z);
}
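Editor's note: the rewritten pow changes the special-case handling rather than the general path: a zero base with a non-negative exponent now yields zero through the iszeropowpos mask, odd integer exponents restore the sign of a negative base, and a negative base with a finite non-integer exponent produces NaN. A scalar sketch of those selects (illustrative, not the vector code):

    #include <cmath>
    #include <limits>

    inline double pow_sketch(double x, double y)
    {
        if (x == 0.0 && y >= 0.0)
            return 0.0; // mirrors the iszeropowpos select above
        bool y_is_int = (y == std::trunc(y));
        if (x < 0.0 && !y_is_int && !std::isinf(y))
            return std::numeric_limits<double>::quiet_NaN();
        double z = std::exp(y * std::log(std::fabs(x)));
        bool y_is_odd = y_is_int && std::fmod(y, 2.0) != 0.0;
        return (x < 0.0 && y_is_odd) ? -z : z;
    }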
template <class A, class T>
- inline batch<std::complex<T>, A> pow(const batch<std::complex<T>, A>& a, const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> pow(const batch<std::complex<T>, A>& a, const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using cplx_batch = batch<std::complex<T>, A>;
using real_batch = typename cplx_batch::real_batch;
@@ -1973,8 +2031,8 @@ namespace xsimd
// reciprocal
template <class T, class A, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
- inline batch<T, A> reciprocal(batch<T, A> const& self,
- requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> reciprocal(batch<T, A> const& self,
+ requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
return div(batch_type(1), self);
@@ -1982,7 +2040,7 @@ namespace xsimd
// reduce_add
template <class A, class T>
- inline std::complex<T> reduce_add(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE std::complex<T> reduce_add(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return { reduce_add(self.real()), reduce_add(self.imag()) };
}
@@ -1999,23 +2057,23 @@ namespace xsimd
};
template <class Op, class A, class T>
- inline T reduce(Op, batch<T, A> const& self, std::integral_constant<unsigned, 1>) noexcept
+ XSIMD_INLINE T reduce(Op, batch<T, A> const& self, std::integral_constant<unsigned, 1>) noexcept
{
return self.get(0);
}
template <class Op, class A, class T, unsigned Lvl>
- inline T reduce(Op op, batch<T, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
+ XSIMD_INLINE T reduce(Op op, batch<T, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
{
using index_type = as_unsigned_integer_t<T>;
- batch<T, A> split = swizzle(self, make_batch_constant<batch<index_type, A>, split_high<index_type, Lvl / 2>>());
+ batch<T, A> split = swizzle(self, make_batch_constant<index_type, A, split_high<index_type, Lvl / 2>>());
return reduce(op, op(split, self), std::integral_constant<unsigned, Lvl / 2>());
}
}
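Editor's note: the generic reduce helper works by repeatedly folding the upper half of the batch onto the lower half (split_high swizzle, then op), halving the active width until a single lane is left. A scalar sketch over a plain array (illustrative names):

    #include <cstddef>

    // width is assumed to be a power of two, like a batch size.
    template <class T, class Op>
    T reduce_sketch(T* lanes, std::size_t width, Op op)
    {
        for (std::size_t half = width / 2; half >= 1; half /= 2)
            for (std::size_t i = 0; i < half; ++i)
                lanes[i] = op(lanes[i], lanes[i + half]);
        return lanes[0];
    }

    // e.g. reduce_sketch(buf, 8, [](float a, float b) { return a < b ? b : a; })
    // mirrors reduce_max.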
// reduce_max
template <class A, class T>
- inline T reduce_max(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return detail::reduce([](batch<T, A> const& x, batch<T, A> const& y)
{ return max(x, y); },
@@ -2024,7 +2082,7 @@ namespace xsimd
// reduce_min
template <class A, class T>
- inline T reduce_min(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return detail::reduce([](batch<T, A> const& x, batch<T, A> const& y)
{ return min(x, y); },
@@ -2033,17 +2091,17 @@ namespace xsimd
// remainder
template <class A>
- inline batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
{
return fnma(nearbyint(self / other), other, self);
}
template <class A>
- inline batch<double, A> remainder(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> remainder(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
{
return fnma(nearbyint(self / other), other, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> remainder(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> remainder(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
auto mod = self % other;
return select(mod <= other / 2, mod, mod - other);
@@ -2051,14 +2109,14 @@ namespace xsimd
// select
template <class A, class T>
- inline batch<std::complex<T>, A> select(batch_bool<T, A> const& cond, batch<std::complex<T>, A> const& true_br, batch<std::complex<T>, A> const& false_br, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> select(batch_bool<T, A> const& cond, batch<std::complex<T>, A> const& true_br, batch<std::complex<T>, A> const& false_br, requires_arch<generic>) noexcept
{
return { select(cond, true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag()) };
}
// sign
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sign(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> sign(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type res = select(self > batch_type(0), batch_type(1), batch_type(0)) - select(self < batch_type(0), batch_type(1), batch_type(0));
@@ -2068,7 +2126,7 @@ namespace xsimd
namespace detail
{
template <class T, class A>
- inline batch<T, A> signf(batch<T, A> const& self) noexcept
+ XSIMD_INLINE batch<T, A> signf(batch<T, A> const& self) noexcept
{
using batch_type = batch<T, A>;
batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f));
@@ -2081,17 +2139,17 @@ namespace xsimd
}
template <class A>
- inline batch<float, A> sign(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> sign(batch<float, A> const& self, requires_arch<generic>) noexcept
{
return detail::signf(self);
}
template <class A>
- inline batch<double, A> sign(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> sign(batch<double, A> const& self, requires_arch<generic>) noexcept
{
return detail::signf(self);
}
template <class A, class T>
- inline batch<std::complex<T>, A> sign(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> sign(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
@@ -2104,7 +2162,7 @@ namespace xsimd
// signnz
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> signnz(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> signnz(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.);
@@ -2113,7 +2171,7 @@ namespace xsimd
namespace detail
{
template <class T, class A>
- inline batch<T, A> signnzf(batch<T, A> const& self) noexcept
+ XSIMD_INLINE batch<T, A> signnzf(batch<T, A> const& self) noexcept
{
using batch_type = batch<T, A>;
#ifndef XSIMD_NO_NANS
@@ -2125,19 +2183,19 @@ namespace xsimd
}
template <class A>
- inline batch<float, A> signnz(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> signnz(batch<float, A> const& self, requires_arch<generic>) noexcept
{
return detail::signnzf(self);
}
template <class A>
- inline batch<double, A> signnz(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> signnz(batch<double, A> const& self, requires_arch<generic>) noexcept
{
return detail::signnzf(self);
}
// sqrt
template <class A, class T>
- inline batch<std::complex<T>, A> sqrt(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> sqrt(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
constexpr T csqrt_scale_factor = std::is_same<T, float>::value ? 6.7108864e7f : 1.8014398509481984e16;
@@ -2192,7 +2250,7 @@ namespace xsimd
struct stirling_kernel<batch<float, A>>
{
using batch_type = batch<float, A>;
- static inline batch_type compute(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept
{
return horner<batch_type,
0x3daaaaab,
@@ -2201,12 +2259,12 @@ namespace xsimd
0xb970b359>(x);
}
- static inline batch_type split_limit() noexcept
+ static XSIMD_INLINE batch_type split_limit() noexcept
{
return batch_type(bit_cast<float>(uint32_t(0x41d628f6)));
}
- static inline batch_type large_limit() noexcept
+ static XSIMD_INLINE batch_type large_limit() noexcept
{
return batch_type(bit_cast<float>(uint32_t(0x420c28f3)));
}
@@ -2216,7 +2274,7 @@ namespace xsimd
struct stirling_kernel<batch<double, A>>
{
using batch_type = batch<double, A>;
- static inline batch_type compute(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept
{
return horner<batch_type,
0x3fb5555555555986ull, // 8.33333333333482257126E-2
@@ -2227,12 +2285,12 @@ namespace xsimd
>(x);
}
- static inline batch_type split_limit() noexcept
+ static XSIMD_INLINE batch_type split_limit() noexcept
{
return batch_type(bit_cast<double>(uint64_t(0x4061e083ba3443d4)));
}
- static inline batch_type large_limit() noexcept
+ static XSIMD_INLINE batch_type large_limit() noexcept
{
return batch_type(bit_cast<double>(uint64_t(0x4065800000000000)));
}
@@ -2248,7 +2306,7 @@ namespace xsimd
* ====================================================
*/
template <class T, class A>
- inline batch<T, A> stirling(const batch<T, A>& a) noexcept
+ XSIMD_INLINE batch<T, A> stirling(const batch<T, A>& a) noexcept
{
using batch_type = batch<T, A>;
const batch_type stirlingsplitlim = stirling_kernel<batch_type>::split_limit();
@@ -2286,7 +2344,7 @@ namespace xsimd
struct tgamma_kernel<batch<float, A>>
{
using batch_type = batch<float, A>;
- static inline batch_type compute(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept
{
return horner<batch_type,
0x3f800000UL, // 9.999999757445841E-01
@@ -2305,7 +2363,7 @@ namespace xsimd
struct tgamma_kernel<batch<double, A>>
{
using batch_type = batch<double, A>;
- static inline batch_type compute(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept
{
return horner<batch_type,
0x3ff0000000000000ULL, // 9.99999999999999996796E-1
@@ -2339,7 +2397,7 @@ namespace xsimd
* ====================================================
*/
template <class B>
- inline B tgamma_large_negative(const B& a) noexcept
+ XSIMD_INLINE B tgamma_large_negative(const B& a) noexcept
{
B st = stirling(a);
B p = floor(a);
@@ -2353,7 +2411,7 @@ namespace xsimd
}
template <class B, class BB>
- inline B tgamma_other(const B& a, const BB& test) noexcept
+ XSIMD_INLINE B tgamma_other(const B& a, const BB& test) noexcept
{
B x = select(test, B(2.), a);
#ifndef XSIMD_NO_INFINITIES
@@ -2392,7 +2450,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<T, A> tgamma(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> tgamma(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
auto nan_result = (self < batch_type(0.) && is_flint(self));
diff --git a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_memory.hpp b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_memory.hpp
index e9e9065832a..4651ecdbb73 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_memory.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -21,10 +21,10 @@
namespace xsimd
{
- template <class batch_type, typename batch_type::value_type... Values>
+ template <typename T, class A, T... Values>
struct batch_constant;
- template <class batch_type, bool... Values>
+ template <typename T, class A, bool... Values>
struct batch_bool_constant;
namespace kernel
@@ -36,7 +36,7 @@ namespace xsimd
namespace detail
{
template <class IT, class A, class I, size_t... Is>
- inline batch<IT, A> create_compress_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
+ XSIMD_INLINE batch<IT, A> create_compress_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
{
batch<IT, A> swizzle_mask(IT(0));
alignas(A::alignment()) IT mask_buffer[batch<IT, A>::size] = { Is... };
@@ -49,7 +49,7 @@ namespace xsimd
}
template <typename A, typename T>
- inline batch<T, A>
+ XSIMD_INLINE batch<T, A>
compress(batch<T, A> const& x, batch_bool<T, A> const& mask,
kernel::requires_arch<generic>) noexcept
{
@@ -65,7 +65,7 @@ namespace xsimd
namespace detail
{
template <class IT, class A, class I, size_t... Is>
- inline batch<IT, A> create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
+ XSIMD_INLINE batch<IT, A> create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
{
batch<IT, A> swizzle_mask(IT(0));
IT j = 0;
@@ -75,7 +75,7 @@ namespace xsimd
}
template <typename A, typename T>
- inline batch<T, A>
+ XSIMD_INLINE batch<T, A>
expand(batch<T, A> const& x, batch_bool<T, A> const& mask,
kernel::requires_arch<generic>) noexcept
{
@@ -88,7 +88,7 @@ namespace xsimd
// extract_pair
template <class A, class T>
- inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
assert(i < size && "index in bounds");
@@ -115,6 +115,7 @@ namespace xsimd
// gather
namespace detail
{
+ // Not using XSIMD_INLINE here as it makes msvc hang forever on avx512
template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
inline batch<T, A> gather(U const* src, batch<V, A> const& index,
::xsimd::index<N> I) noexcept
@@ -134,7 +135,7 @@ namespace xsimd
} // namespace detail
template <typename T, typename A, typename V>
- inline batch<T, A>
+ XSIMD_INLINE batch<T, A>
gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
{
@@ -146,7 +147,7 @@ namespace xsimd
// Gather with runtime indexes and mismatched strides.
template <typename T, typename A, typename U, typename V>
- inline detail::sizes_mismatch_t<T, U, batch<T, A>>
+ XSIMD_INLINE detail::sizes_mismatch_t<T, U, batch<T, A>>
gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
{
@@ -158,7 +159,7 @@ namespace xsimd
// Gather with runtime indexes and matching strides.
template <typename T, typename A, typename U, typename V>
- inline detail::stride_match_t<T, U, batch<T, A>>
+ XSIMD_INLINE detail::stride_match_t<T, U, batch<T, A>>
gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
{
@@ -170,7 +171,7 @@ namespace xsimd
// insert
template <class A, class T, size_t I>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
{
struct index_mask
{
@@ -180,12 +181,12 @@ namespace xsimd
}
};
batch<T, A> tmp(val);
- return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
+ return select(make_batch_bool_constant<T, A, index_mask>(), self, tmp);
}
// get
template <class A, size_t I, class T>
- inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
+ XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
{
alignas(A::alignment()) T buffer[batch<T, A>::size];
self.store_aligned(&buffer[0]);
@@ -193,7 +194,7 @@ namespace xsimd
}
template <class A, size_t I, class T>
- inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
+ XSIMD_INLINE T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
{
alignas(A::alignment()) T buffer[batch_bool<T, A>::size];
self.store_aligned(&buffer[0]);
@@ -201,7 +202,7 @@ namespace xsimd
}
template <class A, size_t I, class T>
- inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
+ XSIMD_INLINE auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
{
alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
self.store_aligned(&buffer[0]);
@@ -209,7 +210,7 @@ namespace xsimd
}
template <class A, class T>
- inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
+ XSIMD_INLINE T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
{
alignas(A::alignment()) T buffer[batch<T, A>::size];
self.store_aligned(&buffer[0]);
@@ -217,7 +218,7 @@ namespace xsimd
}
template <class A, class T>
- inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
+ XSIMD_INLINE T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
{
alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
self.store_aligned(&buffer[0]);
@@ -225,7 +226,7 @@ namespace xsimd
}
template <class A, class T>
- inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
+ XSIMD_INLINE auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
{
using T2 = typename batch<std::complex<T>, A>::value_type;
alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
@@ -237,14 +238,14 @@ namespace xsimd
namespace detail
{
template <class A, class T_in, class T_out>
- inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
+ XSIMD_INLINE batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
{
using batch_type_in = batch<T_in, A>;
using batch_type_out = batch<T_out, A>;
return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
}
template <class A, class T_in, class T_out>
- inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
+ XSIMD_INLINE batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
{
static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
using batch_type_out = batch<T_out, A>;
@@ -254,7 +255,7 @@ namespace xsimd
}
}
template <class A, class T_in, class T_out>
- inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
{
return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
}
@@ -263,7 +264,7 @@ namespace xsimd
namespace detail
{
template <class A, class T_in, class T_out>
- inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
+ XSIMD_INLINE batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
{
using batch_type_in = batch<T_in, A>;
using batch_type_out = batch<T_out, A>;
@@ -271,21 +272,21 @@ namespace xsimd
}
template <class A, class T_in, class T_out>
- inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
+ XSIMD_INLINE batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
{
static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
}
}
template <class A, class T_in, class T_out>
- inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
{
return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
}
// rotate_left
template <size_t N, class A, class T>
- inline batch<T, A> rotate_left(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> rotate_left(batch<T, A> const& self, requires_arch<generic>) noexcept
{
struct rotate_generator
{
@@ -295,18 +296,18 @@ namespace xsimd
}
};
- return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+ return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
}
template <size_t N, class A, class T>
- inline batch<std::complex<T>, A> rotate_left(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> rotate_left(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return { rotate_left<N>(self.real()), rotate_left<N>(self.imag()) };
}
// rotate_right
template <size_t N, class A, class T>
- inline batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<generic>) noexcept
{
struct rotate_generator
{
@@ -316,11 +317,11 @@ namespace xsimd
}
};
- return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+ return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
}
template <size_t N, class A, class T>
- inline batch<std::complex<T>, A> rotate_right(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> rotate_right(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return { rotate_right<N>(self.real()), rotate_right<N>(self.imag()) };
}
@@ -329,15 +330,15 @@ namespace xsimd
namespace detail
{
template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
- inline void scatter(batch<T, A> const& src, U* dst,
- batch<V, A> const& index,
- ::xsimd::index<N> I) noexcept
+ XSIMD_INLINE void scatter(batch<T, A> const& src, U* dst,
+ batch<V, A> const& index,
+ ::xsimd::index<N> I) noexcept
{
dst[index.get(I)] = static_cast<U>(src.get(I));
}
template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
- inline void
+ XSIMD_INLINE void
scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
::xsimd::index<N> I) noexcept
{
@@ -350,7 +351,7 @@ namespace xsimd
} // namespace detail
template <typename A, typename T, typename V>
- inline void
+ XSIMD_INLINE void
scatter(batch<T, A> const& src, T* dst,
batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
@@ -362,7 +363,7 @@ namespace xsimd
}
template <typename A, typename T, typename U, typename V>
- inline detail::sizes_mismatch_t<T, U, void>
+ XSIMD_INLINE detail::sizes_mismatch_t<T, U, void>
scatter(batch<T, A> const& src, U* dst,
batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
@@ -374,7 +375,7 @@ namespace xsimd
}
template <typename A, typename T, typename U, typename V>
- inline detail::stride_match_t<T, U, void>
+ XSIMD_INLINE detail::stride_match_t<T, U, void>
scatter(batch<T, A> const& src, U* dst,
batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
@@ -412,6 +413,12 @@ namespace xsimd
return true;
}
+ template <typename ITy>
+ constexpr bool is_zip_lo(size_t, ITy)
+ {
+ return false;
+ }
+
template <typename ITy0, typename ITy1, typename... ITys>
constexpr bool is_zip_lo(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices)
{
@@ -423,6 +430,12 @@ namespace xsimd
return true;
}
+ template <typename ITy>
+ constexpr bool is_zip_hi(size_t, ITy)
+ {
+ return false;
+ }
+
template <typename ITy0, typename ITy1, typename... ITys>
constexpr bool is_zip_hi(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices)
{
@@ -443,19 +456,19 @@ namespace xsimd
}
template <class A, typename T, typename ITy, ITy... Indices>
- inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept
{
constexpr size_t bsize = sizeof...(Indices);
// Detect common patterns
XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...))
{
- return swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
+ return swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
}
XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...))
{
- return swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
+ return swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
}
XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...))
@@ -470,7 +483,7 @@ namespace xsimd
XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...))
{
- return select(batch_bool_constant<batch<T, A>, (Indices < bsize)...>(), x, y);
+ return select(batch_bool_constant<T, A, (Indices < bsize)...>(), x, y);
}
#if defined(__has_builtin)
@@ -491,16 +504,16 @@ namespace xsimd
#else
// Use a generic_pattern. It is suboptimal but clang optimizes this
// pretty well.
- batch<T, A> x_lane = swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
- batch<T, A> y_lane = swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
- batch_bool_constant<batch<T, A>, (Indices < bsize)...> select_x_lane;
+ batch<T, A> x_lane = swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+ batch<T, A> y_lane = swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+ batch_bool_constant<T, A, (Indices < bsize)...> select_x_lane;
return select(select_x_lane, x_lane, y_lane);
#endif
}
// store
template <class T, class A>
- inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
+ XSIMD_INLINE void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
constexpr auto size = batch_bool<T, A>::size;
@@ -512,7 +525,7 @@ namespace xsimd
// store_aligned
template <class A, class T_in, class T_out>
- inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
{
static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
@@ -522,7 +535,7 @@ namespace xsimd
// store_unaligned
template <class A, class T_in, class T_out>
- inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
{
static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
return store_aligned<A>(mem, self, generic {});
@@ -530,13 +543,13 @@ namespace xsimd
// swizzle
template <class A, class T, class ITy, ITy... Vs>
- inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<generic>) noexcept
{
return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
}
template <class A, class T, class ITy>
- inline batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<generic>) noexcept
{
constexpr size_t size = batch<T, A>::size;
alignas(A::alignment()) T self_buffer[size];
@@ -552,7 +565,7 @@ namespace xsimd
}
template <class A, class T, class ITy>
- inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch<ITy, A> mask, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch<ITy, A> mask, requires_arch<generic>) noexcept
{
return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
}
@@ -561,26 +574,26 @@ namespace xsimd
namespace detail
{
template <class A, class T>
- inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
{
static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
}
template <class A, class T>
- inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
{
static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
}
template <class A, class T>
- inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
{
static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
}
}
template <class A, class T_out, class T_in>
- inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
{
using real_batch = batch<T_out, A>;
T_in const* buffer = reinterpret_cast<T_in const*>(mem);
@@ -591,7 +604,7 @@ namespace xsimd
// load_complex_unaligned
template <class A, class T_out, class T_in>
- inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
{
using real_batch = batch<T_out, A>;
T_in const* buffer = reinterpret_cast<T_in const*>(mem);
@@ -602,7 +615,7 @@ namespace xsimd
// store_complex_aligned
template <class A, class T_out, class T_in>
- inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
+ XSIMD_INLINE void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
{
using real_batch = batch<T_in, A>;
real_batch hi = detail::complex_high(src, A {});
@@ -614,7 +627,7 @@ namespace xsimd
// store_complex_unaligned
template <class A, class T_out, class T_in>
- inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
+ XSIMD_INLINE void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
{
using real_batch = batch<T_in, A>;
real_batch hi = detail::complex_high(src, A {});
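
Besides the inline swap, the memory header above also changes how compile-time constants are spelled: batch_constant and batch_bool_constant now take the scalar type and the architecture as separate template parameters rather than a full batch type, and make_batch_constant / make_batch_bool_constant follow suit (see the insert, rotate and shuffle hunks). A hedged before/after sketch of a call site, with my_generator standing in for any user-supplied generator struct (the name, its get signature and the avx2 architecture tag are illustrative assumptions, not taken from this diff):

#include <cstddef>
#include <cstdint>
#include <xsimd/xsimd.hpp>

struct my_generator
{
    // assumed generator shape: value for lane `index` out of `size` lanes
    static constexpr uint32_t get(size_t index, size_t /*size*/) { return static_cast<uint32_t>(index); }
};

// old spelling (before this change): the batch type carried both value type and architecture
//   auto mask = xsimd::make_batch_constant<xsimd::batch<uint32_t, xsimd::avx2>, my_generator>();
// new spelling (after this change): value type and architecture are separate parameters
auto make_identity_mask()
{
    return xsimd::make_batch_constant<uint32_t, xsimd::avx2, my_generator>();
}

Either spelling yields a batch_constant that can be fed to swizzle or select, which is exactly how the rotate_left and shuffle hunks above use it.
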
diff --git a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_rounding.hpp b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_rounding.hpp
index b6a79a4515b..daf7b58ea71 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_rounding.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_rounding.hpp
@@ -24,7 +24,7 @@ namespace xsimd
// ceil
template <class A, class T>
- inline batch<T, A> ceil(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> ceil(batch<T, A> const& self, requires_arch<generic>) noexcept
{
batch<T, A> truncated_self = trunc(self);
return select(truncated_self < self, truncated_self + 1, truncated_self);
@@ -32,7 +32,7 @@ namespace xsimd
// floor
template <class A, class T>
- inline batch<T, A> floor(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> floor(batch<T, A> const& self, requires_arch<generic>) noexcept
{
batch<T, A> truncated_self = trunc(self);
return select(truncated_self > self, truncated_self - 1, truncated_self);
@@ -40,7 +40,7 @@ namespace xsimd
// round
template <class A, class T>
- inline batch<T, A> round(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> round(batch<T, A> const& self, requires_arch<generic>) noexcept
{
auto v = abs(self);
auto c = ceil(v);
@@ -50,17 +50,17 @@ namespace xsimd
// trunc
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> trunc(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> trunc(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return self;
}
template <class A>
- inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<generic>) noexcept
{
return select(abs(self) < constants::maxflint<batch<float, A>>(), to_float(to_int(self)), self);
}
template <class A>
- inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<generic>) noexcept
{
return select(abs(self) < constants::maxflint<batch<double, A>>(), to_float(to_int(self)), self);
}
diff --git a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_trigo.hpp b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_trigo.hpp
index 2568a7253fe..b1bb68f25e9 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_trigo.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/generic/xsimd_generic_trigo.hpp
@@ -35,7 +35,7 @@ namespace xsimd
// acos
template <class A, class T>
- inline batch<T, A> acos(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> acos(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type x = abs(self);
@@ -47,7 +47,7 @@ namespace xsimd
return select(x_larger_05, x, constants::pio2<batch_type>() - x);
}
template <class A, class T>
- inline batch<std::complex<T>, A> acos(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> acos(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
@@ -66,7 +66,7 @@ namespace xsimd
* ====================================================
*/
template <class A, class T>
- inline batch<T, A> acosh(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> acosh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type x = self - batch_type(1.);
@@ -76,7 +76,7 @@ namespace xsimd
return select(test, l1pz + constants::log_2<batch_type>(), l1pz);
}
template <class A, class T>
- inline batch<std::complex<T>, A> acosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> acosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
batch_type w = acos(z);
@@ -86,7 +86,7 @@ namespace xsimd
// asin
template <class A>
- inline batch<float, A> asin(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> asin(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
batch_type x = abs(self);
@@ -105,7 +105,7 @@ namespace xsimd
return z ^ sign;
}
template <class A>
- inline batch<double, A> asin(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> asin(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
batch_type x = abs(self);
@@ -127,7 +127,7 @@ namespace xsimd
^ bitofsign(self));
}
template <class A, class T>
- inline batch<std::complex<T>, A> asin(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> asin(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
@@ -159,32 +159,32 @@ namespace xsimd
namespace detail
{
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A>
+ XSIMD_INLINE batch<T, A>
average(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
{
return (x1 & x2) + ((x1 ^ x2) >> 1);
}
template <class A, class T>
- inline batch<T, A>
+ XSIMD_INLINE batch<T, A>
averagef(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
{
using batch_type = batch<T, A>;
return fma(x1, batch_type(0.5), x2 * batch_type(0.5));
}
template <class A>
- inline batch<float, A> average(batch<float, A> const& x1, batch<float, A> const& x2) noexcept
+ XSIMD_INLINE batch<float, A> average(batch<float, A> const& x1, batch<float, A> const& x2) noexcept
{
return averagef(x1, x2);
}
template <class A>
- inline batch<double, A> average(batch<double, A> const& x1, batch<double, A> const& x2) noexcept
+ XSIMD_INLINE batch<double, A> average(batch<double, A> const& x1, batch<double, A> const& x2) noexcept
{
return averagef(x1, x2);
}
}
template <class A>
- inline batch<float, A> asinh(batch<float, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<float, A> asinh(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
batch_type x = abs(self);
@@ -212,7 +212,7 @@ namespace xsimd
#endif
}
template <class A>
- inline batch<double, A> asinh(batch<double, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<double, A> asinh(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
batch_type x = abs(self);
@@ -226,7 +226,7 @@ namespace xsimd
return bitofsign(self) ^ z;
}
template <class A, class T>
- inline batch<std::complex<T>, A> asinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> asinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
batch_type w = asin(batch_type(-z.imag(), z.real()));
@@ -238,7 +238,7 @@ namespace xsimd
namespace detail
{
template <class A>
- static inline batch<float, A> kernel_atan(const batch<float, A>& x, const batch<float, A>& recx) noexcept
+ static XSIMD_INLINE batch<float, A> kernel_atan(const batch<float, A>& x, const batch<float, A>& recx) noexcept
{
using batch_type = batch<float, A>;
const auto flag1 = x < constants::tan3pio8<batch_type>();
@@ -259,7 +259,7 @@ namespace xsimd
return yy + z1;
}
template <class A>
- static inline batch<double, A> kernel_atan(const batch<double, A>& x, const batch<double, A>& recx) noexcept
+ static XSIMD_INLINE batch<double, A> kernel_atan(const batch<double, A>& x, const batch<double, A>& recx) noexcept
{
using batch_type = batch<double, A>;
const auto flag1 = x < constants::tan3pio8<batch_type>();
@@ -288,7 +288,7 @@ namespace xsimd
}
}
template <class A, class T>
- inline batch<T, A> atan(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> atan(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
const batch_type absa = abs(self);
@@ -296,7 +296,7 @@ namespace xsimd
return x ^ bitofsign(self);
}
template <class A, class T>
- inline batch<std::complex<T>, A> atan(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> atan(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
@@ -327,7 +327,7 @@ namespace xsimd
* ====================================================
*/
template <class A, class T>
- inline batch<T, A> atanh(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> atanh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type x = abs(self);
@@ -338,7 +338,7 @@ namespace xsimd
return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp)));
}
template <class A, class T>
- inline batch<std::complex<T>, A> atanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> atanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
batch_type w = atan(batch_type(-z.imag(), z.real()));
@@ -348,7 +348,7 @@ namespace xsimd
// atan2
template <class A, class T>
- inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
const batch_type q = abs(self / other);
@@ -360,19 +360,19 @@ namespace xsimd
namespace detail
{
template <class T, class A>
- inline batch<T, A> quadrant(const batch<T, A>& x) noexcept
+ XSIMD_INLINE batch<T, A> quadrant(const batch<T, A>& x) noexcept
{
return x & batch<T, A>(3);
}
template <class A>
- inline batch<float, A> quadrant(const batch<float, A>& x) noexcept
+ XSIMD_INLINE batch<float, A> quadrant(const batch<float, A>& x) noexcept
{
return to_float(quadrant(to_int(x)));
}
template <class A>
- inline batch<double, A> quadrant(const batch<double, A>& x) noexcept
+ XSIMD_INLINE batch<double, A> quadrant(const batch<double, A>& x) noexcept
{
using batch_type = batch<double, A>;
batch_type a = x * batch_type(0.25);
@@ -389,7 +389,7 @@ namespace xsimd
*/
template <class A>
- inline batch<float, A> cos_eval(const batch<float, A>& z) noexcept
+ XSIMD_INLINE batch<float, A> cos_eval(const batch<float, A>& z) noexcept
{
using batch_type = batch<float, A>;
batch_type y = detail::horner<batch_type,
@@ -400,7 +400,7 @@ namespace xsimd
}
template <class A>
- inline batch<float, A> sin_eval(const batch<float, A>& z, const batch<float, A>& x) noexcept
+ XSIMD_INLINE batch<float, A> sin_eval(const batch<float, A>& z, const batch<float, A>& x) noexcept
{
using batch_type = batch<float, A>;
batch_type y = detail::horner<batch_type,
@@ -411,7 +411,7 @@ namespace xsimd
}
template <class A>
- static inline batch<float, A> base_tancot_eval(const batch<float, A>& z) noexcept
+ static XSIMD_INLINE batch<float, A> base_tancot_eval(const batch<float, A>& z) noexcept
{
using batch_type = batch<float, A>;
batch_type zz = z * z;
@@ -426,7 +426,7 @@ namespace xsimd
}
template <class A, class BB>
- static inline batch<float, A> tan_eval(const batch<float, A>& z, const BB& test) noexcept
+ static XSIMD_INLINE batch<float, A> tan_eval(const batch<float, A>& z, const BB& test) noexcept
{
using batch_type = batch<float, A>;
batch_type y = base_tancot_eval(z);
@@ -434,7 +434,7 @@ namespace xsimd
}
template <class A, class BB>
- static inline batch<float, A> cot_eval(const batch<float, A>& z, const BB& test) noexcept
+ static XSIMD_INLINE batch<float, A> cot_eval(const batch<float, A>& z, const BB& test) noexcept
{
using batch_type = batch<float, A>;
batch_type y = base_tancot_eval(z);
@@ -451,7 +451,7 @@ namespace xsimd
* ====================================================
*/
template <class A>
- static inline batch<double, A> cos_eval(const batch<double, A>& z) noexcept
+ static XSIMD_INLINE batch<double, A> cos_eval(const batch<double, A>& z) noexcept
{
using batch_type = batch<double, A>;
batch_type y = detail::horner<batch_type,
@@ -466,7 +466,7 @@ namespace xsimd
}
template <class A>
- static inline batch<double, A> sin_eval(const batch<double, A>& z, const batch<double, A>& x) noexcept
+ static XSIMD_INLINE batch<double, A> sin_eval(const batch<double, A>& z, const batch<double, A>& x) noexcept
{
using batch_type = batch<double, A>;
batch_type y = detail::horner<batch_type,
@@ -480,7 +480,7 @@ namespace xsimd
}
template <class A>
- static inline batch<double, A> base_tancot_eval(const batch<double, A>& z) noexcept
+ static XSIMD_INLINE batch<double, A> base_tancot_eval(const batch<double, A>& z) noexcept
{
using batch_type = batch<double, A>;
batch_type zz = z * z;
@@ -497,7 +497,7 @@ namespace xsimd
}
template <class A, class BB>
- static inline batch<double, A> tan_eval(const batch<double, A>& z, const BB& test) noexcept
+ static XSIMD_INLINE batch<double, A> tan_eval(const batch<double, A>& z, const BB& test) noexcept
{
using batch_type = batch<double, A>;
batch_type y = base_tancot_eval(z);
@@ -505,7 +505,7 @@ namespace xsimd
}
template <class A, class BB>
- static inline batch<double, A> cot_eval(const batch<double, A>& z, const BB& test) noexcept
+ static XSIMD_INLINE batch<double, A> cot_eval(const batch<double, A>& z, const BB& test) noexcept
{
using batch_type = batch<double, A>;
batch_type y = base_tancot_eval(z);
@@ -531,7 +531,7 @@ namespace xsimd
template <class B, class Tag = trigo_radian_tag>
struct trigo_reducer
{
- static inline B reduce(const B& x, B& xr) noexcept
+ static XSIMD_INLINE B reduce(const B& x, B& xr) noexcept
{
if (all(x <= constants::pio4<B>()))
{
@@ -606,7 +606,7 @@ namespace xsimd
template <class B>
struct trigo_reducer<B, trigo_pi_tag>
{
- static inline B reduce(const B& x, B& xr) noexcept
+ static XSIMD_INLINE B reduce(const B& x, B& xr) noexcept
{
B xi = nearbyint(x * B(2.));
B x2 = x - xi * B(0.5);
@@ -617,7 +617,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<T, A> cos(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> cos(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
const batch_type x = abs(self);
@@ -634,7 +634,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<std::complex<T>, A> cos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> cos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) };
}
@@ -652,7 +652,7 @@ namespace xsimd
*/
template <class A, class T>
- inline batch<T, A> cosh(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> cosh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type x = abs(self);
@@ -663,7 +663,7 @@ namespace xsimd
return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp));
}
template <class A, class T>
- inline batch<std::complex<T>, A> cosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> cosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
auto x = z.real();
auto y = z.imag();
@@ -674,7 +674,7 @@ namespace xsimd
namespace detail
{
template <class A, class T, class Tag = trigo_radian_tag>
- inline batch<T, A> sin(batch<T, A> const& self, Tag = Tag()) noexcept
+ XSIMD_INLINE batch<T, A> sin(batch<T, A> const& self, Tag = Tag()) noexcept
{
using batch_type = batch<T, A>;
const batch_type x = abs(self);
@@ -692,20 +692,20 @@ namespace xsimd
}
template <class A, class T>
- inline batch<T, A> sin(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> sin(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return detail::sin(self);
}
template <class A, class T>
- inline batch<std::complex<T>, A> sin(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> sin(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) };
}
// sincos
template <class A, class T>
- inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
const batch_type x = abs(self);
@@ -724,7 +724,7 @@ namespace xsimd
}
template <class A, class T>
- inline std::pair<batch<std::complex<T>, A>, batch<std::complex<T>, A>>
+ XSIMD_INLINE std::pair<batch<std::complex<T>, A>, batch<std::complex<T>, A>>
sincos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
@@ -749,7 +749,7 @@ namespace xsimd
* ====================================================
*/
template <class A>
- inline batch<float, A> sinh_kernel(batch<float, A> const& self) noexcept
+ XSIMD_INLINE batch<float, A> sinh_kernel(batch<float, A> const& self) noexcept
{
using batch_type = batch<float, A>;
batch_type sqr_self = self * self;
@@ -763,7 +763,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> sinh_kernel(batch<double, A> const& self) noexcept
+ XSIMD_INLINE batch<double, A> sinh_kernel(batch<double, A> const& self) noexcept
{
using batch_type = batch<double, A>;
batch_type sqrself = self * self;
@@ -792,7 +792,7 @@ namespace xsimd
* ====================================================
*/
template <class A, class T>
- inline batch<T, A> sinh(batch<T, A> const& a, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> sinh(batch<T, A> const& a, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type half(0.5);
@@ -814,7 +814,7 @@ namespace xsimd
return select(lt1, z, r) ^ bts;
}
template <class A, class T>
- inline batch<std::complex<T>, A> sinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> sinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
auto x = z.real();
auto y = z.imag();
@@ -823,7 +823,7 @@ namespace xsimd
// tan
template <class A, class T>
- inline batch<T, A> tan(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> tan(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
const batch_type x = abs(self);
@@ -836,7 +836,7 @@ namespace xsimd
return y ^ bitofsign(self);
}
template <class A, class T>
- inline batch<std::complex<T>, A> tan(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> tan(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
@@ -867,7 +867,7 @@ namespace xsimd
struct tanh_kernel<batch<float, A>>
{
using batch_type = batch<float, A>;
- static inline batch_type tanh(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type tanh(const batch_type& x) noexcept
{
batch_type sqrx = x * x;
return fma(detail::horner<batch_type,
@@ -881,7 +881,7 @@ namespace xsimd
x, x);
}
- static inline batch_type cotanh(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type cotanh(const batch_type& x) noexcept
{
return batch_type(1.) / tanh(x);
}
@@ -891,20 +891,20 @@ namespace xsimd
struct tanh_kernel<batch<double, A>>
{
using batch_type = batch<double, A>;
- static inline batch_type tanh(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type tanh(const batch_type& x) noexcept
{
batch_type sqrx = x * x;
return fma(sqrx * p(sqrx) / q(sqrx), x, x);
}
- static inline batch_type cotanh(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type cotanh(const batch_type& x) noexcept
{
batch_type sqrx = x * x;
batch_type qval = q(sqrx);
return qval / (x * fma(p(sqrx), sqrx, qval));
}
- static inline batch_type p(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type p(const batch_type& x) noexcept
{
return detail::horner<batch_type,
0xc0993ac030580563, // -1.61468768441708447952E3
@@ -913,7 +913,7 @@ namespace xsimd
>(x);
}
- static inline batch_type q(const batch_type& x) noexcept
+ static XSIMD_INLINE batch_type q(const batch_type& x) noexcept
{
return detail::horner1<batch_type,
0x40b2ec102442040c, // 4.84406305325125486048E3
@@ -934,7 +934,7 @@ namespace xsimd
* ====================================================
*/
template <class A, class T>
- inline batch<T, A> tanh(batch<T, A> const& self, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<T, A> tanh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type one(1.);
@@ -952,7 +952,7 @@ namespace xsimd
return select(test, z, r) ^ bts;
}
template <class A, class T>
- inline batch<std::complex<T>, A> tanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> tanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using real_batch = typename batch<std::complex<T>, A>::real_batch;
auto x = z.real();
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx.hpp
index 5ec1e02d484..f41702babac 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx.hpp
@@ -27,39 +27,39 @@ namespace xsimd
// fwd
template <class A, class T, size_t I>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
namespace detail
{
- inline void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
+ XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
{
low = _mm256_castsi256_si128(val);
high = _mm256_extractf128_si256(val, 1);
}
- inline void split_avx(__m256 val, __m128& low, __m128& high) noexcept
+ XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
{
low = _mm256_castps256_ps128(val);
high = _mm256_extractf128_ps(val, 1);
}
- inline void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
+ XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
{
low = _mm256_castpd256_pd128(val);
high = _mm256_extractf128_pd(val, 1);
}
- inline __m256i merge_sse(__m128i low, __m128i high) noexcept
+ XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
{
return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1);
}
- inline __m256 merge_sse(__m128 low, __m128 high) noexcept
+ XSIMD_INLINE __m256 merge_sse(__m128 low, __m128 high) noexcept
{
return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1);
}
- inline __m256d merge_sse(__m128d low, __m128d high) noexcept
+ XSIMD_INLINE __m256d merge_sse(__m128d low, __m128d high) noexcept
{
return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1);
}
template <class F>
- inline __m256i fwd_to_sse(F f, __m256i self) noexcept
+ XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept
{
__m128i self_low, self_high;
split_avx(self, self_low, self_high);
@@ -68,7 +68,7 @@ namespace xsimd
return merge_sse(res_low, res_high);
}
template <class F>
- inline __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
+ XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
{
__m128i self_low, self_high, other_low, other_high;
split_avx(self, self_low, self_high);
@@ -78,7 +78,7 @@ namespace xsimd
return merge_sse(res_low, res_high);
}
template <class F>
- inline __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
+ XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
{
__m128i self_low, self_high;
split_avx(self, self_low, self_high);
@@ -90,13 +90,13 @@ namespace xsimd
// abs
template <class A>
- inline batch<float, A> abs(batch<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<avx>) noexcept
{
__m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
return _mm256_andnot_ps(sign_mask, self);
}
template <class A>
- inline batch<double, A> abs(batch<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<avx>) noexcept
{
__m256d sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31
return _mm256_andnot_pd(sign_mask, self);
@@ -104,96 +104,96 @@ namespace xsimd
// add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return add(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
self, other);
}
template <class A>
- inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_add_ps(self, other);
}
template <class A>
- inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_add_pd(self, other);
}
// all
template <class A>
- inline bool all(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE bool all(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_testc_ps(self, batch_bool<float, A>(true)) != 0;
}
template <class A>
- inline bool all(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE bool all(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_testc_pd(self, batch_bool<double, A>(true)) != 0;
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline bool all(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_testc_si256(self, batch_bool<T, A>(true)) != 0;
}
// any
template <class A>
- inline bool any(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE bool any(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
{
return !_mm256_testz_ps(self, self);
}
template <class A>
- inline bool any(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE bool any(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
{
return !_mm256_testz_pd(self, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline bool any(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
{
return !_mm256_testz_si256(self, self);
}
// batch_bool_cast
template <class A, class T_out, class T_in>
- inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
{
return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
}
// bitwise_and
template <class A>
- inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_and_ps(self, other);
}
template <class A>
- inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_and_pd(self, other);
}
template <class A>
- inline batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_and_ps(self, other);
}
template <class A>
- inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_and_pd(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
@@ -202,36 +202,36 @@ namespace xsimd
// bitwise_andnot
template <class A>
- inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_andnot_ps(other, self);
}
template <class A>
- inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_andnot_pd(other, self);
}
template <class A>
- inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_andnot_ps(other, self);
}
template <class A>
- inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_andnot_pd(other, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
@@ -240,7 +240,7 @@ namespace xsimd
// bitwise_lshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
{ return bitwise_lshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
@@ -249,14 +249,14 @@ namespace xsimd
// bitwise_not
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s) noexcept
{ return bitwise_not(batch<T, sse4_2>(s), sse4_2 {}); },
self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s) noexcept
{ return bitwise_not(batch_bool<T, sse4_2>(s), sse4_2 {}); },
@@ -265,34 +265,34 @@ namespace xsimd
// bitwise_or
template <class A>
- inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_or_ps(self, other);
}
template <class A>
- inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_or_pd(self, other);
}
template <class A>
- inline batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_or_ps(self, other);
}
template <class A>
- inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_or_pd(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return bitwise_or(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return bitwise_or(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o)); },
@@ -301,7 +301,7 @@ namespace xsimd
// bitwise_rshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
{ return bitwise_rshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
@@ -310,34 +310,34 @@ namespace xsimd
// bitwise_xor
template <class A>
- inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_xor_ps(self, other);
}
template <class A>
- inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_xor_pd(self, other);
}
template <class A>
- inline batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_xor_ps(self, other);
}
template <class A>
- inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_xor_pd(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return bitwise_xor(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return bitwise_xor(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o), sse4_2 {}); },
@@ -346,66 +346,66 @@ namespace xsimd
// bitwise_cast
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
{
return _mm256_castsi256_ps(self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
{
return _mm256_castsi256_pd(self);
}
template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
- inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx>) noexcept
{
return batch<Tp, A>(self.data);
}
template <class A>
- inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
{
return _mm256_castps_pd(self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
{
return _mm256_castps_si256(self);
}
template <class A>
- inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
{
return _mm256_castpd_ps(self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
{
return _mm256_castpd_si256(self);
}
// bitwise_not
template <class A>
- inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
}
template <class A>
- inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
}
template <class A>
- inline batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
}
template <class A>
- inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
}
// broadcast
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> broadcast(T val, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -430,24 +430,24 @@ namespace xsimd
}
}
template <class A>
- inline batch<float, A> broadcast(float val, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<avx>) noexcept
{
return _mm256_set1_ps(val);
}
template <class A>
- inline batch<double, A> broadcast(double val, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<avx>) noexcept
{
return _mm256_set1_pd(val);
}
// ceil
template <class A>
- inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_ceil_ps(self);
}
template <class A>
- inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_ceil_pd(self);
}
@@ -457,7 +457,7 @@ namespace xsimd
// On clang, _mm256_extractf128_ps is built upon build_shufflevector
// which require index parameter to be a constant
template <int index, class B>
- inline B get_half_complex_f(const B& real, const B& imag) noexcept
+ XSIMD_INLINE B get_half_complex_f(const B& real, const B& imag) noexcept
{
__m128 tmp0 = _mm256_extractf128_ps(real, index);
__m128 tmp1 = _mm256_extractf128_ps(imag, index);
@@ -469,7 +469,7 @@ namespace xsimd
return res;
}
template <int index, class B>
- inline B get_half_complex_d(const B& real, const B& imag) noexcept
+ XSIMD_INLINE B get_half_complex_d(const B& real, const B& imag) noexcept
{
__m128d tmp0 = _mm256_extractf128_pd(real, index);
__m128d tmp1 = _mm256_extractf128_pd(imag, index);
@@ -483,24 +483,24 @@ namespace xsimd
// complex_low
template <class A>
- inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
{
return get_half_complex_f<0>(self.real(), self.imag());
}
template <class A>
- inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
{
return get_half_complex_d<0>(self.real(), self.imag());
}
// complex_high
template <class A>
- inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
{
return get_half_complex_f<1>(self.real(), self.imag());
}
template <class A>
- inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
{
return get_half_complex_d<1>(self.real(), self.imag());
}
@@ -510,13 +510,13 @@ namespace xsimd
namespace detail
{
template <class A>
- inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
{
return _mm256_cvtepi32_ps(self);
}
template <class A>
- inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
{
return _mm256_cvttps_epi32(self);
}
@@ -524,46 +524,46 @@ namespace xsimd
// decr_if
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
{
return self + batch<T, A>(mask.data);
}
// div
template <class A>
- inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_div_ps(self, other);
}
template <class A>
- inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_div_pd(self, other);
}
// eq
template <class A>
- inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_cmp_ps(self, other, _CMP_EQ_OQ);
}
template <class A>
- inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_cmp_pd(self, other, _CMP_EQ_OQ);
}
template <class A>
- inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
{
return ~(self != other);
}
template <class A>
- inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
{
return ~(self != other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return eq(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
@@ -571,26 +571,26 @@ namespace xsimd
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
{
return ~(self != other);
}
// floor
template <class A>
- inline batch<float, A> floor(batch<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_floor_ps(self);
}
template <class A>
- inline batch<double, A> floor(batch<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_floor_pd(self);
}
// from_mask
template <class A>
- inline batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<avx>) noexcept
{
alignas(A::alignment()) static const uint64_t lut32[] = {
0x0000000000000000ul,
@@ -602,7 +602,7 @@ namespace xsimd
return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6]));
}
template <class A>
- inline batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<avx>) noexcept
{
alignas(A::alignment()) static const uint64_t lut64[][4] = {
{ 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
@@ -626,7 +626,7 @@ namespace xsimd
return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask]));
}
template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx>) noexcept
{
alignas(A::alignment()) static const uint32_t lut32[] = {
0x00000000,
@@ -689,7 +689,7 @@ namespace xsimd
// haddp
template <class A>
- inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
{
// row = (a,b,c,d,e,f,g,h)
// tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7)
@@ -715,7 +715,7 @@ namespace xsimd
return _mm256_add_ps(tmp0, tmp1);
}
template <class A>
- inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx>) noexcept
{
// row = (a,b,c,d)
// tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3)
@@ -731,14 +731,14 @@ namespace xsimd
// incr_if
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
{
return self - batch<T, A>(mask.data);
}
// insert
template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
{
#if !defined(_MSC_VER) || _MSC_VER > 1900
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
@@ -763,41 +763,41 @@ namespace xsimd
// isnan
template <class A>
- inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_cmp_ps(self, self, _CMP_UNORD_Q);
}
template <class A>
- inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_cmp_pd(self, self, _CMP_UNORD_Q);
}
// le
template <class A>
- inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_cmp_ps(self, other, _CMP_LE_OQ);
}
template <class A>
- inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_cmp_pd(self, other, _CMP_LE_OQ);
}
// load_aligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
{
return _mm256_load_si256((__m256i const*)mem);
}
template <class A>
- inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
{
return _mm256_load_ps(mem);
}
template <class A>
- inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
{
return _mm256_load_pd(mem);
}
@@ -806,7 +806,7 @@ namespace xsimd
{
// load_complex
template <class A>
- inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx>) noexcept
{
using batch_type = batch<float, A>;
__m128 tmp0 = _mm256_extractf128_ps(hi, 0);
@@ -825,7 +825,7 @@ namespace xsimd
return { real, imag };
}
template <class A>
- inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx>) noexcept
{
using batch_type = batch<double, A>;
__m128d tmp0 = _mm256_extractf128_pd(hi, 0);
@@ -845,35 +845,35 @@ namespace xsimd
// load_unaligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
{
return _mm256_loadu_si256((__m256i const*)mem);
}
template <class A>
- inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
{
return _mm256_loadu_ps(mem);
}
template <class A>
- inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
{
return _mm256_loadu_pd(mem);
}
// lt
template <class A>
- inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_cmp_ps(self, other, _CMP_LT_OQ);
}
template <class A>
- inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_cmp_pd(self, other, _CMP_LT_OQ);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return lt(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
@@ -882,7 +882,7 @@ namespace xsimd
// mask
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
{
@@ -905,86 +905,86 @@ namespace xsimd
}
}
template <class A>
- inline uint64_t mask(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_movemask_ps(self);
}
template <class A>
- inline uint64_t mask(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_movemask_pd(self);
}
// max
template <class A>
- inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_max_ps(self, other);
}
template <class A>
- inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_max_pd(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return select(self > other, self, other);
}
// min
template <class A>
- inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_min_ps(self, other);
}
template <class A>
- inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_min_pd(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return select(self <= other, self, other);
}
// mul
template <class A>
- inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_mul_ps(self, other);
}
template <class A>
- inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_mul_pd(self, other);
}
// nearbyint
template <class A>
- inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
}
template <class A>
- inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
}
// nearbyint_as_int
template <class A>
- inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
- requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<avx>) noexcept
{
return _mm256_cvtps_epi32(self);
}
// neg
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> neg(batch<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<avx>) noexcept
{
return 0 - self;
}
@@ -994,55 +994,55 @@ namespace xsimd
return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
}
template <class A>
- inline batch<double, A> neg(batch<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> neg(batch<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000)));
}
// neq
template <class A>
- inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ);
}
template <class A>
- inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return ~(self == other);
}
template <class A>
- inline batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_xor_ps(self, other);
}
template <class A>
- inline batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_xor_pd(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data)));
}
// reciprocal
template <class A>
- inline batch<float, A> reciprocal(batch<float, A> const& self,
- kernel::requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self,
+ kernel::requires_arch<avx>) noexcept
{
return _mm256_rcp_ps(self);
}
// reduce_add
template <class A>
- inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
+ XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
{
// Warning about _mm256_hadd_ps:
// _mm256_hadd_ps(a,b) gives
@@ -1060,7 +1060,7 @@ namespace xsimd
return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
}
template <class A>
- inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
+ XSIMD_INLINE double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
{
// rhs = (x0, x1, x2, x3)
// tmp = (x2, x3, x0, x1)
@@ -1072,7 +1072,7 @@ namespace xsimd
return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
{
__m128i low, high;
detail::split_avx(self, low, high);
@@ -1082,7 +1082,7 @@ namespace xsimd
// reduce_max
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
- inline T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
{
constexpr auto mask = detail::shuffle(1, 0);
batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
@@ -1093,7 +1093,7 @@ namespace xsimd
// reduce_min
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
- inline T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
{
constexpr auto mask = detail::shuffle(1, 0);
batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
@@ -1104,19 +1104,19 @@ namespace xsimd
// rsqrt
template <class A>
- inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
{
return _mm256_rsqrt_ps(val);
}
template <class A>
- inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
{
return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val)));
}
// sadd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -1135,17 +1135,17 @@ namespace xsimd
// select
template <class A>
- inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
{
return _mm256_blendv_ps(false_br, true_br, cond);
}
template <class A>
- inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
{
return _mm256_blendv_pd(false_br, true_br, cond);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
{
__m128i cond_low, cond_hi;
detail::split_avx(cond, cond_low, cond_hi);
@@ -1161,76 +1161,76 @@ namespace xsimd
return detail::merge_sse(res_low, res_hi);
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
}
template <class A, bool... Values>
- inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
{
- constexpr auto mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+ constexpr auto mask = batch_bool_constant<float, A, Values...>::mask();
return _mm256_blend_ps(false_br, true_br, mask);
}
template <class A, bool... Values>
- inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
{
- constexpr auto mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+ constexpr auto mask = batch_bool_constant<double, A, Values...>::mask();
return _mm256_blend_pd(false_br, true_br, mask);
}
// set
template <class A, class... Values>
- inline batch<float, A> set(batch<float, A> const&, requires_arch<avx>, Values... values) noexcept
+ XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<avx>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
return _mm256_setr_ps(values...);
}
template <class A, class... Values>
- inline batch<double, A> set(batch<double, A> const&, requires_arch<avx>, Values... values) noexcept
+ XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<avx>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
return _mm256_setr_pd(values...);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3) noexcept
{
return _mm256_set_epi64x(v3, v2, v1, v0);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
{
return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
{
return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
- T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
{
return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
}
template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx>, Values... values) noexcept
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx>, Values... values) noexcept
{
return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
}
template <class A, class... Values>
- inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<avx>, Values... values) noexcept
+ XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<avx>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
return _mm256_castsi256_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
}
template <class A, class... Values>
- inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<avx>, Values... values) noexcept
+ XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<avx>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
return _mm256_castsi256_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
@@ -1238,7 +1238,7 @@ namespace xsimd
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
- inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
{
constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
// shuffle within lane
@@ -1253,7 +1253,7 @@ namespace xsimd
}
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
- inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
{
constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3);
// shuffle within lane
@@ -1269,7 +1269,7 @@ namespace xsimd
// slide_left
template <size_t N, class A, class T>
- inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx>) noexcept
{
constexpr unsigned BitCount = N * 8;
if (BitCount == 0)
@@ -1310,7 +1310,7 @@ namespace xsimd
// slide_right
template <size_t N, class A, class T>
- inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx>) noexcept
{
constexpr unsigned BitCount = N * 8;
if (BitCount == 0)
@@ -1350,19 +1350,19 @@ namespace xsimd
// sqrt
template <class A>
- inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
{
return _mm256_sqrt_ps(val);
}
template <class A>
- inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
{
return _mm256_sqrt_pd(val);
}
// ssub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -1377,70 +1377,70 @@ namespace xsimd
// store_aligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_store_si256((__m256i*)mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_store_si256((__m256i*)mem, self);
}
template <class A>
- inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_store_ps(mem, self);
}
template <class A>
- inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_store_pd(mem, self);
}
// store_unaligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_storeu_si256((__m256i*)mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_storeu_si256((__m256i*)mem, self);
}
template <class A>
- inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_storeu_ps(mem, self);
}
template <class A>
- inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_storeu_pd(mem, self);
}
// sub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return sub(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
self, other);
}
template <class A>
- inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_sub_ps(self, other);
}
template <class A>
- inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
return _mm256_sub_pd(self, other);
}
// swizzle (dynamic mask)
template <class A>
- inline batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
{
// duplicate low and high part of input
__m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
@@ -1464,7 +1464,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
{
// duplicate low and high part of input
__m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
@@ -1488,14 +1488,14 @@ namespace xsimd
}
template <class A, typename T, detail::enable_sized_integral_t<T, 4> = 0>
- inline batch<T, A> swizzle(batch<T, A> const& self, batch<uint32_t, A> const& mask, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint32_t, A> const& mask, requires_arch<avx>) noexcept
{
return bitwise_cast<T>(
swizzle(bitwise_cast<float>(self), mask));
}
template <class A, typename T, detail::enable_sized_integral_t<T, 8> = 0>
- inline batch<T, A>
+ XSIMD_INLINE batch<T, A>
swizzle(batch<T, A> const& self, batch<uint64_t, A> const& mask, requires_arch<avx>) noexcept
{
return bitwise_cast<T>(
@@ -1504,7 +1504,7 @@ namespace xsimd
// swizzle (constant mask)
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
- inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
{
// duplicate low and high part of input
__m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
@@ -1514,14 +1514,14 @@ namespace xsimd
__m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
// normalize mask
- batch_constant<batch<uint32_t, A>, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+ batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
// permute within each lane
- __m256 r0 = _mm256_permutevar_ps(low_low, (batch<uint32_t, A>)half_mask);
- __m256 r1 = _mm256_permutevar_ps(hi_hi, (batch<uint32_t, A>)half_mask);
+ __m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch());
+ __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch());
// mask to choose the right lane
- batch_bool_constant<batch<uint32_t, A>, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+ batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
// blend the two permutes
constexpr auto mask = blend_mask.mask();
@@ -1529,7 +1529,7 @@ namespace xsimd
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
- inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
{
// duplicate low and high part of input
__m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
@@ -1539,14 +1539,14 @@ namespace xsimd
__m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
// normalize mask
- batch_constant<batch<uint64_t, A>, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
+ batch_constant<uint64_t, A, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
// permute within each lane
- __m256d r0 = _mm256_permutevar_pd(low_low, (batch<uint64_t, A>)half_mask);
- __m256d r1 = _mm256_permutevar_pd(hi_hi, (batch<uint64_t, A>)half_mask);
+ __m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch());
+ __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch());
// mask to choose the right lane
- batch_bool_constant<batch<uint64_t, A>, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+ batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
// blend the two permutes
constexpr auto mask = blend_mask.mask();
@@ -1563,17 +1563,17 @@ namespace xsimd
uint32_t V6,
uint32_t V7,
detail::enable_sized_integral_t<T, 4> = 0>
- inline batch<T, A> swizzle(batch<T, A> const& self,
- batch_constant<batch<uint32_t, A>,
- V0,
- V1,
- V2,
- V3,
- V4,
- V5,
- V6,
- V7> const& mask,
- requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
+ batch_constant<uint32_t, A,
+ V0,
+ V1,
+ V2,
+ V3,
+ V4,
+ V5,
+ V6,
+ V7> const& mask,
+ requires_arch<avx>) noexcept
{
return bitwise_cast<T>(
swizzle(bitwise_cast<float>(self), mask));
@@ -1586,9 +1586,9 @@ namespace xsimd
uint64_t V2,
uint64_t V3,
detail::enable_sized_integral_t<T, 8> = 0>
- inline batch<T, A>
+ XSIMD_INLINE batch<T, A>
swizzle(batch<T, A> const& self,
- batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
+ batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
requires_arch<avx>) noexcept
{
return bitwise_cast<T>(
@@ -1597,19 +1597,19 @@ namespace xsimd
// trunc
template <class A>
- inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_round_ps(self, _MM_FROUND_TO_ZERO);
}
template <class A>
- inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<avx>) noexcept
{
return _mm256_round_pd(self, _MM_FROUND_TO_ZERO);
}
// zip_hi
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
{
@@ -1656,14 +1656,14 @@ namespace xsimd
}
}
template <class A>
- inline batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
auto lo = _mm256_unpacklo_ps(self, other);
auto hi = _mm256_unpackhi_ps(self, other);
return _mm256_permute2f128_ps(lo, hi, 0x31);
}
template <class A>
- inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
auto lo = _mm256_unpacklo_pd(self, other);
auto hi = _mm256_unpackhi_pd(self, other);
@@ -1672,7 +1672,7 @@ namespace xsimd
// zip_lo
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
{
@@ -1720,14 +1720,14 @@ namespace xsimd
}
template <class A>
- inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
{
auto lo = _mm256_unpacklo_ps(self, other);
auto hi = _mm256_unpackhi_ps(self, other);
return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1);
}
template <class A>
- inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
{
auto lo = _mm256_unpacklo_pd(self, other);
auto hi = _mm256_unpackhi_pd(self, other);
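Aside: nearly every hunk in xsimd_avx.hpp above is a mechanical replacement of inline with XSIMD_INLINE on the kernel entry points (plus the batch_constant/batch_bool_constant template-parameter change), so that xsimd can ask the compiler to force-inline these thin wrappers rather than rely on the plain inline hint. The authoritative macro definition lives in xsimd's own configuration headers; the snippet below is only a hedged sketch of the general pattern, using a deliberately different name (MY_FORCE_INLINE) so it is not mistaken for xsimd's actual macro.

    // Illustration only: a typical force-inline macro, not a copy of xsimd's.
    #if defined(__GNUC__) || defined(__clang__)
    #define MY_FORCE_INLINE inline __attribute__((always_inline))
    #elif defined(_MSC_VER)
    #define MY_FORCE_INLINE __forceinline
    #else
    #define MY_FORCE_INLINE inline
    #endif

    // Usage mirrors the kernels above: a tiny wrapper the compiler should flatten.
    template <class T>
    MY_FORCE_INLINE T add_one(T x) noexcept { return x + 1; }
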
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx2.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx2.hpp
index a5b07ec9da2..a7b421d8ecd 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx2.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx2.hpp
@@ -26,7 +26,7 @@ namespace xsimd
// abs
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -52,7 +52,7 @@ namespace xsimd
// add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -76,45 +76,83 @@ namespace xsimd
}
}
+ // avgr
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_avg_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_avg_epu16(self, other);
+ }
+ else
+ {
+ return avgr(self, other, generic {});
+ }
+ }
+
+ // avg
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto adj = ((self ^ other) << 7) >> 7;
+ return avgr(self, other, A {}) - adj;
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto adj = ((self ^ other) << 15) >> 15;
+ return avgr(self, other, A {}) - adj;
+ }
+ else
+ {
+ return avg(self, other, generic {});
+ }
+ }
+
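The newly added avgr/avg overloads above map the rounding average onto _mm256_avg_epu8/_mm256_avg_epu16 and then derive the truncating average by subtracting the parity of self ^ other (the "adj" term). A minimal scalar sketch of that identity follows; the helper names avgr_u8/avg_u8 are hypothetical and exist only to illustrate the arithmetic, they are not part of xsimd.

    #include <cassert>
    #include <cstdint>

    // Rounding average (a + b + 1) / 2 without overflowing 8 bits,
    // mirroring what _mm256_avg_epu8 computes per lane.
    static uint8_t avgr_u8(uint8_t a, uint8_t b)
    {
        return static_cast<uint8_t>((a >> 1) + (b >> 1) + ((a | b) & 1));
    }

    // Truncating average (a + b) / 2. The two averages differ by one exactly
    // when a + b is odd, i.e. when the low bits of a and b differ, so the
    // correction is the parity of a ^ b -- the same trick as "adj" above.
    static uint8_t avg_u8(uint8_t a, uint8_t b)
    {
        return static_cast<uint8_t>(avgr_u8(a, b) - ((a ^ b) & 1));
    }

    int main()
    {
        for (int a = 0; a < 256; ++a)
            for (int b = 0; b < 256; ++b)
                assert(avg_u8(static_cast<uint8_t>(a), static_cast<uint8_t>(b)) == (a + b) / 2);
        return 0;
    }
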
// bitwise_and
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_and_si256(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_and_si256(self, other);
}
// bitwise_andnot
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_andnot_si256(other, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_andnot_si256(other, self);
}
// bitwise_not
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
{
return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
{
return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
}
// bitwise_lshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
@@ -135,7 +173,7 @@ namespace xsimd
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
@@ -153,19 +191,19 @@ namespace xsimd
// bitwise_or
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_or_si256(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_or_si256(self, other);
}
// bitwise_rshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -215,7 +253,7 @@ namespace xsimd
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -247,19 +285,19 @@ namespace xsimd
// bitwise_xor
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_xor_si256(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_xor_si256(self, other);
}
// complex_low
template <class A>
- inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
{
__m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
__m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
@@ -268,7 +306,7 @@ namespace xsimd
// complex_high
template <class A>
- inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
{
__m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
__m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
@@ -280,7 +318,7 @@ namespace xsimd
{
template <class A>
- inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
// adapted to avx
@@ -294,7 +332,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
// adapted to avx
@@ -311,7 +349,7 @@ namespace xsimd
// eq
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -337,16 +375,16 @@ namespace xsimd
// gather
template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
- inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
- kernel::requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx2>) noexcept
{
// scatter for this one is AVX512F+AVX512VL
return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
}
template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
- inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
- kernel::requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx2>) noexcept
{
// scatter for this one is AVX512F+AVX512VL
return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
@@ -354,18 +392,18 @@ namespace xsimd
template <class A, class U,
detail::enable_sized_integral_t<U, 4> = 0>
- inline batch<float, A> gather(batch<float, A> const&, float const* src,
- batch<U, A> const& index,
- kernel::requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
+ batch<U, A> const& index,
+ kernel::requires_arch<avx2>) noexcept
{
// scatter for this one is AVX512F+AVX512VL
return _mm256_i32gather_ps(src, index, sizeof(float));
}
template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
- inline batch<double, A> gather(batch<double, A> const&, double const* src,
- batch<U, A> const& index,
- requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<double, A> gather(batch<double, A> const&, double const* src,
+ batch<U, A> const& index,
+ requires_arch<avx2>) noexcept
{
// scatter for this one is AVX512F+AVX512VL
return _mm256_i64gather_pd(src, index, sizeof(double));
@@ -373,9 +411,9 @@ namespace xsimd
// gather: handmade conversions
template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
- inline batch<float, A> gather(batch<float, A> const&, double const* src,
- batch<V, A> const& index,
- requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, double const* src,
+ batch<V, A> const& index,
+ requires_arch<avx2>) noexcept
{
const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
@@ -383,9 +421,9 @@ namespace xsimd
}
template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
- inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
- batch<V, A> const& index,
- requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
+ batch<V, A> const& index,
+ requires_arch<avx2>) noexcept
{
const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
@@ -394,7 +432,7 @@ namespace xsimd
// lt
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -427,7 +465,7 @@ namespace xsimd
// load_complex
template <class A>
- inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
{
using batch_type = batch<float, A>;
batch_type real = _mm256_castpd_ps(
@@ -441,7 +479,7 @@ namespace xsimd
return { real, imag };
}
template <class A>
- inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
{
using batch_type = batch<double, A>;
batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
@@ -450,7 +488,7 @@ namespace xsimd
}
// mask
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -469,7 +507,7 @@ namespace xsimd
// max
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -513,7 +551,7 @@ namespace xsimd
// min
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -557,7 +595,7 @@ namespace xsimd
// mul
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -585,7 +623,7 @@ namespace xsimd
// reduce_add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
+ XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
@@ -619,19 +657,19 @@ namespace xsimd
// rotate_right
template <size_t N, class A>
- inline batch<uint16_t, A> rotate_right(batch<uint16_t, A> const& self, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> rotate_right(batch<uint16_t, A> const& self, requires_arch<avx2>) noexcept
{
return _mm256_alignr_epi8(self, self, N);
}
template <size_t N, class A>
- inline batch<int16_t, A> rotate_right(batch<int16_t, A> const& self, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<int16_t, A> rotate_right(batch<int16_t, A> const& self, requires_arch<avx2>) noexcept
{
return bitwise_cast<int16_t>(rotate_right<N, A>(bitwise_cast<uint16_t>(self), avx2 {}));
}
// sadd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -667,7 +705,7 @@ namespace xsimd
// select
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -691,9 +729,9 @@ namespace xsimd
}
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
{
- constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
+ constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
// FIXME: for some reason mask here is not considered as an immediate,
// but it's okay for _mm256_blend_epi32
// case 2: return _mm256_blend_epi16(false_br, true_br, mask);
@@ -714,7 +752,7 @@ namespace xsimd
// slide_left
template <size_t N, class A, class T>
- inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
{
constexpr unsigned BitCount = N * 8;
if (BitCount == 0)
@@ -745,7 +783,7 @@ namespace xsimd
// slide_right
template <size_t N, class A, class T>
- inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
{
constexpr unsigned BitCount = N * 8;
if (BitCount == 0)
@@ -776,7 +814,7 @@ namespace xsimd
// ssub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -812,7 +850,7 @@ namespace xsimd
// sub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -838,13 +876,13 @@ namespace xsimd
// swizzle (dynamic mask)
template <class A>
- inline batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
{
return _mm256_permutevar8x32_ps(self, mask);
}
template <class A>
- inline batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
{
batch<uint32_t, A> broadcaster = { 0, 1, 0, 1, 0, 1, 0, 1 };
constexpr uint64_t comb = 0x0000000100000001ul * 2;
@@ -852,65 +890,65 @@ namespace xsimd
}
template <class A>
- inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<uint64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
}
template <class A>
- inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
}
template <class A>
- inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
{
return _mm256_permutevar8x32_epi32(self, mask);
}
template <class A>
- inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
}
// swizzle (constant mask)
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
- inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
- return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
+ return _mm256_permutevar8x32_ps(self, mask.as_batch());
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
- inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
{
constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
return _mm256_permute4x64_pd(self, mask);
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
- inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
{
constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
return _mm256_permute4x64_epi64(self, mask);
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
- inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
- inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
- return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
+ return _mm256_permutevar8x32_epi32(self, mask.as_batch());
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
- inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
}
// zip_hi
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -945,7 +983,7 @@ namespace xsimd
// zip_lo
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx512bw.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx512bw.hpp
index 94a194dab7a..8b381986c3a 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx512bw.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx512bw.hpp
@@ -27,7 +27,7 @@ namespace xsimd
namespace detail
{
template <class A, class T, int Cmp>
- inline batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
if (std::is_signed<T>::value)
@@ -73,7 +73,7 @@ namespace xsimd
// abs
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
{
if (std::is_unsigned<T>::value)
{
@@ -96,7 +96,7 @@ namespace xsimd
// add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -112,9 +112,47 @@ namespace xsimd
}
}
+ // avgr
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_avg_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_avg_epu16(self, other);
+ }
+ else
+ {
+ return avgr(self, other, generic {});
+ }
+ }
+
+ // avg
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto adj = ((self ^ other) << 7) >> 7;
+ return avgr(self, other, A {}) - adj;
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto adj = ((self ^ other) << 15) >> 15;
+ return avgr(self, other, A {}) - adj;
+ }
+ else
+ {
+ return avg(self, other, generic {});
+ }
+ }
+
// bitwise_lshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
{
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
@@ -134,7 +172,7 @@ namespace xsimd
// bitwise_rshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -188,42 +226,42 @@ namespace xsimd
// eq
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_EQ>(self, other);
}
// ge
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GE>(self, other);
}
// gt
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GT>(self, other);
}
// le
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LE>(self, other);
}
// lt
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
}
// max
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -259,7 +297,7 @@ namespace xsimd
// min
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -295,7 +333,7 @@ namespace xsimd
// mul
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -315,26 +353,26 @@ namespace xsimd
// neq
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
}
// rotate_right
template <size_t N, class A>
- inline batch<uint16_t, A> rotate_right(batch<uint16_t, A> const& self, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> rotate_right(batch<uint16_t, A> const& self, requires_arch<avx512bw>) noexcept
{
return _mm512_alignr_epi8(self, self, N);
}
template <size_t N, class A>
- inline batch<int16_t, A> rotate_right(batch<int16_t, A> const& self, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<int16_t, A> rotate_right(batch<int16_t, A> const& self, requires_arch<avx512bw>) noexcept
{
return bitwise_cast<int16_t>(rotate_right<N, A>(bitwise_cast<uint16_t>(self), avx2 {}));
}
// sadd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -370,7 +408,7 @@ namespace xsimd
// select
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -408,7 +446,7 @@ namespace xsimd
}
template <size_t N, class A, class T>
- inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
{
constexpr unsigned BitCount = N * 8;
if (BitCount == 0)
@@ -467,7 +505,7 @@ namespace xsimd
}
}
template <size_t N, class A, class T>
- inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
{
constexpr unsigned BitCount = N * 8;
if (BitCount == 0)
@@ -500,7 +538,7 @@ namespace xsimd
// ssub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -536,7 +574,7 @@ namespace xsimd
// sub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -554,57 +592,57 @@ namespace xsimd
// swizzle (dynamic version)
template <class A>
- inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx512bw>) noexcept
{
return _mm512_permutexvar_epi16(mask, self);
}
template <class A>
- inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx512bw>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512bw {}));
}
template <class A>
- inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx512bw>) noexcept
{
return _mm512_shuffle_epi8(self, mask);
}
template <class A>
- inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx512bw>) noexcept
{
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, avx512bw {}));
}
// swizzle (static version)
template <class A, uint16_t... Vs>
- inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
- return swizzle(self, (batch<uint16_t, A>)mask, avx512bw {});
+ return swizzle(self, mask.as_batch(), avx512bw {});
}
template <class A, uint16_t... Vs>
- inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
- return swizzle(self, (batch<uint16_t, A>)mask, avx512bw {});
+ return swizzle(self, mask.as_batch(), avx512bw {});
}
template <class A, uint8_t... Vs>
- inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
- return swizzle(self, (batch<uint8_t, A>)mask, avx512bw {});
+ return swizzle(self, mask.as_batch(), avx512bw {});
}
template <class A, uint8_t... Vs>
- inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
- return swizzle(self, (batch<uint8_t, A>)mask, avx512bw {});
+ return swizzle(self, mask.as_batch(), avx512bw {});
}
// zip_hi
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
__m512i lo, hi;
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
@@ -632,7 +670,7 @@ namespace xsimd
// zip_lo
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
__m512i lo, hi;
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx512f.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx512f.hpp
index 7ee46101356..c2b485a30e3 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx512f.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_avx512f.hpp
@@ -27,30 +27,30 @@ namespace xsimd
namespace detail
{
- inline void split_avx512(__m512 val, __m256& low, __m256& high) noexcept
+ XSIMD_INLINE void split_avx512(__m512 val, __m256& low, __m256& high) noexcept
{
low = _mm512_castps512_ps256(val);
high = _mm512_extractf32x8_ps(val, 1);
}
- inline void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept
+ XSIMD_INLINE void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept
{
low = _mm512_castpd512_pd256(val);
high = _mm512_extractf64x4_pd(val, 1);
}
- inline void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept
+ XSIMD_INLINE void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept
{
low = _mm512_castsi512_si256(val);
high = _mm512_extracti64x4_epi64(val, 1);
}
- inline __m512i merge_avx(__m256i low, __m256i high) noexcept
+ XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept
{
return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1);
}
- inline __m512 merge_avx(__m256 low, __m256 high) noexcept
+ XSIMD_INLINE __m512 merge_avx(__m256 low, __m256 high) noexcept
{
return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_castps_pd(low)), _mm256_castps_pd(high), 1));
}
- inline __m512d merge_avx(__m256d low, __m256d high) noexcept
+ XSIMD_INLINE __m512d merge_avx(__m256d low, __m256d high) noexcept
{
return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1);
}
@@ -86,7 +86,7 @@ namespace xsimd
namespace detail
{
- inline uint32_t morton(uint16_t x, uint16_t y) noexcept
+ XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept
{
static const unsigned short MortonTable256[256] = {
@@ -129,7 +129,7 @@ namespace xsimd
}
template <class A, class T, int Cmp>
- inline batch_bool<T, A> compare_int_avx512f(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch_bool<T, A> compare_int_avx512f(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
if (std::is_signed<T>::value)
@@ -217,7 +217,7 @@ namespace xsimd
// abs
template <class A>
- inline batch<float, A> abs(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
__m512 self_asf = (__m512)self;
__m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf);
@@ -225,7 +225,7 @@ namespace xsimd
return *reinterpret_cast<__m512*>(&res_asi);
}
template <class A>
- inline batch<double, A> abs(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
__m512d self_asd = (__m512d)self;
__m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd);
@@ -234,7 +234,7 @@ namespace xsimd
return *reinterpret_cast<__m512d*>(&res_asi);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
if (std::is_unsigned<T>::value)
{
@@ -270,7 +270,7 @@ namespace xsimd
// add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -299,19 +299,19 @@ namespace xsimd
}
}
template <class A>
- inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_add_ps(self, other);
}
template <class A>
- inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_add_pd(self, other);
}
// all
template <class A, class T>
- inline bool all(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return self.data == register_type(-1);
@@ -319,7 +319,7 @@ namespace xsimd
// any
template <class A, class T>
- inline bool any(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return self.data != register_type(0);
@@ -327,14 +327,14 @@ namespace xsimd
// batch_bool_cast
template <class A, class T_out, class T_in>
- inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx512f>) noexcept
{
return self.data;
}
// bitwise_and
template <class A>
- inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
#if defined(_MSC_VER)
return _mm512_and_ps(self, other);
@@ -343,19 +343,19 @@ namespace xsimd
#endif
}
template <class A>
- inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_and_si512(self, other);
}
template <class A, class T>
- inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return register_type(self.data & other.data);
@@ -363,24 +363,24 @@ namespace xsimd
// bitwise_andnot
template <class A>
- inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self)));
}
template <class A>
- inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self)));
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_andnot_si512(other, self);
}
template <class A, class T>
- inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return register_type(self.data & ~other.data);
@@ -388,7 +388,7 @@ namespace xsimd
// bitwise_lshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -433,56 +433,56 @@ namespace xsimd
// bitwise_not
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_xor_si512(self, _mm512_set1_epi32(-1));
}
template <class A, class T>
- inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return register_type(~self.data);
}
template <class A>
- inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_set1_epi32(-1)));
}
template <class A>
- inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_set1_epi32(-1)));
}
// bitwise_or
template <class A>
- inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
}
template <class A>
- inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
}
template <class A, class T>
- inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return register_type(self.data | other.data);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_or_si512(self, other);
}
// bitwise_rshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -552,69 +552,69 @@ namespace xsimd
// bitwise_xor
template <class A>
- inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
}
template <class A>
- inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
}
template <class A, class T>
- inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return register_type(self.data | other.data);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_xor_si512(self, other);
}
// bitwise_cast
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_ps(self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
{
return _mm512_castsi512_pd(self);
}
template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
- inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx512f>) noexcept
{
return batch<Tp, A>(self.data);
}
template <class A>
- inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
{
return _mm512_castps_pd(self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
{
return _mm512_castps_si512(self);
}
template <class A>
- inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
{
return _mm512_castpd_ps(self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
{
return _mm512_castpd_si512(self);
}
// broadcast
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> broadcast(T val, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx512f>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -639,56 +639,56 @@ namespace xsimd
}
}
template <class A>
- inline batch<float, A> broadcast(float val, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<avx512f>) noexcept
{
return _mm512_set1_ps(val);
}
template <class A>
- batch<double, A> inline broadcast(double val, requires_arch<avx512f>) noexcept
+ batch<double, A> XSIMD_INLINE broadcast(double val, requires_arch<avx512f>) noexcept
{
return _mm512_set1_pd(val);
}
// ceil
template <class A>
- inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF);
}
template <class A>
- inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF);
}
// compress
template <class A>
- inline batch<float, A> compress(batch<float, A> const& self, batch_bool<float, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> compress(batch<float, A> const& self, batch_bool<float, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_compress_ps(mask.mask(), self);
}
template <class A>
- inline batch<double, A> compress(batch<double, A> const& self, batch_bool<double, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> compress(batch<double, A> const& self, batch_bool<double, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_compress_pd(mask.mask(), self);
}
template <class A>
- inline batch<int32_t, A> compress(batch<int32_t, A> const& self, batch_bool<int32_t, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int32_t, A> compress(batch<int32_t, A> const& self, batch_bool<int32_t, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_compress_epi32(mask.mask(), self);
}
template <class A>
- inline batch<uint32_t, A> compress(batch<uint32_t, A> const& self, batch_bool<uint32_t, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> compress(batch<uint32_t, A> const& self, batch_bool<uint32_t, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_compress_epi32(mask.mask(), self);
}
template <class A>
- inline batch<int64_t, A> compress(batch<int64_t, A> const& self, batch_bool<int64_t, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int64_t, A> compress(batch<int64_t, A> const& self, batch_bool<int64_t, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_compress_epi64(mask.mask(), self);
}
template <class A>
- inline batch<uint64_t, A> compress(batch<uint64_t, A> const& self, batch_bool<uint64_t, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> compress(batch<uint64_t, A> const& self, batch_bool<uint64_t, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_compress_epi64(mask.mask(), self);
}
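
All of the maskz-compress kernels above share the same semantics: lanes whose mask bit is set are packed to the front of the result and the remaining lanes are zero-filled. A scalar reference model, purely illustrative and independent of xsimd:

    // Scalar model of _mm512_maskz_compress_*: selected lanes are packed
    // to the front, the tail is zero-filled (illustrative sketch only).
    #include <array>
    #include <cstddef>
    #include <cstdint>

    template <class T, std::size_t N>
    std::array<T, N> compress_ref(std::array<T, N> const& in, uint64_t mask)
    {
        std::array<T, N> out {};          // zero-filled tail
        std::size_t pos = 0;
        for (std::size_t i = 0; i < N; ++i)
            if (mask & (uint64_t(1) << i))
                out[pos++] = in[i];
        return out;
    }

The expand kernels further down perform the inverse operation: packed front lanes are scattered back to the positions selected by the mask.
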
@@ -697,19 +697,19 @@ namespace xsimd
namespace detail
{
template <class A>
- inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
{
return _mm512_cvtepi32_ps(self);
}
template <class A>
- inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx512f>) noexcept
{
return _mm512_cvttps_epi32(self);
}
template <class A>
- inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
{
return _mm512_cvtepu32_ps(self);
}
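
At the user level these fast_cast kernels sit behind xsimd's value-conversion entry points. A minimal sketch, assuming the public xsimd::batch_cast helper and an AVX512F target; the function name is made up for illustration:

    #include <xsimd/xsimd.hpp>
    #include <cstdint>

    // Truncating float -> int32 conversion; on an AVX512F build this is
    // expected to lower to the _mm512_cvttps_epi32 path shown above.
    xsimd::batch<int32_t, xsimd::avx512f>
    truncate_to_int(xsimd::batch<float, xsimd::avx512f> const& x)
    {
        return xsimd::batch_cast<int32_t>(x);
    }
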
@@ -725,13 +725,13 @@ namespace xsimd
{
// complex_low
template <class A>
- inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
{
__m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
return _mm512_permutex2var_ps(self.real(), idx, self.imag());
}
template <class A>
- inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
{
__m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11);
return _mm512_permutex2var_pd(self.real(), idx, self.imag());
@@ -739,13 +739,13 @@ namespace xsimd
// complex_high
template <class A>
- inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
{
__m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
return _mm512_permutex2var_ps(self.real(), idx, self.imag());
}
template <class A>
- inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
{
__m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15);
return _mm512_permutex2var_pd(self.real(), idx, self.imag());
@@ -754,35 +754,35 @@ namespace xsimd
// div
template <class A>
- inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_div_ps(self, other);
}
template <class A>
- inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_div_pd(self, other);
}
// eq
template <class A>
- inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ);
}
template <class A>
- inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
return detail::compare_int_avx512f<A, T, _MM_CMPINT_EQ>(self, other);
}
template <class A, class T>
- inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return register_type(~self.data ^ other.data);
@@ -790,126 +790,126 @@ namespace xsimd
// expand
template <class A>
- inline batch<float, A> expand(batch<float, A> const& self, batch_bool<float, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> expand(batch<float, A> const& self, batch_bool<float, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_expand_ps(mask.mask(), self);
}
template <class A>
- inline batch<double, A> expand(batch<double, A> const& self, batch_bool<double, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> expand(batch<double, A> const& self, batch_bool<double, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_expand_pd(mask.mask(), self);
}
template <class A>
- inline batch<int32_t, A> expand(batch<int32_t, A> const& self, batch_bool<int32_t, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int32_t, A> expand(batch<int32_t, A> const& self, batch_bool<int32_t, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_expand_epi32(mask.mask(), self);
}
template <class A>
- inline batch<uint32_t, A> expand(batch<uint32_t, A> const& self, batch_bool<uint32_t, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> expand(batch<uint32_t, A> const& self, batch_bool<uint32_t, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_expand_epi32(mask.mask(), self);
}
template <class A>
- inline batch<int64_t, A> expand(batch<int64_t, A> const& self, batch_bool<int64_t, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int64_t, A> expand(batch<int64_t, A> const& self, batch_bool<int64_t, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_expand_epi64(mask.mask(), self);
}
template <class A>
- inline batch<uint64_t, A> expand(batch<uint64_t, A> const& self, batch_bool<uint64_t, A> const& mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> expand(batch<uint64_t, A> const& self, batch_bool<uint64_t, A> const& mask, requires_arch<avx512f>) noexcept
{
return _mm512_maskz_expand_epi64(mask.mask(), self);
}
// floor
template <class A>
- inline batch<float, A> floor(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF);
}
template <class A>
- inline batch<double, A> floor(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF);
}
// fnma
template <class A>
- inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fnmadd_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fnmadd_pd(x, y, z);
}
// fma
template <class A>
- inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmadd_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmadd_pd(x, y, z);
}
// fms
template <class A>
- inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmsub_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmsub_pd(x, y, z);
}
// from bool
template <class A, class T>
- inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
{
return select(self, batch<T, A>(1), batch<T, A>(0));
}
// from_mask
template <class T, class A>
- inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx512f>) noexcept
{
return static_cast<typename batch_bool<T, A>::register_type>(mask);
}
// gather
template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
- inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
- kernel::requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
{
return _mm512_i32gather_epi32(index, static_cast<const void*>(src), sizeof(T));
}
template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
- inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
- kernel::requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
{
return _mm512_i64gather_epi64(index, static_cast<const void*>(src), sizeof(T));
}
template <class A, class U, detail::enable_sized_integral_t<U, 4> = 0>
- inline batch<float, A> gather(batch<float, A> const&, float const* src,
- batch<U, A> const& index,
- kernel::requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
+ batch<U, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
{
return _mm512_i32gather_ps(index, src, sizeof(float));
}
template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
- inline batch<double, A>
+ XSIMD_INLINE batch<double, A>
gather(batch<double, A> const&, double const* src, batch<U, A> const& index,
kernel::requires_arch<avx512f>) noexcept
{
@@ -918,9 +918,9 @@ namespace xsimd
// gather: handmade conversions
template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
- inline batch<float, A> gather(batch<float, A> const&, double const* src,
- batch<V, A> const& index,
- requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, double const* src,
+ batch<V, A> const& index,
+ requires_arch<avx512f>) noexcept
{
const batch<double, A> low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double)));
const batch<double, A> high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double)));
@@ -928,9 +928,9 @@ namespace xsimd
}
template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
- inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
- batch<V, A> const& index,
- requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
+ batch<V, A> const& index,
+ requires_arch<avx512f>) noexcept
{
const batch<double, A> low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double)));
const batch<double, A> high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double)));
@@ -939,41 +939,41 @@ namespace xsimd
// ge
template <class A>
- inline batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ);
}
template <class A>
- inline batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
return detail::compare_int_avx512f<A, T, _MM_CMPINT_GE>(self, other);
}
// gt
template <class A>
- inline batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ);
}
template <class A>
- inline batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
return detail::compare_int_avx512f<A, T, _MM_CMPINT_GT>(self, other);
}
// haddp
template <class A>
- inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512f>) noexcept
{
// The following folds over the vector once:
// tmp1 = [a0..8, b0..8]
@@ -1034,7 +1034,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx512f>) noexcept
{
#define step1(I, a, b) \
batch<double, avx512f> res##I; \
@@ -1069,25 +1069,25 @@ namespace xsimd
// isnan
template <class A>
- inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q);
}
template <class A>
- inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q);
}
// ldexp
template <class A>
- inline batch<float, A> ldexp(const batch<float, A>& self, const batch<as_integer_t<float>, A>& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> ldexp(const batch<float, A>& self, const batch<as_integer_t<float>, A>& other, requires_arch<avx512f>) noexcept
{
return _mm512_scalef_ps(self, _mm512_cvtepi32_ps(other));
}
template <class A>
- inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512f>) noexcept
{
// FIXME: potential data loss here when converting other elements to
// int32 before converting them back to double.
@@ -1097,34 +1097,34 @@ namespace xsimd
// le
template <class A>
- inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ);
}
template <class A>
- inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
return detail::compare_int_avx512f<A, T, _MM_CMPINT_LE>(self, other);
}
// load_aligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
{
return _mm512_load_si512((__m512i const*)mem);
}
template <class A>
- inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
{
return _mm512_load_ps(mem);
}
template <class A>
- inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
{
return _mm512_load_pd(mem);
}
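
The aligned loads above require 64-byte aligned pointers on AVX512F. A minimal usage sketch, assuming the public batch::load_aligned and reduce_add entry points; the helper and its preconditions are illustrative only:

    #include <xsimd/xsimd.hpp>
    #include <cstddef>

    // Sums n floats from a 64-byte aligned buffer; n is assumed to be a
    // multiple of the batch size (16 floats per AVX512F batch).
    float sum_aligned(float const* p, std::size_t n)
    {
        using b = xsimd::batch<float, xsimd::avx512f>;
        b acc(0.0f);
        for (std::size_t i = 0; i < n; i += b::size)
            acc += b::load_aligned(p + i);   // dispatches to the kernel above
        return xsimd::reduce_add(acc);
    }
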
@@ -1133,7 +1133,7 @@ namespace xsimd
namespace detail
{
template <class A>
- inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx512f>) noexcept
{
__m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
__m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
@@ -1142,7 +1142,7 @@ namespace xsimd
return { real, imag };
}
template <class A>
- inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx512f>) noexcept
{
__m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14);
__m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15);
@@ -1154,59 +1154,59 @@ namespace xsimd
// load_unaligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
{
return _mm512_loadu_si512((__m512i const*)mem);
}
template <class A>
- inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
{
return _mm512_loadu_ps(mem);
}
template <class A>
- inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
{
return _mm512_loadu_pd(mem);
}
// lt
template <class A>
- inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ);
}
template <class A>
- inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
return detail::compare_int_avx512f<A, T, _MM_CMPINT_LT>(self, other);
}
// mask
template <class A, class T>
- inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
{
return self.data;
}
// max
template <class A>
- inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_max_ps(self, other);
}
template <class A>
- inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_max_pd(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -1246,17 +1246,17 @@ namespace xsimd
// min
template <class A>
- inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_min_ps(self, other);
}
template <class A>
- inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_min_pd(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -1296,17 +1296,17 @@ namespace xsimd
// mul
template <class A>
- inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_mul_ps(self, other);
}
template <class A>
- inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_mul_pd(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
@@ -1322,50 +1322,50 @@ namespace xsimd
// nearbyint
template <class A>
- inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
}
template <class A>
- inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
}
// nearbyint_as_int
template <class A>
- inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
- requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<avx512f>) noexcept
{
return _mm512_cvtps_epi32(self);
}
// neg
template <class A, class T>
- inline batch<T, A> neg(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
return 0 - self;
}
// neq
template <class A>
- inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ);
}
template <class A>
- inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
return ~(self == other);
}
template <class A, class T>
- inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return register_type(self.data ^ other.data);
@@ -1373,7 +1373,7 @@ namespace xsimd
// reciprocal
template <class A>
- inline batch<float, A>
+ XSIMD_INLINE batch<float, A>
reciprocal(batch<float, A> const& self,
kernel::requires_arch<avx512f>) noexcept
{
@@ -1381,7 +1381,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A>
+ XSIMD_INLINE batch<double, A>
reciprocal(batch<double, A> const& self,
kernel::requires_arch<avx512f>) noexcept
{
@@ -1390,7 +1390,7 @@ namespace xsimd
// reduce_add
template <class A>
- inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
{
__m128 tmp1 = _mm512_extractf32x4_ps(rhs, 0);
__m128 tmp2 = _mm512_extractf32x4_ps(rhs, 1);
@@ -1402,7 +1402,7 @@ namespace xsimd
return reduce_add(batch<float, sse4_2>(res3), sse4_2 {});
}
template <class A>
- inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE double reduce_add(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
{
__m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
__m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
@@ -1410,7 +1410,7 @@ namespace xsimd
return reduce_add(batch<double, avx2>(res1), avx2 {});
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
__m256i low, high;
detail::split_avx512(self, low, high);
@@ -1420,10 +1420,10 @@ namespace xsimd
// reduce_max
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
- inline T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
- constexpr batch_constant<batch<uint64_t, A>, 5, 6, 7, 8, 0, 0, 0, 0> mask;
- batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+ constexpr batch_constant<uint64_t, A, 5, 6, 7, 8, 0, 0, 0, 0> mask;
+ batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
batch<T, A> acc = max(self, step);
__m256i low = _mm512_castsi512_si256(acc);
return reduce_max(batch<T, avx2>(low));
@@ -1431,10 +1431,10 @@ namespace xsimd
// reduce_min
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
- inline T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
- constexpr batch_constant<batch<uint64_t, A>, 5, 6, 7, 8, 0, 0, 0, 0> mask;
- batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+ constexpr batch_constant<uint64_t, A, 5, 6, 7, 8, 0, 0, 0, 0> mask;
+ batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
batch<T, A> acc = min(self, step);
__m256i low = _mm512_castsi512_si256(acc);
return reduce_min(batch<T, avx2>(low));
@@ -1442,19 +1442,19 @@ namespace xsimd
// rsqrt
template <class A>
- inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
{
return _mm512_rsqrt14_ps(val);
}
template <class A>
- inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
{
return _mm512_rsqrt14_pd(val);
}
// sadd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -1474,52 +1474,52 @@ namespace xsimd
// scatter
template <class A, class T,
class = typename std::enable_if<std::is_same<uint32_t, T>::value || std::is_same<int32_t, T>::value, void>::type>
- inline void scatter(batch<T, A> const& src, T* dst,
- batch<int32_t, A> const& index,
- kernel::requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst,
+ batch<int32_t, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
{
_mm512_i32scatter_epi32(dst, index, src, sizeof(T));
}
template <class A, class T,
class = typename std::enable_if<std::is_same<uint64_t, T>::value || std::is_same<int64_t, T>::value, void>::type>
- inline void scatter(batch<T, A> const& src, T* dst,
- batch<int64_t, A> const& index,
- kernel::requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst,
+ batch<int64_t, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
{
_mm512_i64scatter_epi64(dst, index, src, sizeof(T));
}
template <class A>
- inline void scatter(batch<float, A> const& src, float* dst,
- batch<int32_t, A> const& index,
- kernel::requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void scatter(batch<float, A> const& src, float* dst,
+ batch<int32_t, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
{
_mm512_i32scatter_ps(dst, index, src, sizeof(float));
}
template <class A>
- inline void scatter(batch<double, A> const& src, double* dst,
- batch<int64_t, A> const& index,
- kernel::requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void scatter(batch<double, A> const& src, double* dst,
+ batch<int64_t, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
{
_mm512_i64scatter_pd(dst, index, src, sizeof(double));
}
// select
template <class A>
- inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx512f>) noexcept
{
return _mm512_mask_blend_ps(cond, false_br, true_br);
}
template <class A>
- inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx512f>) noexcept
{
return _mm512_mask_blend_pd(cond, false_br, true_br);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -1571,7 +1571,7 @@ namespace xsimd
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, avx512f {});
}
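
The constant-mask select overload above now takes batch_bool_constant<T, A, Values...> rather than batch_bool_constant<batch<T, A>, Values...>. A hedged sketch of the corresponding call-site spelling, assuming the public select overload that accepts a constant mask; the blend pattern is made up for illustration:

    #include <xsimd/xsimd.hpp>

    xsimd::batch<double, xsimd::avx512f>
    blend_even_odd(xsimd::batch<double, xsimd::avx512f> const& a,
                   xsimd::batch<double, xsimd::avx512f> const& b)
    {
        // Post-change spelling: element type and architecture are separate
        // template parameters of batch_bool_constant.
        constexpr xsimd::batch_bool_constant<double, xsimd::avx512f,
                                             true, false, true, false,
                                             true, false, true, false> even;
        return xsimd::select(even, a, b); // lanes 0,2,4,6 from a, the rest from b
    }
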
@@ -1589,32 +1589,32 @@ namespace xsimd
// set
template <class A>
- inline batch<float, A> set(batch<float, A> const&, requires_arch<avx512f>, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept
+ XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<avx512f>, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept
{
return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
}
template <class A>
- inline batch<double, A> set(batch<double, A> const&, requires_arch<avx512f>, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept
+ XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<avx512f>, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept
{
return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
{
return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
- T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
{
return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
}
template <class A, class T, detail::enable_signed_integer_t<T> = 0>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
- T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
- T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
- T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
{
#if defined(__clang__) || __GNUC__
return __extension__(__m512i)(__v32hi) {
@@ -1628,10 +1628,10 @@ namespace xsimd
}
template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
- T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
- T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
- T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
{
#if defined(__clang__) || __GNUC__
return __extension__(__m512i)(__v32hu) {
@@ -1645,14 +1645,14 @@ namespace xsimd
}
template <class A, class T, detail::enable_signed_integer_t<T> = 0>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
- T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
- T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
- T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
- T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
- T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
- T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
- T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
+ T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
+ T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
+ T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
+ T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
{
#if defined(__clang__) || __GNUC__
@@ -1670,14 +1670,14 @@ namespace xsimd
#endif
}
template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
- T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
- T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
- T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
- T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
- T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
- T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
- T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
+ T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
+ T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
+ T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
+ T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
{
#if defined(__clang__) || __GNUC__
@@ -1696,7 +1696,7 @@ namespace xsimd
}
template <class A, class T, class... Values>
- inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx512f>, Values... values) noexcept
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx512f>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
using register_type = typename batch_bool<T, A>::register_type;
@@ -1708,9 +1708,9 @@ namespace xsimd
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7, ITy I8, ITy I9, ITy I10, ITy I11, ITy I12, ITy I13, ITy I14, ITy I15>
- inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y,
- batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask,
- requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y,
+ batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask,
+ requires_arch<avx512f>) noexcept
{
constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6);
@@ -1726,7 +1726,7 @@ namespace xsimd
}
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
- inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept
{
constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7);
// shuffle within lane
@@ -1742,7 +1742,7 @@ namespace xsimd
// slide_left
template <size_t N, class A, class T>
- inline batch<T, A> slide_left(batch<T, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const&, requires_arch<avx512f>) noexcept
{
static_assert(N == 0xDEAD, "not implemented yet");
return {};
@@ -1750,7 +1750,7 @@ namespace xsimd
// slide_right
template <size_t N, class A, class T>
- inline batch<T, A> slide_right(batch<T, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const&, requires_arch<avx512f>) noexcept
{
static_assert(N == 0xDEAD, "not implemented yet");
return {};
@@ -1758,19 +1758,19 @@ namespace xsimd
// sqrt
template <class A>
- inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
{
return _mm512_sqrt_ps(val);
}
template <class A>
- inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
{
return _mm512_sqrt_pd(val);
}
// ssub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -1785,7 +1785,7 @@ namespace xsimd
// store
template <class T, class A>
- inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
constexpr auto size = batch_bool<T, A>::size;
@@ -1795,51 +1795,51 @@ namespace xsimd
// store_aligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_store_si512((__m512i*)mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_store_si512((__m512i*)mem, self);
}
template <class A>
- inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_store_ps(mem, self);
}
template <class A>
- inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_store_pd(mem, self);
}
// store_unaligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_storeu_si512((__m512i*)mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_storeu_si512((__m512i*)mem, self);
}
template <class A>
- inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_storeu_ps(mem, self);
}
template <class A>
- inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_storeu_pd(mem, self);
}
// sub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -1868,88 +1868,88 @@ namespace xsimd
}
}
template <class A>
- inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_sub_ps(self, other);
}
template <class A>
- inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
return _mm512_sub_pd(self, other);
}
// swizzle (dynamic version)
template <class A>
- inline batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
{
return _mm512_permutexvar_ps(mask, self);
}
template <class A>
- inline batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
{
return _mm512_permutexvar_pd(mask, self);
}
template <class A>
- inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
{
return _mm512_permutexvar_epi64(mask, self);
}
template <class A>
- inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx512f {}));
}
template <class A>
- inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
{
return _mm512_permutexvar_epi32(mask, self);
}
template <class A>
- inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {}));
}
// swizzle (constant version)
template <class A, uint32_t... Vs>
- inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
- return swizzle(self, (batch<uint32_t, A>)mask, avx512f {});
+ return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint64_t... Vs>
- inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
- return swizzle(self, (batch<uint64_t, A>)mask, avx512f {});
+ return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint64_t... Vs>
- inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
- return swizzle(self, (batch<uint64_t, A>)mask, avx512f {});
+ return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint64_t... Vs>
- inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
- return swizzle(self, (batch<uint64_t, A>)mask, avx512f {});
+ return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint32_t... Vs>
- inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
- return swizzle(self, (batch<uint32_t, A>)mask, avx512f {});
+ return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint32_t... Vs>
- inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
- return swizzle(self, (batch<uint32_t, A>)mask, avx512f {});
+ return swizzle(self, mask.as_batch(), avx512f {});
}
namespace detail
@@ -1973,28 +1973,28 @@ namespace xsimd
uint16_t I24, uint16_t I25, uint16_t I26, uint16_t I27, uint16_t I28, uint16_t I29, uint16_t I30, uint16_t I31>
struct fold_batch_constant
{
- using type = batch_constant<batch<uint32_t, A>, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2,
+ using type = batch_constant<uint32_t, A, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2,
I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
};
}
template <class A, uint16_t... Idx, class _ = typename std::enable_if<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value, void>::type>
- inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Idx...>, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
{
constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
}
template <class A>
- inline batch<uint16_t, A>
- swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<uint16_t, A>
+ swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
{
// FIXME: this sequence is very inefficient, but it's here to catch
// a pattern generated by detail::reduce from xsimd_generic_math.hpp.
// The whole pattern is actually decently folded by GCC and Clang,
            // so bear with it.
- constexpr batch_constant<batch<uint32_t, A>, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
+ constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
alignas(A::alignment()) uint16_t buffer[32];
@@ -2004,21 +2004,21 @@ namespace xsimd
}
template <class A, uint16_t... Vs>
- inline batch<int16_t, A>
- swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<int16_t, A>
+ swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {}));
}
// trunc
template <class A>
- inline batch<float, A>
+ XSIMD_INLINE batch<float, A>
trunc(batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
}
template <class A>
- inline batch<double, A>
+ XSIMD_INLINE batch<double, A>
trunc(batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
@@ -2026,7 +2026,7 @@ namespace xsimd
// zip_hi
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A>
+ XSIMD_INLINE batch<T, A>
zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
__m512i lo, hi;
@@ -2064,7 +2064,7 @@ namespace xsimd
1);
}
template <class A>
- inline batch<float, A>
+ XSIMD_INLINE batch<float, A>
zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
auto lo = _mm512_unpacklo_ps(self, other);
@@ -2078,7 +2078,7 @@ namespace xsimd
1);
}
template <class A>
- inline batch<double, A>
+ XSIMD_INLINE batch<double, A>
zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other));
@@ -2094,7 +2094,7 @@ namespace xsimd
// zip_lo
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A>
+ XSIMD_INLINE batch<T, A>
zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
{
__m512i lo, hi;
@@ -2132,7 +2132,7 @@ namespace xsimd
2);
}
template <class A>
- inline batch<float, A>
+ XSIMD_INLINE batch<float, A>
zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
{
auto lo = _mm512_unpacklo_ps(self, other);
@@ -2146,7 +2146,7 @@ namespace xsimd
2);
}
template <class A>
- inline batch<double, A>
+ XSIMD_INLINE batch<double, A>
zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
{
auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other));
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_constants.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_constants.hpp
index 22dd5d3e303..51411d28774 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_constants.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_constants.hpp
@@ -24,34 +24,34 @@ namespace xsimd
#define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \
template <class T> \
- inline T NAME() noexcept \
+ XSIMD_INLINE T NAME() noexcept \
{ \
return T(NAME<typename T::value_type>()); \
} \
template <> \
- inline float NAME<float>() noexcept \
+ XSIMD_INLINE float NAME<float>() noexcept \
{ \
return SINGLE; \
} \
template <> \
- inline double NAME<double>() noexcept \
+ XSIMD_INLINE double NAME<double>() noexcept \
{ \
return DOUBLE; \
}
#define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \
template <class T> \
- inline T NAME() noexcept \
+ XSIMD_INLINE T NAME() noexcept \
{ \
return T(NAME<typename T::value_type>()); \
} \
template <> \
- inline float NAME<float>() noexcept \
+ XSIMD_INLINE float NAME<float>() noexcept \
{ \
return bit_cast<float>((uint32_t)SINGLE); \
} \
template <> \
- inline double NAME<double>() noexcept \
+ XSIMD_INLINE double NAME<double>() noexcept \
{ \
return bit_cast<double>((uint64_t)DOUBLE); \
}
@@ -168,7 +168,7 @@ namespace xsimd
}
template <class T>
- inline constexpr T allbits() noexcept
+ XSIMD_INLINE constexpr T allbits() noexcept
{
return T(detail::allbits_impl<typename T::value_type>::get_value());
}
@@ -178,19 +178,19 @@ namespace xsimd
*****************************/
template <class T>
- inline constexpr as_integer_t<T> mask1frexp() noexcept
+ XSIMD_INLINE constexpr as_integer_t<T> mask1frexp() noexcept
{
return as_integer_t<T>(mask1frexp<typename T::value_type>());
}
template <>
- inline constexpr int32_t mask1frexp<float>() noexcept
+ XSIMD_INLINE constexpr int32_t mask1frexp<float>() noexcept
{
return 0x7f800000;
}
template <>
- inline constexpr int64_t mask1frexp<double>() noexcept
+ XSIMD_INLINE constexpr int64_t mask1frexp<double>() noexcept
{
return 0x7ff0000000000000;
}
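+    // Illustrative sketch, assuming an IEEE-754 float: mask1frexp<float>() selects the
+    // 8 exponent bits, so the biased and unbiased exponents of a value can be read as
+    //
+    //   uint32_t bits   = bit_cast<uint32_t>(1.5f);                       // 0x3fc00000
+    //   int32_t  biased = (bits & mask1frexp<float>()) >> nmb<float>();   // 127
+    //   int32_t  e      = biased - maxexponent<float>();                  // 0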
@@ -200,19 +200,19 @@ namespace xsimd
*****************************/
template <class T>
- inline constexpr as_integer_t<T> mask2frexp() noexcept
+ XSIMD_INLINE constexpr as_integer_t<T> mask2frexp() noexcept
{
return as_integer_t<T>(mask2frexp<typename T::value_type>());
}
template <>
- inline constexpr int32_t mask2frexp<float>() noexcept
+ XSIMD_INLINE constexpr int32_t mask2frexp<float>() noexcept
{
return 0x3f000000;
}
template <>
- inline constexpr int64_t mask2frexp<double>() noexcept
+ XSIMD_INLINE constexpr int64_t mask2frexp<double>() noexcept
{
return 0x3fe0000000000000;
}
@@ -222,19 +222,19 @@ namespace xsimd
******************************/
template <class T>
- inline constexpr as_integer_t<T> maxexponent() noexcept
+ XSIMD_INLINE constexpr as_integer_t<T> maxexponent() noexcept
{
return as_integer_t<T>(maxexponent<typename T::value_type>());
}
template <>
- inline constexpr int32_t maxexponent<float>() noexcept
+ XSIMD_INLINE constexpr int32_t maxexponent<float>() noexcept
{
return 127;
}
template <>
- inline constexpr int64_t maxexponent<double>() noexcept
+ XSIMD_INLINE constexpr int64_t maxexponent<double>() noexcept
{
return 1023;
}
@@ -244,19 +244,19 @@ namespace xsimd
******************************/
template <class T>
- inline constexpr as_integer_t<T> maxexponentm1() noexcept
+ XSIMD_INLINE constexpr as_integer_t<T> maxexponentm1() noexcept
{
return as_integer_t<T>(maxexponentm1<typename T::value_type>());
}
template <>
- inline constexpr int32_t maxexponentm1<float>() noexcept
+ XSIMD_INLINE constexpr int32_t maxexponentm1<float>() noexcept
{
return 126;
}
template <>
- inline constexpr int64_t maxexponentm1<double>() noexcept
+ XSIMD_INLINE constexpr int64_t maxexponentm1<double>() noexcept
{
return 1022;
}
@@ -266,19 +266,19 @@ namespace xsimd
**********************/
template <class T>
- inline constexpr int32_t nmb() noexcept
+ XSIMD_INLINE constexpr int32_t nmb() noexcept
{
return nmb<typename T::value_type>();
}
template <>
- inline constexpr int32_t nmb<float>() noexcept
+ XSIMD_INLINE constexpr int32_t nmb<float>() noexcept
{
return 23;
}
template <>
- inline constexpr int32_t nmb<double>() noexcept
+ XSIMD_INLINE constexpr int32_t nmb<double>() noexcept
{
return 52;
}
@@ -288,7 +288,7 @@ namespace xsimd
***********************/
template <class T>
- inline constexpr T zero() noexcept
+ XSIMD_INLINE constexpr T zero() noexcept
{
return T(typename T::value_type(0));
}
@@ -353,7 +353,7 @@ namespace xsimd
template <>
struct minvalue_impl<float>
{
- inline static float get_value() noexcept
+ XSIMD_INLINE static float get_value() noexcept
{
return bit_cast<float>((uint32_t)0xff7fffff);
}
@@ -362,7 +362,7 @@ namespace xsimd
template <>
struct minvalue_impl<double>
{
- inline static double get_value() noexcept
+ XSIMD_INLINE static double get_value() noexcept
{
return bit_cast<double>((uint64_t)0xffefffffffffffff);
}
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_emulated.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_emulated.hpp
new file mode 100644
index 00000000000..ef7fd0191a7
--- /dev/null
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_emulated.hpp
@@ -0,0 +1,757 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_EMULATED_HPP
+#define XSIMD_EMULATED_HPP
+
+#include <complex>
+#include <limits>
+#include <numeric>
+#include <type_traits>
+
+#include "../arch/xsimd_scalar.hpp"
+
+#include "../types/xsimd_emulated_register.hpp"
+#include "../types/xsimd_utils.hpp"
+
+namespace xsimd
+{
+ template <typename T, class A, bool... Values>
+ struct batch_bool_constant;
+
+ template <class T_out, class T_in, class A>
+ XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
+
+ template <typename T, class A, T... Values>
+ struct batch_constant;
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // fwd
+ template <class A, class T, size_t I>
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+ template <class A, typename T, typename ITy, ITy... Indices>
+ XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
+
+ namespace detail
+ {
+ template <size_t I, class F, class... Bs>
+ auto emulated_apply(F func, Bs const&... bs) -> decltype(func(bs.data[I]...))
+ {
+ return func(bs.data[I]...);
+ }
+
+ template <class F, class B, class... Bs, size_t... Is>
+ auto emulated_apply(F func, ::xsimd::detail::index_sequence<Is...>, B const& b, Bs const&... bs) -> std::array<decltype(func(b.data[0], bs.data[0]...)), B::size>
+ {
+ return { emulated_apply<Is>(func, b, bs...)... };
+ }
+
+ template <class B, class F, class... Bs>
+ auto emulated_apply(F func, B const& b, Bs const&... bs) -> std::array<decltype(func(b.data[0], bs.data[0]...)), B::size>
+ {
+ return emulated_apply(func, ::xsimd::detail::make_index_sequence<B::size>(), b, bs...);
+ }
+ }
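+        // Illustrative sketch of the pattern used by every kernel below, assuming the
+        // library is built with XSIMD_WITH_EMULATED and a 128-bit batch of four floats
+        // (emulated_apply is an internal helper, not public API):
+        //
+        //   using b4f = batch<float, emulated<128>>;
+        //   b4f x { 1.f, 2.f, 3.f, 4.f }, y { 4.f, 3.f, 2.f, 1.f };
+        //   b4f sum = detail::emulated_apply(
+        //       [](float a, float b) { return a + b; }, x, y);   // {5, 5, 5, 5}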
+
+ // abs
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v)
+ { return xsimd::abs(v); },
+ self);
+ }
+
+ // add
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::add(v0, v1); },
+ self, other);
+ }
+
+ // all
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return std::all_of(self.data.begin(), self.data.end(), [](T v)
+ { return bool(v); });
+ }
+
+ // any
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return std::any_of(self.data.begin(), self.data.end(), [](T v)
+ { return bool(v); });
+ }
+
+ // batch_bool_cast
+ template <class A, class T_out, class T_in, size_t N = 8 * sizeof(T_in) * batch<T_in, A>::size>
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<emulated<N>>) noexcept
+ {
+ return { self.data };
+ }
+
+ // bitwise_and
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::bitwise_and(v0, v1); },
+ self, other);
+ }
+
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](bool v0, bool v1)
+ { return xsimd::bitwise_and(v0, v1); },
+ self, other);
+ }
+
+ // bitwise_andnot
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::bitwise_andnot(v0, v1); },
+ self, other);
+ }
+
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](bool v0, bool v1)
+ { return xsimd::bitwise_andnot(v0, v1); },
+ self, other);
+ }
+
+ // bitwise_lshift
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([other](T v)
+ { return xsimd::bitwise_lshift(v, other); },
+ self);
+ }
+
+ // bitwise_not
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v)
+ { return xsimd::bitwise_not(v); },
+ self);
+ }
+
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](bool v)
+ { return xsimd::bitwise_not(v); },
+ self);
+ }
+
+ // bitwise_or
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::bitwise_or(v0, v1); },
+ self, other);
+ }
+
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](bool v0, bool v1)
+ { return xsimd::bitwise_or(v0, v1); },
+ self, other);
+ }
+
+ // bitwise_rshift
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([other](T v)
+ { return xsimd::bitwise_rshift(v, other); },
+ self);
+ }
+
+ // bitwise_xor
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::bitwise_xor(v0, v1); },
+ self, other);
+ }
+
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](bool v0, bool v1)
+ { return xsimd::bitwise_xor(v0, v1); },
+ self, other);
+ }
+
+ // bitwise_cast
+ template <class A, class T_in, class T_out, size_t N = 8 * sizeof(T_in) * batch<T_in, A>::size>
+ XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T_out, A>::size;
+ std::array<T_out, size> result;
+ char* raw_data = reinterpret_cast<char*>(result.data());
+ const char* raw_input = reinterpret_cast<const char*>(self.data.data());
+ memcpy(raw_data, raw_input, size * sizeof(T_out));
+ return result;
+ }
+
+ // broadcast
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ batch<T, A> XSIMD_INLINE broadcast(T val, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<T, size> r;
+ std::fill(r.begin(), r.end(), val);
+ return r;
+ }
+
+ // store_complex
+ namespace detail
+ {
+ // complex_low
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> complex_low(batch<std::complex<T>, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<T, size> result;
+ for (size_t i = 0; i < size / 2; ++i)
+ {
+ result[2 * i] = self.real().data[i];
+ result[1 + 2 * i] = self.imag().data[i];
+ }
+ return result;
+ }
+ // complex_high
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> complex_high(batch<std::complex<T>, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<T, size> result;
+ for (size_t i = 0; i < size / 2; ++i)
+ {
+ result[2 * i] = self.real().data[i + size / 2];
+ result[1 + 2 * i] = self.imag().data[i + size / 2];
+ }
+ return result;
+ }
+ }
+
+ // decr_if
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<emulated<N>>) noexcept
+ {
+ return self - batch<T, A>(mask.data);
+ }
+
+ // div
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::div(v0, v1); },
+ self, other);
+ }
+
+ // fast_cast
+ namespace detail
+ {
+ template <class A, size_t N = 8 * sizeof(float) * batch<float, A>::size>
+ XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](int32_t v)
+ { return float(v); },
+ self);
+ }
+
+ template <class A, size_t N = 8 * sizeof(float) * batch<float, A>::size>
+ XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](uint32_t v)
+ { return float(v); },
+ self);
+ }
+
+ template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
+ XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](int64_t v)
+ { return double(v); },
+ self);
+ }
+
+ template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
+ XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& self, batch<double, A> const&, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](uint64_t v)
+ { return double(v); },
+ self);
+ }
+
+ template <class A, size_t N = 8 * sizeof(int32_t) * batch<int32_t, A>::size>
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](float v)
+ { return int32_t(v); },
+ self);
+ }
+
+ template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
+ XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](double v)
+ { return int64_t(v); },
+ self);
+ }
+ }
+
+ // eq
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, emulated<N>> eq(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::eq(v0, v1); },
+ self, other);
+ }
+
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch_bool<T, A>::size>
+ XSIMD_INLINE batch_bool<T, emulated<N>> eq(batch_bool<T, emulated<N>> const& self, batch_bool<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](bool v0, bool v1)
+ { return xsimd::eq(v0, v1); },
+ self, other);
+ }
+
+ // from_bool
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](bool v)
+ { return T(v); },
+ self);
+ }
+
+ // from_mask
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<bool, size> vmask;
+ for (size_t i = 0; i < size; ++i)
+ vmask[i] = (mask >> i) & 1u;
+ return vmask;
+ }
+
+ // ge
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, emulated<N>> ge(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::ge(v0, v1); },
+ self, other);
+ }
+
+ // gt
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, emulated<N>> gt(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::gt(v0, v1); },
+ self, other);
+ }
+
+ // haddp
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> haddp(batch<T, A> const* row, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<T, size> r;
+ for (size_t i = 0; i < size; ++i)
+ r[i] = std::accumulate(row[i].data.begin() + 1, row[i].data.end(), row[i].data.front());
+ return r;
+ }
+
+ // incr_if
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<emulated<N>>) noexcept
+ {
+ return self + batch<T, A>(mask.data);
+ }
+
+ // insert
+ template <class A, class T, size_t I, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<emulated<N>>) noexcept
+ {
+ batch<T, A> other = self;
+ other.data[I] = val;
+ return other;
+ }
+
+ // isnan
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
+ XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v)
+ { return xsimd::isnan(v); },
+ self);
+ }
+
+ // load_aligned
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<T, size> res;
+ std::copy(mem, mem + size, res.begin());
+ return res;
+ }
+
+ // load_unaligned
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<T, size> res;
+ std::copy(mem, mem + size, res.begin());
+ return res;
+ }
+
+ // load_complex
+ namespace detail
+ {
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<std::complex<T>, A> load_complex(batch<T, A> const& hi, batch<T, A> const& lo, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<T, size> real, imag;
+ for (size_t i = 0; i < size / 2; ++i)
+ {
+ real[i] = hi.data[2 * i];
+ imag[i] = hi.data[1 + 2 * i];
+ }
+ for (size_t i = 0; i < size / 2; ++i)
+ {
+ real[size / 2 + i] = lo.data[2 * i];
+ imag[size / 2 + i] = lo.data[1 + 2 * i];
+ }
+ return { real, imag };
+ }
+ }
+
+ // le
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, emulated<N>> le(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::le(v0, v1); },
+ self, other);
+ }
+
+ // lt
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, emulated<N>> lt(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::lt(v0, v1); },
+ self, other);
+ }
+
+ // mask
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ uint64_t res = 0;
+ for (size_t i = 0; i < size; ++i)
+ res |= (self.data[i] ? 1u : 0u) << i;
+ return res;
+ }
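+        // Illustrative round trip, assuming a 4-lane emulated batch: lane i maps to bit i,
+        // so mask() and from_mask() (exposed as batch_bool::mask / batch_bool::from_mask)
+        // invert each other:
+        //
+        //   batch_bool<float, emulated<128>> b { true, false, true, false };
+        //   uint64_t bits = b.mask();                  // 0b0101
+        //   auto b2 = decltype(b)::from_mask(bits);    // {true, false, true, false}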
+
+ // max
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::max(v0, v1); },
+ self, other);
+ }
+
+ // min
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::min(v0, v1); },
+ self, other);
+ }
+
+ // mul
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::mul(v0, v1); },
+ self, other);
+ }
+
+ // nearbyint_as_int
+ template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<as_integer_t<T>, A> nearbyint_as_int(batch<T, A> const& self,
+ requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v)
+ { return xsimd::nearbyint_as_int(v); },
+ self);
+ }
+
+ // neg
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v)
+ { return xsimd::neg(v); },
+ self);
+ }
+
+ // neq
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::neq(v0, v1); },
+ self, other);
+ }
+
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](bool v0, bool v1)
+ { return xsimd::neq(v0, v1); },
+ self, other);
+ }
+
+ // reduce_add
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<T, size> buffer;
+ self.store_unaligned(buffer.data());
+ return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin());
+ }
+
+ // reduce_max
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y)
+ { return xsimd::max(x, y); });
+ }
+
+ // reduce_min
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y)
+ { return xsimd::min(x, y); });
+ }
+
+ // rsqrt
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v)
+ { return xsimd::rsqrt(v); },
+ self);
+ }
+
+ // select
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](bool c, T t, T f)
+ { return xsimd::select(c, t, f); },
+ cond, true_br, false_br);
+ }
+
+ template <class A, class T, bool... Values>
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ static_assert(sizeof...(Values) == size, "consistent init");
+ return select((batch_bool<T, A>)cond, true_br, false_br, emulated<8 * sizeof(T) * size> {});
+ }
+
+ // shuffle
+ template <class A, typename T, class ITy, ITy... Is>
+        XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Is...> mask, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ batch<ITy, A> bmask = mask;
+ std::array<T, size> res;
+ for (size_t i = 0; i < size; ++i)
+ res[i] = bmask.data[i] < size ? x.data[bmask.data[i]] : y.data[bmask.data[i] - size];
+ return res;
+ }
+
+ // sqrt
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v)
+ { return xsimd::sqrt(v); },
+ self);
+ }
+
+ // slide_left
+ template <size_t M, class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<T, size> result;
+ char* raw_data = reinterpret_cast<char*>(result.data());
+ memset(raw_data, 0, M);
+ memcpy(raw_data + M, reinterpret_cast<const char*>(x.data.data()), sizeof(T) * result.size() - M);
+ return result;
+ }
+
+ // slide_right
+ template <size_t M, class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ std::array<T, size> result;
+ char* raw_data = reinterpret_cast<char*>(result.data());
+ memcpy(raw_data, reinterpret_cast<const char*>(x.data.data()) + M, sizeof(T) * result.size() - M);
+ memset(raw_data + sizeof(T) * result.size() - M, 0, M);
+ return result;
+ }
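+        // Illustrative note: as on the other architectures, the slide amount M is a byte
+        // count. Assuming a 4-lane emulated batch of uint32_t with x = {1, 2, 3, 4}:
+        //
+        //   slide_left<4>(x)  -> {0, 1, 2, 3}   // zero-fill on the left, shift by one lane
+        //   slide_right<4>(x) -> {2, 3, 4, 0}   // drop one lane, zero-fill on the right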
+
+ // sadd
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::sadd(v0, v1); },
+ self, other);
+ }
+
+ // set
+ template <class A, class T, size_t N, class... Values>
+ XSIMD_INLINE batch<T, emulated<N>> set(batch<T, emulated<N>> const&, requires_arch<emulated<N>>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<T, emulated<N>>::size, "consistent init");
+ return { typename batch<T, emulated<N>>::register_type { static_cast<T>(values)... } };
+ }
+
+ template <class A, class T, size_t N, class... Values>
+ XSIMD_INLINE batch_bool<T, emulated<N>> set(batch_bool<T, emulated<N>> const&, requires_arch<emulated<N>>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<T, emulated<N>>::size, "consistent init");
+ return { std::array<bool, sizeof...(Values)> { static_cast<bool>(values)... } };
+ }
+
+ // ssub
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::ssub(v0, v1); },
+ self, other);
+ }
+
+ // store_aligned
+ template <class A, class T, size_t N>
+ XSIMD_INLINE void store_aligned(T* mem, batch<T, emulated<N>> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ std::copy(self.data.begin(), self.data.end(), mem);
+ }
+
+ // store_unaligned
+ template <class A, class T, size_t N>
+ XSIMD_INLINE void store_unaligned(T* mem, batch<T, emulated<N>> const& self, requires_arch<emulated<N>>) noexcept
+ {
+ std::copy(self.data.begin(), self.data.end(), mem);
+ }
+
+ // sub
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ return detail::emulated_apply([](T v0, T v1)
+ { return xsimd::sub(v0, v1); },
+ self, other);
+ }
+
+ // swizzle
+
+ template <class A, typename T, class ITy, ITy... Is>
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<ITy, A, Is...> mask, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ batch<ITy, A> bmask = mask;
+ std::array<T, size> res;
+ for (size_t i = 0; i < size; ++i)
+ res[i] = self.data[bmask.data[i]];
+ return res;
+ }
+
+ // zip_hi
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ // Note: irregular behavior for odd numbers.
+ std::array<T, size> res;
+ if (size % 2)
+ {
+ for (size_t i = 0; i < size; ++i)
+ res[i] = (i % 2 ? self : other).data[size / 2 + i / 2];
+ }
+ else
+ {
+ for (size_t i = 0; i < size; ++i)
+ res[i] = (i % 2 ? other : self).data[size / 2 + i / 2];
+ }
+ return res;
+ }
+
+ // zip_lo
+ template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ // Note: irregular behavior for odd numbers.
+ std::array<T, size> res;
+ for (size_t i = 0; i < size; ++i)
+ res[i] = (i % 2 ? other : self).data[i / 2];
+ return res;
+ }
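+        // Illustrative result, assuming a 4-lane emulated batch with
+        // self = {a0, a1, a2, a3} and other = {b0, b1, b2, b3}:
+        //
+        //   zip_lo(self, other) -> {a0, b0, a1, b1}   // interleave the low halves
+        //   zip_hi(self, other) -> {a2, b2, a3, b3}   // interleave the high halves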
+ }
+}
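+// Illustrative usage sketch, assuming the library is configured with XSIMD_WITH_EMULATED:
+// emulated<N> encodes the register width in bits, so a portable 128-bit batch can be
+// requested explicitly and every operation falls back to the lane-wise kernels above.
+//
+//   using arch = xsimd::emulated<128>;
+//   xsimd::batch<int32_t, arch> v { 1, 2, 3, 4 };
+//   auto w = v + v;                        // dispatches to add(..., requires_arch<arch>)
+//   int32_t total = xsimd::reduce_add(w);  // 20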
+
+#endif
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma3_avx.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma3_avx.hpp
index 64e9ed65d1f..99262531476 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma3_avx.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma3_avx.hpp
@@ -23,52 +23,52 @@ namespace xsimd
// fnma
template <class A>
- inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+ XSIMD_INLINE batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fnmadd_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+ XSIMD_INLINE batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fnmadd_pd(x, y, z);
}
// fnms
template <class A>
- inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+ XSIMD_INLINE batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fnmsub_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+ XSIMD_INLINE batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fnmsub_pd(x, y, z);
}
// fma
template <class A>
- inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+ XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fmadd_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+ XSIMD_INLINE batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fmadd_pd(x, y, z);
}
// fms
template <class A>
- inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+ XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fmsub_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+ XSIMD_INLINE batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fmsub_pd(x, y, z);
}
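+        // Reference semantics of the four fused kernels above, each computed by the
+        // corresponding FMA3 intrinsic with a single rounding step:
+        //
+        //   fma(x, y, z)  ==  x * y + z
+        //   fms(x, y, z)  ==  x * y - z
+        //   fnma(x, y, z) == -x * y + z
+        //   fnms(x, y, z) == -x * y - z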
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma3_sse.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma3_sse.hpp
index 55c38f13a4d..9b126166ac0 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma3_sse.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma3_sse.hpp
@@ -22,52 +22,52 @@ namespace xsimd
using namespace types;
// fnma
template <class A>
- inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ XSIMD_INLINE batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fnmadd_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ XSIMD_INLINE batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fnmadd_pd(x, y, z);
}
// fnms
template <class A>
- inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ XSIMD_INLINE batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fnmsub_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ XSIMD_INLINE batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fnmsub_pd(x, y, z);
}
// fma
template <class A>
- inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fmadd_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ XSIMD_INLINE batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fmadd_pd(x, y, z);
}
// fms
template <class A>
- inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fmsub_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ XSIMD_INLINE batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fmsub_pd(x, y, z);
}
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma4.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma4.hpp
index 6a97d711e91..e51c7c52a82 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma4.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_fma4.hpp
@@ -23,52 +23,52 @@ namespace xsimd
// fnma
template <class A>
- inline batch<float, A> fnma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+ XSIMD_INLINE batch<float, A> fnma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_nmacc_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fnma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+ XSIMD_INLINE batch<double, A> fnma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_nmacc_pd(x, y, z);
}
// fnms
template <class A>
- inline batch<float, A> fnms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+ XSIMD_INLINE batch<float, A> fnms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_nmsub_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fnms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+ XSIMD_INLINE batch<double, A> fnms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_nmsub_pd(x, y, z);
}
// fma
template <class A>
- inline batch<float, A> fma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+ XSIMD_INLINE batch<float, A> fma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_macc_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+ XSIMD_INLINE batch<double, A> fma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_macc_pd(x, y, z);
}
// fms
template <class A>
- inline batch<float, A> fms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+ XSIMD_INLINE batch<float, A> fms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_msub_ps(x, y, z);
}
template <class A>
- inline batch<double, A> fms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+ XSIMD_INLINE batch<double, A> fms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_msub_pd(x, y, z);
}
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_generic_fwd.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_generic_fwd.hpp
index 87dcaa886f3..02708d60f70 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_generic_fwd.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_generic_fwd.hpp
@@ -22,21 +22,21 @@ namespace xsimd
{
// forward declaration
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept;
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
template <class A, class T>
- inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline T hadd(batch<T, A> const& self, requires_arch<generic>) noexcept;
+ XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<generic>) noexcept;
}
}
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_i8mm_neon64.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_i8mm_neon64.hpp
new file mode 100644
index 00000000000..55339230203
--- /dev/null
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_i8mm_neon64.hpp
@@ -0,0 +1,17 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_I8MM_NEON64_HPP
+#define XSIMD_I8MM_NEON64_HPP
+
+#include "../types/xsimd_i8mm_neon64_register.hpp"
+
+#endif
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_isa.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_isa.hpp
index 0edd7767417..5b714b29918 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_isa.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_isa.hpp
@@ -16,6 +16,10 @@
#include "./xsimd_generic_fwd.hpp"
+#if XSIMD_WITH_EMULATED
+#include "./xsimd_emulated.hpp"
+#endif
+
#if XSIMD_WITH_SSE2
#include "./xsimd_sse2.hpp"
#endif
@@ -104,6 +108,10 @@
#include "./xsimd_neon64.hpp"
#endif
+#if XSIMD_WITH_I8MM_NEON64
+#include "./xsimd_i8mm_neon64.hpp"
+#endif
+
#if XSIMD_WITH_SVE
#include "./xsimd_sve.hpp"
#endif
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_neon.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_neon.hpp
index 54f09fb6636..cd161305ffd 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_neon.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_neon.hpp
@@ -23,124 +23,130 @@
// Wrap intrinsics so we can pass them as function pointers
// - OP: intrinsics name prefix, e.g., vorrq
// - RT: type traits to deduce intrinsics return types
-#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
- namespace wrap \
- { \
- inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \
- { \
- return ::OP##_u8(a, b); \
- } \
- inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \
- { \
- return ::OP##_s8(a, b); \
- } \
- inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
- { \
- return ::OP##_u16(a, b); \
- } \
- inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
- { \
- return ::OP##_s16(a, b); \
- } \
- inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
- { \
- return ::OP##_u32(a, b); \
- } \
- inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
- { \
- return ::OP##_s32(a, b); \
- } \
+#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \
+ { \
+ return ::OP##_u8(a, b); \
+ } \
+ XSIMD_INLINE RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
+ { \
+ return ::OP##_u16(a, b); \
+ } \
+ XSIMD_INLINE RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
+ { \
+ return ::OP##_u32(a, b); \
+ } \
}
-#define WRAP_BINARY_INT(OP, RT) \
- WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
- namespace wrap \
- { \
- inline RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \
- { \
- return ::OP##_u64(a, b); \
- } \
- inline RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept \
- { \
- return ::OP##_s64(a, b); \
- } \
- }
-
-#define WRAP_BINARY_FLOAT(OP, RT) \
+#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
+ WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \
namespace wrap \
{ \
- inline RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \
+ XSIMD_INLINE RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \
+ { \
+ return ::OP##_s8(a, b); \
+ } \
+ XSIMD_INLINE RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
{ \
- return ::OP##_f32(a, b); \
+ return ::OP##_s16(a, b); \
+ } \
+ XSIMD_INLINE RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
+ { \
+ return ::OP##_s32(a, b); \
} \
}
-#define WRAP_UNARY_INT_EXCLUDING_64(OP) \
- namespace wrap \
- { \
- inline uint8x16_t OP##_u8(uint8x16_t a) noexcept \
- { \
- return ::OP##_u8(a); \
- } \
- inline int8x16_t OP##_s8(int8x16_t a) noexcept \
- { \
- return ::OP##_s8(a); \
- } \
- inline uint16x8_t OP##_u16(uint16x8_t a) noexcept \
- { \
- return ::OP##_u16(a); \
- } \
- inline int16x8_t OP##_s16(int16x8_t a) noexcept \
- { \
- return ::OP##_s16(a); \
- } \
- inline uint32x4_t OP##_u32(uint32x4_t a) noexcept \
- { \
- return ::OP##_u32(a); \
- } \
- inline int32x4_t OP##_s32(int32x4_t a) noexcept \
- { \
- return ::OP##_s32(a); \
- } \
+#define WRAP_BINARY_INT(OP, RT) \
+ WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \
+ { \
+ return ::OP##_u64(a, b); \
+ } \
+ XSIMD_INLINE RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept \
+ { \
+ return ::OP##_s64(a, b); \
+ } \
}
-#define WRAP_UNARY_INT(OP) \
- WRAP_UNARY_INT_EXCLUDING_64(OP) \
- namespace wrap \
- { \
- inline uint64x2_t OP##_u64(uint64x2_t a) noexcept \
- { \
- return ::OP##_u64(a); \
- } \
- inline int64x2_t OP##_s64(int64x2_t a) noexcept \
- { \
- return ::OP##_s64(a); \
- } \
+#define WRAP_BINARY_FLOAT(OP, RT) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \
+ { \
+ return ::OP##_f32(a, b); \
+ } \
}
-#define WRAP_UNARY_FLOAT(OP) \
- namespace wrap \
- { \
- inline float32x4_t OP##_f32(float32x4_t a) noexcept \
- { \
- return ::OP##_f32(a); \
- } \
+#define WRAP_UNARY_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE uint8x16_t OP##_u8(uint8x16_t a) noexcept \
+ { \
+ return ::OP##_u8(a); \
+ } \
+ XSIMD_INLINE int8x16_t OP##_s8(int8x16_t a) noexcept \
+ { \
+ return ::OP##_s8(a); \
+ } \
+ XSIMD_INLINE uint16x8_t OP##_u16(uint16x8_t a) noexcept \
+ { \
+ return ::OP##_u16(a); \
+ } \
+ XSIMD_INLINE int16x8_t OP##_s16(int16x8_t a) noexcept \
+ { \
+ return ::OP##_s16(a); \
+ } \
+ XSIMD_INLINE uint32x4_t OP##_u32(uint32x4_t a) noexcept \
+ { \
+ return ::OP##_u32(a); \
+ } \
+ XSIMD_INLINE int32x4_t OP##_s32(int32x4_t a) noexcept \
+ { \
+ return ::OP##_s32(a); \
+ } \
+ }
+
+#define WRAP_UNARY_INT(OP) \
+ WRAP_UNARY_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE uint64x2_t OP##_u64(uint64x2_t a) noexcept \
+ { \
+ return ::OP##_u64(a); \
+ } \
+ XSIMD_INLINE int64x2_t OP##_s64(int64x2_t a) noexcept \
+ { \
+ return ::OP##_s64(a); \
+ } \
+ }
+
+#define WRAP_UNARY_FLOAT(OP) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE float32x4_t OP##_f32(float32x4_t a) noexcept \
+ { \
+ return ::OP##_f32(a); \
+ } \
}
// Dummy identity caster to ease coding
-inline uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; }
-inline int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; }
-inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; }
-inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; }
-inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; }
-inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; }
-inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; }
-inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; }
-inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; }
+XSIMD_INLINE uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; }
+XSIMD_INLINE int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; }
+XSIMD_INLINE uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; }
+XSIMD_INLINE int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; }
+XSIMD_INLINE uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; }
+XSIMD_INLINE int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; }
+XSIMD_INLINE uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; }
+XSIMD_INLINE int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; }
+XSIMD_INLINE float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; }
namespace xsimd
{
- template <class batch_type, bool... Values>
+ template <typename T, class A, bool... Values>
struct batch_bool_constant;
namespace kernel
@@ -204,6 +210,10 @@ namespace xsimd
uint32x4_t, int32x4_t,
float32x4_t>;
+ using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t>;
+
/**************************
* comparison dispatchers *
**************************/
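
The neon_dispatcher_impl machinery above bundles one wrapped intrinsic per NEON register type into a tuple and picks the right entry by matching on the register type of the arguments; the newly added excluding_int64f32_dispatcher is the same mechanism restricted to the 8/16/32-bit integer registers. A simplified, standalone model of that pattern, using scalar stand-in types instead of NEON vectors (this mirrors the idea only; it is not xsimd's actual detail::neon_dispatcher_impl):

#include <tuple>

// Simplified model of the dispatcher pattern: one function pointer per
// supported register type, selected by the type of the arguments.
template <class... Regs>
struct dispatcher_impl
{
    struct binary
    {
        std::tuple<Regs (*)(Regs, Regs)...> fns;

        template <class R>
        R apply(R lhs, R rhs) const
        {
            // std::get<R(*)(R, R)> picks the entry whose signature matches R.
            return std::get<R (*)(R, R)>(fns)(lhs, rhs);
        }
    };
};

// Usage with scalar stand-ins for uint32x4_t / float32x4_t etc.:
unsigned add_u(unsigned a, unsigned b) { return a + b; }
float add_f(float a, float b) { return a + b; }

int main()
{
    dispatcher_impl<unsigned, float>::binary d { std::make_tuple(&add_u, &add_f) };
    return static_cast<int>(d.apply(2u, 3u)); // calls add_u, returns 5
}
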
@@ -296,55 +306,55 @@ namespace xsimd
*************/
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
{
return vdupq_n_u8(uint8_t(val));
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
{
return vdupq_n_s8(int8_t(val));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
{
return vdupq_n_u16(uint16_t(val));
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
{
return vdupq_n_s16(int16_t(val));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
{
return vdupq_n_u32(uint32_t(val));
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
{
return vdupq_n_s32(int32_t(val));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
{
return vdupq_n_u64(uint64_t(val));
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
{
return vdupq_n_s64(int64_t(val));
}
template <class A>
- inline batch<float, A> broadcast(float val, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<neon>) noexcept
{
return vdupq_n_f32(val);
}
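
Each broadcast overload lowers to the matching vdupq_n_* intrinsic, so every lane of the 128-bit register receives the same scalar. At the user level this is what constructing a batch from a scalar does; a minimal usage sketch, assuming the usual public xsimd::batch interface (not part of this patch):

#include <xsimd/xsimd.hpp>

int main()
{
    // On a NEON target this ends up in the vdupq_n_f32-based kernel above:
    xsimd::batch<float> v(1.5f); // all lanes hold 1.5f
    return static_cast<int>(v.get(0)); // 1
}
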
@@ -354,13 +364,13 @@ namespace xsimd
*******/
template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept
{
return xsimd::types::detail::neon_vector_type<T> { args... };
}
template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
- inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
using unsigned_type = as_unsigned_integer_t<T>;
@@ -368,22 +378,22 @@ namespace xsimd
}
template <class A>
- inline batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept
+ XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept
{
return float32x4_t { f0, f1, f2, f3 };
}
template <class A>
- inline batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
- std::complex<float> c0, std::complex<float> c1,
- std::complex<float> c2, std::complex<float> c3) noexcept
+ XSIMD_INLINE batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
+ std::complex<float> c0, std::complex<float> c1,
+ std::complex<float> c2, std::complex<float> c3) noexcept
{
return batch<std::complex<float>, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
}
template <class A, class... Args>
- inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept
+ XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept
{
using register_type = typename batch_bool<float, A>::register_type;
using unsigned_type = as_unsigned_integer_t<float>;
@@ -395,55 +405,55 @@ namespace xsimd
*************/
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return vandq_u8(arg, vdupq_n_u8(1));
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return vandq_u16(arg, vdupq_n_u16(1));
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return vandq_u32(arg, vdupq_n_u32(1));
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return vandq_u64(arg, vdupq_n_u64(1));
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1));
}
template <class A>
- inline batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept
{
return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f))));
}
@@ -463,50 +473,50 @@ namespace xsimd
#endif
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return xsimd_aligned_load(vld1q_u8, uint8_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return xsimd_aligned_load(vld1q_s8, int8_t*, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return xsimd_aligned_load(vld1q_u16, uint16_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return xsimd_aligned_load(vld1q_s16, int16_t*, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return xsimd_aligned_load(vld1q_u32, uint32_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return xsimd_aligned_load(vld1q_s32, int32_t*, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return xsimd_aligned_load(vld1q_u64, uint64_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return xsimd_aligned_load(vld1q_s64, int64_t*, src);
}
template <class A>
- inline batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
{
return xsimd_aligned_load(vld1q_f32, float*, src);
}
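
The aligned-load kernels all go through an xsimd_aligned_load macro that is defined earlier in the file (outside this diff) and #undef'd just below. NEON's vld1q_* intrinsics do not require alignment themselves, so the macro presumably only attaches an alignment hint to the pointer before the load. A hypothetical shape for such a macro, purely illustrative:

// Hypothetical; the real definition is outside this hunk.
#if defined(__clang__) || defined(__GNUC__)
#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
#else
#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
#endif
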
@@ -514,50 +524,50 @@ namespace xsimd
#undef xsimd_aligned_load
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u8((uint8_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s8((int8_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u16((uint16_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s16((int16_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u32((uint32_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s32((int32_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u64((uint64_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s64((int64_t*)src);
}
template <class A>
- inline batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept
{
return vld1q_f32(src);
}
@@ -567,61 +577,61 @@ namespace xsimd
*********/
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
vst1q_u8((uint8_t*)dst, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
vst1q_s8((int8_t*)dst, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
vst1q_u16((uint16_t*)dst, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
vst1q_s16((int16_t*)dst, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
vst1q_u32((uint32_t*)dst, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
vst1q_s32((int32_t*)dst, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
vst1q_u64((uint64_t*)dst, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
vst1q_s64((int64_t*)dst, src);
}
template <class A>
- inline void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept
{
vst1q_f32(dst, src);
}
template <class A, class T>
- inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
{
store_aligned<A>(dst, src, A {});
}
@@ -631,7 +641,7 @@ namespace xsimd
****************/
template <class A>
- inline batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept
{
using real_batch = batch<float, A>;
const float* buf = reinterpret_cast<const float*>(mem);
@@ -642,7 +652,7 @@ namespace xsimd
}
template <class A>
- inline batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept
{
return load_complex_aligned<A>(mem, cvt, A {});
}
@@ -652,7 +662,7 @@ namespace xsimd
*****************/
template <class A>
- inline void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
{
float32x4x2_t tmp;
tmp.val[0] = src.real();
@@ -662,7 +672,7 @@ namespace xsimd
}
template <class A>
- inline void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
+ XSIMD_INLINE void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
{
store_complex_aligned(dst, src, A {});
}
@@ -672,55 +682,55 @@ namespace xsimd
*******/
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs)));
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vnegq_s8(rhs);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs)));
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vnegq_s16(rhs);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs)));
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vnegq_s32(rhs);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return batch<T, A> { -rhs.get(0), -rhs.get(1) };
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return batch<T, A> { -rhs.get(0), -rhs.get(1) };
}
template <class A>
- inline batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept
{
return vnegq_f32(rhs);
}
@@ -733,7 +743,7 @@ namespace xsimd
WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type)
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::neon_dispatcher::binary dispatcher = {
@@ -744,6 +754,38 @@ namespace xsimd
return dispatcher.apply(register_type(lhs), register_type(rhs));
}
+ /*******
+ * avg *
+ *******/
+
+ WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type)
+
+ template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
+ std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /********
+ * avgr *
+ ********/
+
+ WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type)
+
+ template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
+ std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
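
The new avg and avgr kernels wrap NEON's halving-add intrinsics: vhaddq_* computes a truncating average and vrhaddq_* a rounding average, both without intermediate overflow, and the enable_if restricts them to unsigned 8/16/32-bit element types. A scalar reference for the two semantics (illustrative, not part of the patch):

#include <cstdint>

// Truncating average, matching vhaddq: (a + b) >> 1 computed without overflow.
inline uint8_t avg_trunc(uint8_t a, uint8_t b)
{
    return static_cast<uint8_t>((a & b) + ((a ^ b) >> 1));
}

// Rounding average, matching vrhaddq: (a + b + 1) >> 1 computed without overflow.
inline uint8_t avg_round(uint8_t a, uint8_t b)
{
    return static_cast<uint8_t>((a | b) - ((a ^ b) >> 1));
}

int main()
{
    // avg_trunc(5, 2) == 3, avg_round(5, 2) == 4
    return avg_trunc(5, 2) + avg_round(5, 2); // 7
}
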
/********
* sadd *
********/
@@ -751,7 +793,7 @@ namespace xsimd
WRAP_BINARY_INT(vqaddq, detail::identity_return_type)
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::neon_dispatcher::binary dispatcher = {
@@ -770,7 +812,7 @@ namespace xsimd
WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type)
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::neon_dispatcher::binary dispatcher = {
@@ -788,7 +830,7 @@ namespace xsimd
WRAP_BINARY_INT(vqsubq, detail::identity_return_type)
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::neon_dispatcher::binary dispatcher = {
@@ -807,7 +849,7 @@ namespace xsimd
WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type)
template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
- inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::excluding_int64_dispatcher::binary dispatcher = {
@@ -823,20 +865,20 @@ namespace xsimd
#if defined(XSIMD_FAST_INTEGER_DIVISION)
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs));
}
#endif
template <class A>
- inline batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
{
// from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html
// get an initial estimate of 1/b.
@@ -860,7 +902,7 @@ namespace xsimd
WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type)
template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
- inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
@@ -871,7 +913,7 @@ namespace xsimd
}
template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
- inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary;
@@ -882,13 +924,13 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
- inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
}
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
- inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
{
return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
}
@@ -900,25 +942,25 @@ namespace xsimd
namespace detail
{
template <class A>
- inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
{
return vcvtq_f32_s32(self);
}
template <class A>
- inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
{
return vcvtq_f32_u32(self);
}
template <class A>
- inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept
{
return vcvtq_s32_f32(self);
}
template <class A>
- inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept
{
return vcvtq_u32_f32(self);
}
@@ -933,7 +975,7 @@ namespace xsimd
WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type)
template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
- inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
@@ -944,7 +986,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
- inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) });
}
@@ -957,7 +999,7 @@ namespace xsimd
WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type)
template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
- inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
@@ -968,7 +1010,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
- inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) });
}
@@ -981,7 +1023,7 @@ namespace xsimd
WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)
template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
- inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
@@ -992,7 +1034,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
- inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) });
}
@@ -1005,7 +1047,7 @@ namespace xsimd
WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type)
template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
- inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
@@ -1016,7 +1058,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
- inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) });
}
@@ -1026,7 +1068,7 @@ namespace xsimd
*******************/
template <class A, class T_out, class T_in>
- inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept
{
using register_type = typename batch_bool<T_out, A>::register_type;
return register_type(self);
@@ -1040,7 +1082,7 @@ namespace xsimd
namespace detail
{
- inline float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+ XSIMD_INLINE float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept
{
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
vreinterpretq_u32_f32(rhs)));
@@ -1059,14 +1101,14 @@ namespace xsimd
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
@@ -1080,14 +1122,14 @@ namespace xsimd
namespace detail
{
- inline float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+ XSIMD_INLINE float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept
{
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
vreinterpretq_u32_f32(rhs)));
}
template <class V>
- inline V bitwise_or_neon(V const& lhs, V const& rhs) noexcept
+ XSIMD_INLINE V bitwise_or_neon(V const& lhs, V const& rhs) noexcept
{
const neon_dispatcher::binary dispatcher = {
std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
@@ -1099,14 +1141,14 @@ namespace xsimd
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
@@ -1120,14 +1162,14 @@ namespace xsimd
namespace detail
{
- inline float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+ XSIMD_INLINE float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept
{
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs),
vreinterpretq_u32_f32(rhs)));
}
template <class V>
- inline V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept
+ XSIMD_INLINE V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept
{
const neon_dispatcher::binary dispatcher = {
std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
@@ -1139,14 +1181,14 @@ namespace xsimd
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
@@ -1157,7 +1199,7 @@ namespace xsimd
*******/
template <class A, class T>
- inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
{
return bitwise_xor(lhs, rhs, A {});
}
@@ -1170,23 +1212,23 @@ namespace xsimd
namespace detail
{
- inline int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
+ XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
{
return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
}
- inline uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
+ XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
{
return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
}
- inline float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
+ XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
{
return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
}
template <class V>
- inline V bitwise_not_neon(V const& arg) noexcept
+ XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept
{
const neon_dispatcher::unary dispatcher = {
std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
@@ -1199,14 +1241,14 @@ namespace xsimd
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
return detail::bitwise_not_neon(register_type(arg));
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return detail::bitwise_not_neon(register_type(arg));
@@ -1220,13 +1262,13 @@ namespace xsimd
namespace detail
{
- inline float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+ XSIMD_INLINE float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept
{
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
}
template <class V>
- inline V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept
+ XSIMD_INLINE V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept
{
const detail::neon_dispatcher::binary dispatcher = {
std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
@@ -1238,14 +1280,14 @@ namespace xsimd
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
@@ -1259,7 +1301,7 @@ namespace xsimd
WRAP_BINARY_FLOAT(vminq, detail::identity_return_type)
template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
- inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::excluding_int64_dispatcher::binary dispatcher = {
@@ -1270,7 +1312,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
- inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) };
}
@@ -1283,7 +1325,7 @@ namespace xsimd
WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type)
template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
- inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::excluding_int64_dispatcher::binary dispatcher = {
@@ -1294,7 +1336,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
- inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) };
}
@@ -1305,32 +1347,32 @@ namespace xsimd
namespace wrap
{
- inline int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
- inline int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
- inline int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
+ XSIMD_INLINE int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
+ XSIMD_INLINE int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
+ XSIMD_INLINE int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
}
WRAP_UNARY_FLOAT(vabsq)
namespace detail
{
- inline uint8x16_t abs_u8(uint8x16_t arg) noexcept
+ XSIMD_INLINE uint8x16_t abs_u8(uint8x16_t arg) noexcept
{
return arg;
}
- inline uint16x8_t abs_u16(uint16x8_t arg) noexcept
+ XSIMD_INLINE uint16x8_t abs_u16(uint16x8_t arg) noexcept
{
return arg;
}
- inline uint32x4_t abs_u32(uint32x4_t arg) noexcept
+ XSIMD_INLINE uint32x4_t abs_u32(uint32x4_t arg) noexcept
{
return arg;
}
}
template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
- inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::excluding_int64_dispatcher::unary dispatcher = {
@@ -1345,7 +1387,7 @@ namespace xsimd
********/
template <class A>
- inline batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
{
return vrsqrteq_f32(arg);
}
@@ -1355,7 +1397,7 @@ namespace xsimd
********/
template <class A>
- inline batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
{
batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg);
// one iter
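
vrsqrteq_f32 only yields a coarse estimate of 1/sqrt(x), so the kernel refines it; the usual NEON pattern is one or more Newton-Raphson steps through vrsqrtsq_f32, which returns (3 - a*b) / 2, followed by a multiply by x to recover sqrt(x). The refinement step in scalar form, as a sketch of the math rather than of the exact code cut off at this hunk boundary:

#include <cstdio>

// One Newton-Raphson step for y ~= 1/sqrt(x):  y' = y * (3 - x*y*y) / 2.
// This is what chaining vrsqrteq_f32 with vrsqrtsq_f32 computes lane-wise.
float refine_rsqrt(float x, float y)
{
    return y * (3.0f - x * y * y) * 0.5f;
}

int main()
{
    float x = 2.0f;
    float y = 0.7f;              // crude initial estimate of 1/sqrt(2) ~= 0.7071
    y = refine_rsqrt(x, y);      // much closer after one step
    std::printf("sqrt(2) ~= %f\n", x * y); // x * (1/sqrt(x)) == sqrt(x)
    return 0;
}
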
@@ -1371,13 +1413,13 @@ namespace xsimd
#ifdef __ARM_FEATURE_FMA
template <class A>
- inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
{
return vfmaq_f32(z, x, y);
}
template <class A>
- inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
{
return vfmaq_f32(-z, x, y);
}
@@ -1388,7 +1430,7 @@ namespace xsimd
*********/
template <class A>
- inline batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept
{
// row = (a,b,c,d)
float32x2_t tmp1, tmp2, tmp3;
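
haddp takes a pointer to four float batches and returns one batch whose i-th lane is the horizontal sum of row[i]; the NEON body assembles this from pairwise adds (vpadd_f32) over the low and high halves. A scalar reference for the contract (illustrative):

#include <array>

// Scalar reference for haddp on 4-lane float rows:
// out[i] = row[i][0] + row[i][1] + row[i][2] + row[i][3]
std::array<float, 4> haddp_ref(const std::array<float, 4> row[4])
{
    std::array<float, 4> out {};
    for (int i = 0; i < 4; ++i)
        out[i] = row[i][0] + row[i][1] + row[i][2] + row[i][3];
    return out;
}

int main()
{
    std::array<float, 4> rows[4] = { { 1, 2, 3, 4 }, { 1, 1, 1, 1 }, { 0, 0, 0, 0 }, { 2, 2, 2, 2 } };
    auto r = haddp_ref(rows); // {10, 4, 0, 8}
    return static_cast<int>(r[0]); // 10
}
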
@@ -1413,7 +1455,7 @@ namespace xsimd
**************/
template <class A>
- inline batch<float, A>
+ XSIMD_INLINE batch<float, A>
reciprocal(const batch<float, A>& x,
kernel::requires_arch<neon>) noexcept
{
@@ -1425,55 +1467,55 @@ namespace xsimd
**********/
template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
{
return vsetq_lane_u8(val, self, I);
}
template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
{
return vsetq_lane_s8(val, self, I);
}
template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
{
return vsetq_lane_u16(val, self, I);
}
template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<int16_t, A> insert(batch<int16_t, A> const& self, int16_t val, index<I>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<int16_t, A> insert(batch<int16_t, A> const& self, int16_t val, index<I>, requires_arch<neon>) noexcept
{
return vsetq_lane_s16(val, self, I);
}
template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
{
return vsetq_lane_u32(val, self, I);
}
template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
{
return vsetq_lane_s32(val, self, I);
}
template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
{
return vsetq_lane_u64(val, self, I);
}
template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
{
return vsetq_lane_s64(val, self, I);
}
template <class A, size_t I>
- inline batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept
{
return vsetq_lane_f32(val, self, I);
}
@@ -1483,8 +1525,8 @@ namespace xsimd
*******************/
template <class A>
- inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
- requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<neon>) noexcept
{
/* origin: https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047 */
// Contributors to this work are:
@@ -1553,7 +1595,7 @@ namespace xsimd
namespace detail
{
template <class T, class A, class V>
- inline T sum_batch(V const& arg) noexcept
+ XSIMD_INLINE T sum_batch(V const& arg) noexcept
{
T res = T(0);
for (std::size_t i = 0; i < batch<T, A>::size; ++i)
@@ -1565,7 +1607,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
{
uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg));
tmp = vpadd_u8(tmp, tmp);
@@ -1575,7 +1617,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
{
int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg));
tmp = vpadd_s8(tmp, tmp);
@@ -1585,7 +1627,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
{
uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg));
tmp = vpadd_u16(tmp, tmp);
@@ -1594,7 +1636,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
{
int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg));
tmp = vpadd_s16(tmp, tmp);
@@ -1603,7 +1645,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
{
uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg));
tmp = vpadd_u32(tmp, tmp);
@@ -1611,7 +1653,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
{
int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg));
tmp = vpadd_s32(tmp, tmp);
@@ -1619,13 +1661,13 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
- inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
{
return arg.get(0) + arg.get(1);
}
template <class A>
- inline float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept
{
float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg));
tmp = vpadd_f32(tmp, tmp);
@@ -1652,15 +1694,15 @@ namespace xsimd
namespace wrap
{
- inline uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); }
- inline int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); }
- inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); }
- inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); }
- inline uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); }
- inline int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); }
- inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); }
- inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); }
- inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); }
+ XSIMD_INLINE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); }
+ XSIMD_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); }
+ XSIMD_INLINE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); }
+ XSIMD_INLINE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); }
+ XSIMD_INLINE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); }
+ XSIMD_INLINE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); }
+ XSIMD_INLINE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); }
+ XSIMD_INLINE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); }
+ XSIMD_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); }
}
namespace detail
@@ -1688,7 +1730,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept
{
using bool_register_type = typename batch_bool<T, A>::register_type;
using register_type = typename batch<T, A>::register_type;
@@ -1701,7 +1743,7 @@ namespace xsimd
}
template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
{
return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
}
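
This overload handles a compile-time mask: batch_bool_constant<T, A, b...> (previously parameterized on the whole batch type, per the forward-declaration change near the top of the diff) carries the mask as a bool... pack, and the kernel simply expands it into a runtime batch_bool<T, A> { b... } and defers to the runtime select above. A minimal sketch of that expansion pattern with plain types (illustrative, not xsimd's definitions):

#include <array>
#include <cstddef>

// Compile-time mask carried as a bool pack, expanded into runtime storage.
template <class T, bool... Values>
struct const_mask
{
    static constexpr std::size_t size = sizeof...(Values);
    static std::array<bool, size> materialize() { return { Values... }; }
};

template <class T, std::size_t N>
std::array<T, N> select_ref(const std::array<bool, N>& m,
                            const std::array<T, N>& a,
                            const std::array<T, N>& b)
{
    std::array<T, N> out {};
    for (std::size_t i = 0; i < N; ++i)
        out[i] = m[i] ? a[i] : b[i];
    return out;
}

int main()
{
    using mask = const_mask<float, true, false, true, false>;
    std::array<float, 4> a { 1, 2, 3, 4 }, b { 10, 20, 30, 40 };
    auto r = select_ref(mask::materialize(), a, b); // {1, 20, 3, 40}
    return static_cast<int>(r[1]); // 20
}
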
@@ -1711,61 +1753,61 @@ namespace xsimd
**********/
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs));
return vcombine_u8(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs));
return vcombine_s8(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs));
return vcombine_u16(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs));
return vcombine_s16(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs));
return vcombine_u32(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs));
return vcombine_s32(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs));
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs));
}
template <class A>
- inline batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
{
float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs));
return vcombine_f32(tmp.val[0], tmp.val[1]);
@@ -1776,61 +1818,61 @@ namespace xsimd
**********/
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs));
return vcombine_u8(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs));
return vcombine_s8(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs));
return vcombine_u16(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs));
return vcombine_s16(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs));
return vcombine_u32(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs));
return vcombine_s32(tmp.val[0], tmp.val[1]);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs));
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs));
}
template <class A>
- inline batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
{
float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs));
return vcombine_f32(tmp.val[0], tmp.val[1]);
@@ -1843,14 +1885,14 @@ namespace xsimd
namespace detail
{
template <class A, class T>
- inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
{
assert(false && "extract_pair out of bounds");
return batch<T, A> {};
}
template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -1863,7 +1905,7 @@ namespace xsimd
}
template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -1876,7 +1918,7 @@ namespace xsimd
}
template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -1889,7 +1931,7 @@ namespace xsimd
}
template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -1902,7 +1944,7 @@ namespace xsimd
}
template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -1915,7 +1957,7 @@ namespace xsimd
}
template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -1928,7 +1970,7 @@ namespace xsimd
}
template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -1941,7 +1983,7 @@ namespace xsimd
}
template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -1954,7 +1996,7 @@ namespace xsimd
}
template <class A, size_t I, size_t... Is>
- inline batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -1967,7 +2009,7 @@ namespace xsimd
}
template <class A, class T, size_t... Is>
- inline batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
{
if (n == 0)
{
@@ -1981,7 +2023,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
assert(n < size && "index in bounds");
@@ -1995,14 +2037,14 @@ namespace xsimd
namespace detail
{
template <class A, class T>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
{
assert(false && "bitwise_lshift out of bounds");
return batch<T, A> {};
}
template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2015,7 +2057,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2028,7 +2070,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2041,7 +2083,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2054,7 +2096,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2067,7 +2109,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2080,7 +2122,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2093,7 +2135,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2106,7 +2148,7 @@ namespace xsimd
}
template <class A, class T, int... Is>
- inline batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
{
if (n == 0)
{
@@ -2120,7 +2162,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
{
constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
assert(0 <= n && n < size && "index in bounds");
@@ -2128,49 +2170,49 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_u8(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_s8(lhs, rhs);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_u16(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_s16(lhs, rhs);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_u32(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_s32(lhs, rhs);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_u64(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_s64(lhs, rhs);
}
@@ -2182,14 +2224,14 @@ namespace xsimd
namespace detail
{
template <class A, class T>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
{
assert(false && "bitwise_rshift out of bounds");
return batch<T, A> {};
}
template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2202,7 +2244,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2215,7 +2257,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2228,7 +2270,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2241,7 +2283,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2254,7 +2296,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2267,7 +2309,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2280,7 +2322,7 @@ namespace xsimd
}
template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -2293,7 +2335,7 @@ namespace xsimd
}
template <class A, class T, int... Is>
- inline batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
{
if (n == 0)
{
@@ -2307,7 +2349,7 @@ namespace xsimd
}
template <class A, class T>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
{
constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
assert(0 <= n && n < size && "index in bounds");
@@ -2315,37 +2357,37 @@ namespace xsimd
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_u8(lhs, vnegq_s8(rhs));
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_s8(lhs, vnegq_s8(rhs));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_u16(lhs, vnegq_s16(rhs));
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_s16(lhs, vnegq_s16(rhs));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_u32(lhs, vnegq_s32(rhs));
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
return vshlq_s32(lhs, vnegq_s32(rhs));
}
@@ -2357,26 +2399,26 @@ namespace xsimd
*******/
template <class A, class T, detail::enable_sized_t<T, 8> = 0>
- inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg));
return vget_lane_u64(tmp, 0) == ~0ULL;
}
template <class A, class T, detail::enable_sized_t<T, 1> = 0>
- inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
}
template <class A, class T, detail::enable_sized_t<T, 2> = 0>
- inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
}
template <class A, class T, detail::enable_sized_t<T, 4> = 0>
- inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
}
@@ -2386,26 +2428,26 @@ namespace xsimd
*******/
template <class A, class T, detail::enable_sized_t<T, 8> = 0>
- inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
uint32x2_t tmp = vqmovn_u64(arg);
return vget_lane_u64(vreinterpret_u64_u32(tmp), 0) != 0;
}
template <class A, class T, detail::enable_sized_t<T, 1> = 0>
- inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
}
template <class A, class T, detail::enable_sized_t<T, 2> = 0>
- inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
}
template <class A, class T, detail::enable_sized_t<T, 4> = 0>
- inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
{
return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
}
@@ -2414,45 +2456,45 @@ namespace xsimd
* bitwise_cast *
****************/
-#define WRAP_CAST(SUFFIX, TYPE) \
- namespace wrap \
- { \
- inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept \
- { \
- return ::vreinterpretq_##SUFFIX##_u8(a); \
- } \
- inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept \
- { \
- return ::vreinterpretq_##SUFFIX##_s8(a); \
- } \
- inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept \
- { \
- return ::vreinterpretq_##SUFFIX##_u16(a); \
- } \
- inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept \
- { \
- return ::vreinterpretq_##SUFFIX##_s16(a); \
- } \
- inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept \
- { \
- return ::vreinterpretq_##SUFFIX##_u32(a); \
- } \
- inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept \
- { \
- return ::vreinterpretq_##SUFFIX##_s32(a); \
- } \
- inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept \
- { \
- return ::vreinterpretq_##SUFFIX##_u64(a); \
- } \
- inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept \
- { \
- return ::vreinterpretq_##SUFFIX##_s64(a); \
- } \
- inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \
- { \
- return ::vreinterpretq_##SUFFIX##_f32(a); \
- } \
+#define WRAP_CAST(SUFFIX, TYPE) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u8(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s8(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u16(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s16(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u32(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s32(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u64(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s64(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_f32(a); \
+ } \
}
WRAP_CAST(u8, uint8x16_t)
@@ -2485,7 +2527,7 @@ namespace xsimd
};
template <class R, class... T>
- inline const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept
+ XSIMD_INLINE const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept
{
return { std::make_tuple(arg...) };
}
@@ -2524,7 +2566,7 @@ namespace xsimd
}
template <class A, class T, class R>
- inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept
{
const detail::neon_bitwise_caster caster = {
std::make_tuple(
@@ -2566,7 +2608,7 @@ namespace xsimd
*********/
template <class A>
- inline batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept
{
return !(arg == arg);
}
@@ -2578,7 +2620,7 @@ namespace xsimd
struct slider_left
{
template <class A, class T>
- inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
{
const auto left = vdupq_n_u8(0);
const auto right = bitwise_cast<uint8_t>(x).data;
@@ -2591,7 +2633,7 @@ namespace xsimd
struct slider_left<0>
{
template <class A, class T>
- inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
{
return x;
}
@@ -2599,7 +2641,7 @@ namespace xsimd
} // namespace detail
template <size_t N, class A, class T>
- inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept
{
return detail::slider_left<N> {}(x, A {});
}
@@ -2611,7 +2653,7 @@ namespace xsimd
struct slider_right
{
template <class A, class T>
- inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
{
const auto left = bitwise_cast<uint8_t>(x).data;
const auto right = vdupq_n_u8(0);
@@ -2624,7 +2666,7 @@ namespace xsimd
struct slider_right<16>
{
template <class A, class T>
- inline batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept
{
return batch<T, A> {};
}
@@ -2632,7 +2674,7 @@ namespace xsimd
} // namespace detail
template <size_t N, class A, class T>
- inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept
{
return detail::slider_right<N> {}(x, A {});
}
@@ -2643,27 +2685,27 @@ namespace xsimd
namespace wrap
{
template <size_t N>
- inline uint8x16_t rotate_right_u8(uint8x16_t a, uint8x16_t b) noexcept { return vextq_u8(a, b, N); }
+ XSIMD_INLINE uint8x16_t rotate_right_u8(uint8x16_t a, uint8x16_t b) noexcept { return vextq_u8(a, b, N); }
template <size_t N>
- inline int8x16_t rotate_right_s8(int8x16_t a, int8x16_t b) noexcept { return vextq_s8(a, b, N); }
+ XSIMD_INLINE int8x16_t rotate_right_s8(int8x16_t a, int8x16_t b) noexcept { return vextq_s8(a, b, N); }
template <size_t N>
- inline uint16x8_t rotate_right_u16(uint16x8_t a, uint16x8_t b) noexcept { return vextq_u16(a, b, N); }
+ XSIMD_INLINE uint16x8_t rotate_right_u16(uint16x8_t a, uint16x8_t b) noexcept { return vextq_u16(a, b, N); }
template <size_t N>
- inline int16x8_t rotate_right_s16(int16x8_t a, int16x8_t b) noexcept { return vextq_s16(a, b, N); }
+ XSIMD_INLINE int16x8_t rotate_right_s16(int16x8_t a, int16x8_t b) noexcept { return vextq_s16(a, b, N); }
template <size_t N>
- inline uint32x4_t rotate_right_u32(uint32x4_t a, uint32x4_t b) noexcept { return vextq_u32(a, b, N); }
+ XSIMD_INLINE uint32x4_t rotate_right_u32(uint32x4_t a, uint32x4_t b) noexcept { return vextq_u32(a, b, N); }
template <size_t N>
- inline int32x4_t rotate_right_s32(int32x4_t a, int32x4_t b) noexcept { return vextq_s32(a, b, N); }
+ XSIMD_INLINE int32x4_t rotate_right_s32(int32x4_t a, int32x4_t b) noexcept { return vextq_s32(a, b, N); }
template <size_t N>
- inline uint64x2_t rotate_right_u64(uint64x2_t a, uint64x2_t b) noexcept { return vextq_u64(a, b, N); }
+ XSIMD_INLINE uint64x2_t rotate_right_u64(uint64x2_t a, uint64x2_t b) noexcept { return vextq_u64(a, b, N); }
template <size_t N>
- inline int64x2_t rotate_right_s64(int64x2_t a, int64x2_t b) noexcept { return vextq_s64(a, b, N); }
+ XSIMD_INLINE int64x2_t rotate_right_s64(int64x2_t a, int64x2_t b) noexcept { return vextq_s64(a, b, N); }
template <size_t N>
- inline float32x4_t rotate_right_f32(float32x4_t a, float32x4_t b) noexcept { return vextq_f32(a, b, N); }
+ XSIMD_INLINE float32x4_t rotate_right_f32(float32x4_t a, float32x4_t b) noexcept { return vextq_f32(a, b, N); }
}
template <size_t N, class A, class T, detail::enable_neon_type_t<T> = 0>
- inline batch<T, A> rotate_right(batch<T, A> const& a, requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& a, requires_arch<neon>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::neon_dispatcher::binary dispatcher = {
@@ -2675,7 +2717,7 @@ namespace xsimd
}
}
- template <class batch_type, typename batch_type::value_type... Values>
+ template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@@ -2685,9 +2727,9 @@ namespace xsimd
***********/
template <class A, class T, class I, I... idx>
- inline batch<T, A> swizzle(batch<T, A> const& self,
- batch_constant<batch<I, A>, idx...>,
- requires_arch<neon>) noexcept
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
+ batch_constant<I, A, idx...>,
+ requires_arch<neon>) noexcept
{
static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices");
std::array<T, batch<T, A>::size> data;
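The hunks above (and the batch_bool_constant forward declaration in the next file) retemplate the compile-time constant types on the element type and architecture rather than on a batch type. A minimal caller-side sketch of the new spelling, assuming an aarch64 build and the public xsimd::swizzle entry point — the names below are illustrative and not taken from this diff:

    #include <cstdint>
    #include <xsimd/xsimd.hpp>

    // The index mask is now written batch_constant<element, arch, values...>
    // instead of the former batch_constant<batch<element, arch>, values...>.
    using arch = xsimd::neon64;
    using reverse_mask = xsimd::batch_constant<std::uint32_t, arch, 3, 2, 1, 0>;

    xsimd::batch<float, arch> reverse_lanes(xsimd::batch<float, arch> const& x)
    {
        // swizzle reorders the four float lanes of x according to the mask indices
        return xsimd::swizzle(x, reverse_mask{});
    }
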
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_neon64.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_neon64.hpp
index bc982c7ce63..d0999703349 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_neon64.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_neon64.hpp
@@ -21,7 +21,7 @@
namespace xsimd
{
- template <class batch_type, bool... Values>
+ template <typename T, class A, bool... Values>
struct batch_bool_constant;
namespace kernel
@@ -33,25 +33,25 @@ namespace xsimd
*******/
template <class A, class T, detail::enable_sized_t<T, 4> = 0>
- inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
{
return vminvq_u32(arg) == ~0U;
}
template <class A, class T, detail::enable_sized_t<T, 1> = 0>
- inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
{
return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
}
template <class A, class T, detail::enable_sized_t<T, 2> = 0>
- inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
{
return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
}
template <class A, class T, detail::enable_sized_t<T, 8> = 0>
- inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
{
return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
}
@@ -61,25 +61,25 @@ namespace xsimd
*******/
template <class A, class T, detail::enable_sized_t<T, 4> = 0>
- inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
{
return vmaxvq_u32(arg) != 0;
}
template <class A, class T, detail::enable_sized_t<T, 1> = 0>
- inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
{
return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
}
template <class A, class T, detail::enable_sized_t<T, 2> = 0>
- inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
{
return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
}
template <class A, class T, detail::enable_sized_t<T, 8> = 0>
- inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
{
return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
}
@@ -90,13 +90,13 @@ namespace xsimd
// Required to avoid ambiguous call
template <class A, class T>
- inline batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept
{
- return broadcast<neon64>(val, neon {});
+ return broadcast<A>(val, neon {});
}
template <class A>
- inline batch<double, A> broadcast(double val, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<neon64>) noexcept
{
return vdupq_n_f64(val);
}
@@ -106,13 +106,13 @@ namespace xsimd
*******/
template <class A>
- inline batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept
+ XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept
{
return float64x2_t { d0, d1 };
}
template <class A>
- inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept
+ XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept
{
using register_type = typename batch_bool<double, A>::register_type;
using unsigned_type = as_unsigned_integer_t<double>;
@@ -125,7 +125,7 @@ namespace xsimd
*************/
template <class A>
- inline batch<double, A> from_bool(batch_bool<double, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> from_bool(batch_bool<double, A> const& arg, requires_arch<neon64>) noexcept
{
return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.))));
}
@@ -142,13 +142,13 @@ namespace xsimd
#endif
template <class A>
- inline batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
{
return xsimd_aligned_load(vld1q_f64, double*, src);
}
template <class A>
- inline batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
{
return vld1q_f64(src);
}
@@ -159,13 +159,13 @@ namespace xsimd
*********/
template <class A>
- inline void store_aligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
+ XSIMD_INLINE void store_aligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
{
vst1q_f64(dst, src);
}
template <class A>
- inline void store_unaligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
+ XSIMD_INLINE void store_unaligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
{
return store_aligned<A>(dst, src, A {});
}
@@ -175,7 +175,7 @@ namespace xsimd
****************/
template <class A>
- inline batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>) noexcept
{
using real_batch = batch<double, A>;
const double* buf = reinterpret_cast<const double*>(mem);
@@ -186,7 +186,7 @@ namespace xsimd
}
template <class A>
- inline batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>) noexcept
{
return load_complex_aligned<A>(mem, cvt, A {});
}
@@ -196,7 +196,7 @@ namespace xsimd
*****************/
template <class A>
- inline void store_complex_aligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
+ XSIMD_INLINE void store_complex_aligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
{
float64x2x2_t tmp;
tmp.val[0] = src.real();
@@ -206,7 +206,7 @@ namespace xsimd
}
template <class A>
- inline void store_complex_unaligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
+ XSIMD_INLINE void store_complex_unaligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
{
store_complex_aligned(dst, src, A {});
}
@@ -216,19 +216,19 @@ namespace xsimd
*******/
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs)));
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vnegq_s64(rhs);
}
template <class A>
- inline batch<double, A> neg(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> neg(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vnegq_f64(rhs);
}
@@ -238,7 +238,7 @@ namespace xsimd
*******/
template <class A>
- inline batch<double, A> add(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> add(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vaddq_f64(lhs, rhs);
}
@@ -248,7 +248,7 @@ namespace xsimd
********/
template <class A>
- inline batch<double, A> sadd(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> sadd(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return add(lhs, rhs, neon64 {});
}
@@ -258,7 +258,7 @@ namespace xsimd
*******/
template <class A>
- inline batch<double, A> sub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> sub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vsubq_f64(lhs, rhs);
}
@@ -268,7 +268,7 @@ namespace xsimd
********/
template <class A>
- inline batch<double, A> ssub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> ssub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return sub(lhs, rhs, neon64 {});
}
@@ -278,7 +278,7 @@ namespace xsimd
*******/
template <class A>
- inline batch<double, A> mul(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> mul(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vmulq_f64(lhs, rhs);
}
@@ -289,19 +289,19 @@ namespace xsimd
#if defined(XSIMD_FAST_INTEGER_DIVISION)
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs));
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / vcvtq_f64_s64(rhs));
}
#endif
template <class A>
- inline batch<double, A> div(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> div(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vdivq_f64(lhs, rhs);
}
@@ -311,37 +311,37 @@ namespace xsimd
******/
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vceqq_u64(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vceqq_s64(lhs, rhs);
}
template <class A>
- inline batch_bool<double, A> eq(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vceqq_f64(lhs, rhs);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vceqq_u64(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vceqq_u64(lhs, rhs);
}
template <class A>
- inline batch_bool<double, A> eq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vceqq_u64(lhs, rhs);
}
@@ -352,25 +352,25 @@ namespace xsimd
namespace detail
{
template <class A>
- inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
{
return vcvtq_f64_s64(x);
}
template <class A>
- inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
{
return vcvtq_f64_u64(x);
}
template <class A>
- inline batch<int64_t, A> fast_cast(batch<double, A> const& x, batch<int64_t, A> const&, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& x, batch<int64_t, A> const&, requires_arch<neon64>) noexcept
{
return vcvtq_s64_f64(x);
}
template <class A>
- inline batch<uint64_t, A> fast_cast(batch<double, A> const& x, batch<uint64_t, A> const&, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<double, A> const& x, batch<uint64_t, A> const&, requires_arch<neon64>) noexcept
{
return vcvtq_u64_f64(x);
}
@@ -382,19 +382,19 @@ namespace xsimd
******/
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcltq_u64(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcltq_s64(lhs, rhs);
}
template <class A>
- inline batch_bool<double, A> lt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcltq_f64(lhs, rhs);
}
@@ -404,19 +404,19 @@ namespace xsimd
******/
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcleq_u64(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcleq_s64(lhs, rhs);
}
template <class A>
- inline batch_bool<double, A> le(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcleq_f64(lhs, rhs);
}
@@ -426,19 +426,19 @@ namespace xsimd
******/
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcgtq_u64(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcgtq_s64(lhs, rhs);
}
template <class A>
- inline batch_bool<double, A> gt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcgtq_f64(lhs, rhs);
}
@@ -448,19 +448,19 @@ namespace xsimd
******/
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcgeq_u64(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcgeq_s64(lhs, rhs);
}
template <class A>
- inline batch_bool<double, A> ge(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vcgeq_f64(lhs, rhs);
}
@@ -470,7 +470,7 @@ namespace xsimd
*******************/
template <class A, class T_out, class T_in>
- inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon64>) noexcept
{
using register_type = typename batch_bool<T_out, A>::register_type;
return register_type(self);
@@ -481,14 +481,14 @@ namespace xsimd
***************/
template <class A>
- inline batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs),
vreinterpretq_u64_f64(rhs)));
}
template <class A>
- inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vandq_u64(lhs, rhs);
}
@@ -498,14 +498,14 @@ namespace xsimd
**************/
template <class A>
- inline batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs),
vreinterpretq_u64_f64(rhs)));
}
template <class A>
- inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vorrq_u64(lhs, rhs);
}
@@ -515,14 +515,14 @@ namespace xsimd
***************/
template <class A>
- inline batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs),
vreinterpretq_u64_f64(rhs)));
}
template <class A>
- inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return veorq_u64(lhs, rhs);
}
@@ -532,7 +532,7 @@ namespace xsimd
*******/
template <class A>
- inline batch_bool<double, A> neq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return bitwise_xor(lhs, rhs, A {});
}
@@ -542,13 +542,13 @@ namespace xsimd
***************/
template <class A>
- inline batch<double, A> bitwise_not(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs)));
}
template <class A>
- inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return detail::bitwise_not_u64(rhs);
}
@@ -558,14 +558,14 @@ namespace xsimd
******************/
template <class A>
- inline batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs),
vreinterpretq_u64_f64(rhs)));
}
template <class A>
- inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vbicq_u64(lhs, rhs);
}
@@ -575,7 +575,7 @@ namespace xsimd
*******/
template <class A>
- inline batch<double, A> min(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> min(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vminq_f64(lhs, rhs);
}
@@ -585,7 +585,7 @@ namespace xsimd
*******/
template <class A>
- inline batch<double, A> max(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> max(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vmaxq_f64(lhs, rhs);
}
@@ -595,34 +595,34 @@ namespace xsimd
*******/
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return rhs;
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vabsq_s64(rhs);
}
template <class A>
- inline batch<double, A> abs(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> abs(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vabsq_f64(rhs);
}
template <class A>
- inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<neon64>) noexcept
{
return vcvtnq_s32_f32(self);
}
#if !defined(__GNUC__)
template <class A>
- inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
+ requires_arch<neon64>) noexcept
{
return vcvtnq_s64_f64(self);
}
@@ -633,7 +633,7 @@ namespace xsimd
**************/
template <class A>
- inline batch<double, A>
+ XSIMD_INLINE batch<double, A>
reciprocal(const batch<double, A>& x,
kernel::requires_arch<neon64>) noexcept
{
@@ -645,7 +645,7 @@ namespace xsimd
********/
template <class A>
- inline batch<double, A> rsqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vrsqrteq_f64(rhs);
}
@@ -655,7 +655,7 @@ namespace xsimd
********/
template <class A>
- inline batch<double, A> sqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vsqrtq_f64(rhs);
}
@@ -666,13 +666,13 @@ namespace xsimd
#ifdef __ARM_FEATURE_FMA
template <class A>
- inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
{
return vfmaq_f64(z, x, y);
}
template <class A>
- inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
{
return vfmaq_f64(-z, x, y);
}
@@ -683,7 +683,7 @@ namespace xsimd
*********/
template <class A>
- inline batch<double, A> haddp(const batch<double, A>* row, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> haddp(const batch<double, A>* row, requires_arch<neon64>) noexcept
{
return vpaddq_f64(row[0], row[1]);
}
@@ -693,7 +693,7 @@ namespace xsimd
**********/
template <class A, size_t I>
- inline batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<neon64>) noexcept
{
return vsetq_lane_f64(val, self, I);
}
@@ -705,60 +705,60 @@ namespace xsimd
// Wrap reducer intrinsics so we can pass them as function pointers
// - OP: intrinsics name prefix, e.g., vorrq
-#define WRAP_REDUCER_INT_EXCLUDING_64(OP) \
- namespace wrap \
- { \
- inline uint8_t OP##_u8(uint8x16_t a) noexcept \
- { \
- return ::OP##_u8(a); \
- } \
- inline int8_t OP##_s8(int8x16_t a) noexcept \
- { \
- return ::OP##_s8(a); \
- } \
- inline uint16_t OP##_u16(uint16x8_t a) noexcept \
- { \
- return ::OP##_u16(a); \
- } \
- inline int16_t OP##_s16(int16x8_t a) noexcept \
- { \
- return ::OP##_s16(a); \
- } \
- inline uint32_t OP##_u32(uint32x4_t a) noexcept \
- { \
- return ::OP##_u32(a); \
- } \
- inline int32_t OP##_s32(int32x4_t a) noexcept \
- { \
- return ::OP##_s32(a); \
- } \
+#define WRAP_REDUCER_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE uint8_t OP##_u8(uint8x16_t a) noexcept \
+ { \
+ return ::OP##_u8(a); \
+ } \
+ XSIMD_INLINE int8_t OP##_s8(int8x16_t a) noexcept \
+ { \
+ return ::OP##_s8(a); \
+ } \
+ XSIMD_INLINE uint16_t OP##_u16(uint16x8_t a) noexcept \
+ { \
+ return ::OP##_u16(a); \
+ } \
+ XSIMD_INLINE int16_t OP##_s16(int16x8_t a) noexcept \
+ { \
+ return ::OP##_s16(a); \
+ } \
+ XSIMD_INLINE uint32_t OP##_u32(uint32x4_t a) noexcept \
+ { \
+ return ::OP##_u32(a); \
+ } \
+ XSIMD_INLINE int32_t OP##_s32(int32x4_t a) noexcept \
+ { \
+ return ::OP##_s32(a); \
+ } \
}
-#define WRAP_REDUCER_INT(OP) \
- WRAP_REDUCER_INT_EXCLUDING_64(OP) \
- namespace wrap \
- { \
- inline uint64_t OP##_u64(uint64x2_t a) noexcept \
- { \
- return ::OP##_u64(a); \
- } \
- inline int64_t OP##_s64(int64x2_t a) noexcept \
- { \
- return ::OP##_s64(a); \
- } \
+#define WRAP_REDUCER_INT(OP) \
+ WRAP_REDUCER_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE uint64_t OP##_u64(uint64x2_t a) noexcept \
+ { \
+ return ::OP##_u64(a); \
+ } \
+ XSIMD_INLINE int64_t OP##_s64(int64x2_t a) noexcept \
+ { \
+ return ::OP##_s64(a); \
+ } \
}
-#define WRAP_REDUCER_FLOAT(OP) \
- namespace wrap \
- { \
- inline float OP##_f32(float32x4_t a) noexcept \
- { \
- return ::OP##_f32(a); \
- } \
- inline double OP##_f64(float64x2_t a) noexcept \
- { \
- return ::OP##_f64(a); \
- } \
+#define WRAP_REDUCER_FLOAT(OP) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE float OP##_f32(float32x4_t a) noexcept \
+ { \
+ return ::OP##_f32(a); \
+ } \
+ XSIMD_INLINE double OP##_f64(float64x2_t a) noexcept \
+ { \
+ return ::OP##_f64(a); \
+ } \
}
namespace detail
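The WRAP_REDUCER_* macros above exist because the NEON reducer intrinsics are compiler builtins whose addresses cannot always be taken, so each one is wrapped in an ordinary (now XSIMD_INLINE) function that can be stored in the reducer dispatcher's function-pointer table. A minimal standalone sketch of that dispatch idea, with hypothetical names rather than the actual xsimd dispatcher:

    #include <cstdint>
    #include <cstdio>

    // Stand-ins for the wrapped reducers: ordinary functions whose address
    // can be taken and stored, unlike a raw compiler intrinsic.
    static std::uint32_t reduce_add_u32(const std::uint32_t* v)
    {
        return v[0] + v[1] + v[2] + v[3];
    }

    static std::uint32_t reduce_max_u32(const std::uint32_t* v)
    {
        std::uint32_t m = v[0];
        for (int i = 1; i < 4; ++i)
            m = v[i] > m ? v[i] : m;
        return m;
    }

    // A tiny dispatch table in the spirit of neon_reducer_dispatcher::unary:
    // the wrappers become plain function pointers selected at run time.
    using reducer_fn = std::uint32_t (*)(const std::uint32_t*);

    int main()
    {
        const std::uint32_t lanes[4] = { 1, 7, 3, 5 };
        const reducer_fn table[] = { reduce_add_u32, reduce_max_u32 };
        std::printf("%u %u\n", table[0](lanes), table[1](lanes)); // 16 7
    }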
@@ -852,7 +852,7 @@ namespace xsimd
WRAP_REDUCER_FLOAT(vaddvq)
template <class A, class T, detail::enable_neon64_type_t<T> = 0>
- inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::neon_reducer_dispatcher::unary dispatcher = {
@@ -872,19 +872,19 @@ namespace xsimd
namespace wrap
{
- inline uint64_t vmaxvq_u64(uint64x2_t a) noexcept
+ XSIMD_INLINE uint64_t vmaxvq_u64(uint64x2_t a) noexcept
{
return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
}
- inline int64_t vmaxvq_s64(int64x2_t a) noexcept
+ XSIMD_INLINE int64_t vmaxvq_s64(int64x2_t a) noexcept
{
return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
}
}
template <class A, class T, detail::enable_neon64_type_t<T> = 0>
- inline typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::neon_reducer_dispatcher::unary dispatcher = {
@@ -904,19 +904,19 @@ namespace xsimd
namespace wrap
{
- inline uint64_t vminvq_u64(uint64x2_t a) noexcept
+ XSIMD_INLINE uint64_t vminvq_u64(uint64x2_t a) noexcept
{
return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
}
- inline int64_t vminvq_s64(int64x2_t a) noexcept
+ XSIMD_INLINE int64_t vminvq_s64(int64x2_t a) noexcept
{
return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
}
}
template <class A, class T, detail::enable_neon64_type_t<T> = 0>
- inline typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept
{
using register_type = typename batch<T, A>::register_type;
const detail::neon_reducer_dispatcher::unary dispatcher = {
@@ -936,37 +936,78 @@ namespace xsimd
**********/
template <class A>
- inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& a, batch<double, A> const& b, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& a, batch<double, A> const& b, requires_arch<neon64>) noexcept
{
return vbslq_f64(cond, a, b);
}
template <class A, bool... b>
- inline batch<double, A> select(batch_bool_constant<batch<double, A>, b...> const&,
- batch<double, A> const& true_br,
- batch<double, A> const& false_br,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> select(batch_bool_constant<double, A, b...> const&,
+ batch<double, A> const& true_br,
+ batch<double, A> const& false_br,
+ requires_arch<neon64>) noexcept
{
return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
}
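The constant-mask select above simply expands the compile-time bool pack b... into a runtime batch_bool and forwards to the dynamic overload. A small usage sketch, assuming the xsimd headers, an aarch64 (neon64) target, and that the public xsimd::select overload for batch_bool_constant forwards to this kernel:

    #include <xsimd/xsimd.hpp>

    // Lane 0 is taken from a, lane 1 from b (a float64x2_t batch has two lanes).
    xsimd::batch<double, xsimd::neon64> blend(xsimd::batch<double, xsimd::neon64> const& a,
                                              xsimd::batch<double, xsimd::neon64> const& b)
    {
        auto mask = xsimd::batch_bool_constant<double, xsimd::neon64, true, false> {};
        return xsimd::select(mask, a, b);
    }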
/**********
* zip_lo *
**********/
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_u8(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_s8(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_u16(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_s16(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_u32(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_s32(lhs, rhs);
+ }
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vzip1q_u64(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vzip1q_s64(lhs, rhs);
}
template <class A>
- inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_f32(lhs, rhs);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vzip1q_f64(lhs, rhs);
}
@@ -975,20 +1016,62 @@ namespace xsimd
* zip_hi *
**********/
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_u8(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_s8(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_u16(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_s16(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_u32(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_s32(lhs, rhs);
+ }
+
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vzip2q_u64(lhs, rhs);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vzip2q_s64(lhs, rhs);
}
template <class A>
- inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_f32(lhs, rhs);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vzip2q_f64(lhs, rhs);
}
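The new zip_lo/zip_hi overloads map one-to-one onto vzip1q_*/vzip2q_*: zip_lo interleaves the lower halves of the two inputs lane by lane, zip_hi the upper halves. A scalar model of that lane pattern (names like zip_lo_ref are illustrative only, not part of xsimd):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Scalar model of the lane pattern produced by vzip1q_* / vzip2q_*.
    template <std::size_t N>
    std::array<std::uint32_t, N> zip_lo_ref(std::array<std::uint32_t, N> const& l,
                                            std::array<std::uint32_t, N> const& r)
    {
        std::array<std::uint32_t, N> out {};
        for (std::size_t i = 0; i < N / 2; ++i)
        {
            out[2 * i] = l[i];         // low half of lhs
            out[2 * i + 1] = r[i];     // low half of rhs
        }
        return out;
    }

    template <std::size_t N>
    std::array<std::uint32_t, N> zip_hi_ref(std::array<std::uint32_t, N> const& l,
                                            std::array<std::uint32_t, N> const& r)
    {
        std::array<std::uint32_t, N> out {};
        for (std::size_t i = 0; i < N / 2; ++i)
        {
            out[2 * i] = l[N / 2 + i];     // high half of lhs
            out[2 * i + 1] = r[N / 2 + i]; // high half of rhs
        }
        return out;
    }

    int main()
    {
        std::array<std::uint32_t, 4> l { 0, 1, 2, 3 }, r { 10, 11, 12, 13 };
        auto lo = zip_lo_ref(l, r); // {0, 10, 1, 11}
        auto hi = zip_hi_ref(l, r); // {2, 12, 3, 13}
        for (auto v : lo) std::printf("%u ", v);
        std::printf("| ");
        for (auto v : hi) std::printf("%u ", v);
        std::printf("\n");
    }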
@@ -1000,8 +1083,8 @@ namespace xsimd
namespace detail
{
template <class A, size_t I, size_t... Is>
- inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n,
- ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n,
+ ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -1015,7 +1098,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n, requires_arch<neon64>) noexcept
{
constexpr std::size_t size = batch<double, A>::size;
assert(n < size && "index in bounds");
@@ -1027,25 +1110,25 @@ namespace xsimd
******************/
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
{
return bitwise_rshift<A>(lhs, n, neon {});
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept
{
return vshlq_u64(lhs, vnegq_s64(rhs));
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
{
return bitwise_rshift<A>(lhs, n, neon {});
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
return vshlq_s64(lhs, vnegq_s64(rhs));
}
@@ -1054,17 +1137,17 @@ namespace xsimd
* bitwise_cast *
****************/
-#define WRAP_CAST(SUFFIX, TYPE) \
- namespace wrap \
- { \
- inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept \
- { \
- return ::vreinterpretq_f64_##SUFFIX(a); \
- } \
- inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \
- { \
- return ::vreinterpretq_##SUFFIX##_f64(a); \
- } \
+#define WRAP_CAST(SUFFIX, TYPE) \
+ namespace wrap \
+ { \
+ XSIMD_INLINE float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept \
+ { \
+ return ::vreinterpretq_f64_##SUFFIX(a); \
+ } \
+ XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_f64(a); \
+ } \
}
WRAP_CAST(u8, uint8x16_t)
@@ -1080,7 +1163,7 @@ namespace xsimd
#undef WRAP_CAST
template <class A, class T>
- inline batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
{
using caster_type = detail::bitwise_caster_impl<float64x2_t,
uint8x16_t, int8x16_t,
@@ -1116,7 +1199,7 @@ namespace xsimd
}
template <class A, class R>
- inline batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept
{
using caster_type = detail::bitwise_caster_neon64<float64x2_t,
uint8x16_t, int8x16_t,
@@ -1135,7 +1218,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> bitwise_cast(batch<double, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<double, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
{
return arg;
}
@@ -1145,7 +1228,7 @@ namespace xsimd
*********/
template <class A>
- inline batch_bool<double, A> isnan(batch<double, A> const& arg, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& arg, requires_arch<neon64>) noexcept
{
return !(arg == arg);
}
@@ -1154,13 +1237,13 @@ namespace xsimd
* rotate_right *
****************/
template <size_t N, class A>
- inline batch<double, A> rotate_right(batch<double, A> const& a, requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> rotate_right(batch<double, A> const& a, requires_arch<neon64>) noexcept
{
return vextq_f64(a, a, N);
}
}
- template <class batch_type, typename batch_type::value_type... Values>
+ template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
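This forward declaration reflects the batch_constant signature change used throughout the hunks below: the first parameter is now the element type and the second the architecture, instead of a single full batch type. A sketch of the new spelling, assuming the xsimd headers and an aarch64 (neon64) target; the conversion to a runtime batch mirrors the kernels' own batch<uint8_t, A>(idx) usage:

    #include <cstdint>
    #include <xsimd/xsimd.hpp>

    using A = xsimd::neon64;

    // Old spelling (first template parameter was a full batch type):
    //   xsimd::batch_constant<xsimd::batch<std::uint32_t, A>, 3, 2, 1, 0>
    // New spelling (element type and architecture are separate parameters):
    using reverse_idx = xsimd::batch_constant<std::uint32_t, A, 3, 2, 1, 0>;

    // A batch_constant still converts to a runtime batch when needed.
    xsimd::batch<std::uint32_t, A> materialize()
    {
        return xsimd::batch<std::uint32_t, A>(reverse_idx {});
    }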
@@ -1169,23 +1252,23 @@ namespace xsimd
* swizzle (dynamic) *
*********************/
template <class A>
- inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> idx,
+ requires_arch<neon64>) noexcept
{
return vqtbl1q_u8(self, idx);
}
template <class A>
- inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> idx,
+ requires_arch<neon64>) noexcept
{
return vqtbl1q_s8(self, idx);
}
template <class A>
- inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
- batch<uint16_t, A> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
+ batch<uint16_t, A> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
using index_type = batch<uint8_t, A>;
@@ -1195,17 +1278,17 @@ namespace xsimd
}
template <class A>
- inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
- batch<uint16_t, A> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
+ batch<uint16_t, A> idx,
+ requires_arch<neon64>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), idx, neon64 {}));
}
template <class A>
- inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
- batch<uint32_t, A> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
+ batch<uint32_t, A> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
using index_type = batch<uint8_t, A>;
@@ -1215,17 +1298,17 @@ namespace xsimd
}
template <class A>
- inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
- batch<uint32_t, A> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
+ batch<uint32_t, A> idx,
+ requires_arch<neon64>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), idx, neon64 {}));
}
template <class A>
- inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
- batch<uint64_t, A> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
+ batch<uint64_t, A> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
using index_type = batch<uint8_t, A>;
@@ -1235,25 +1318,25 @@ namespace xsimd
}
template <class A>
- inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
- batch<uint64_t, A> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
+ batch<uint64_t, A> idx,
+ requires_arch<neon64>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), idx, neon64 {}));
}
template <class A>
- inline batch<float, A> swizzle(batch<float, A> const& self,
- batch<uint32_t, A> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+ batch<uint32_t, A> idx,
+ requires_arch<neon64>) noexcept
{
return bitwise_cast<float>(swizzle(bitwise_cast<uint32_t>(self), idx, neon64 {}));
}
template <class A>
- inline batch<double, A> swizzle(batch<double, A> const& self,
- batch<uint64_t, A> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self,
+ batch<uint64_t, A> idx,
+ requires_arch<neon64>) noexcept
{
return bitwise_cast<double>(swizzle(bitwise_cast<uint64_t>(self), idx, neon64 {}));
}
@@ -1271,43 +1354,41 @@ namespace xsimd
template <class CB1, class CB2, class IS>
struct index_burst_impl;
- template <class B1, class B2, typename B2::value_type... V,
- typename B2::value_type... incr>
- struct index_burst_impl<batch_constant<B1>, batch_constant<B2, V...>,
- integer_sequence<typename B2::value_type, incr...>>
+ template <typename T1, class A, typename T2, T2... V,
+ T2... incr>
+ struct index_burst_impl<batch_constant<T1, A>, batch_constant<T2, A, V...>,
+ integer_sequence<T2, incr...>>
{
- using type = batch_constant<B2, V...>;
+ using type = batch_constant<T2, A, V...>;
};
- template <class B1, typename B1::value_type V0, typename B1::value_type... V1,
- class B2, typename B2::value_type... V2,
- typename B2::value_type... incr>
- struct index_burst_impl<batch_constant<B1, V0, V1...>, batch_constant<B2, V2...>,
- integer_sequence<typename B2::value_type, incr...>>
+ template <typename T1, class A, T1 V0, T1... V1,
+ typename T2, T2... V2, T2... incr>
+ struct index_burst_impl<batch_constant<T1, A, V0, V1...>, batch_constant<T2, A, V2...>,
+ integer_sequence<T2, incr...>>
{
- using value_type = typename B2::value_type;
- using next_input = batch_constant<B1, V1...>;
- using next_output = batch_constant<B2, V2..., (V0 + incr)...>;
- using type = typename index_burst_impl<next_input, next_output, integer_sequence<value_type, incr...>>::type;
+ using next_input = batch_constant<T1, A, V1...>;
+ using next_output = batch_constant<T2, A, V2..., (V0 + incr)...>;
+ using type = typename index_burst_impl<next_input, next_output, integer_sequence<T2, incr...>>::type;
};
template <class B, class T>
struct index_burst;
- template <class B, typename B::value_type... V, class T>
- struct index_burst<batch_constant<B, V...>, T>
+ template <typename Tp, class A, Tp... V, typename T>
+ struct index_burst<batch_constant<Tp, A, V...>, T>
{
- static constexpr size_t mul = sizeof(typename B::value_type) / sizeof(T);
- using input = batch_constant<B, (mul * V)...>;
- using output = batch_constant<batch<T, typename B::arch_type>>;
+ static constexpr size_t mul = sizeof(Tp) / sizeof(T);
+ using input = batch_constant<Tp, A, (mul * V)...>;
+ using output = batch_constant<T, A>;
using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type;
};
- template <class B, class T>
+ template <class B, typename T>
using index_burst_t = typename index_burst<B, T>::type;
- template <class T, class B>
- inline index_burst_t<B, T> burst_index(B)
+ template <typename T, class B>
+ XSIMD_INLINE index_burst_t<B, T> burst_index(B)
{
return index_burst_t<B, T>();
}
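index_burst/burst_index expand a wide index constant into the byte indices that the vqtbl1q_u8 table lookup needs: each index V of a type that is mul times wider than the target becomes the run mul*V, mul*V + 1, ..., mul*V + mul - 1. A scalar model of that expansion for uint16_t indices (burst_u16_to_u8 is an illustrative name, not part of xsimd):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Each 16-bit lane index V expands into sizeof(uint16_t)/sizeof(uint8_t) = 2
    // byte indices {2*V, 2*V + 1}, which is what the byte-wise table lookup needs.
    std::array<std::uint8_t, 16> burst_u16_to_u8(std::array<std::uint16_t, 8> const& idx)
    {
        std::array<std::uint8_t, 16> out {};
        for (std::size_t i = 0; i < idx.size(); ++i)
        {
            out[2 * i] = static_cast<std::uint8_t>(2 * idx[i]);
            out[2 * i + 1] = static_cast<std::uint8_t>(2 * idx[i] + 1);
        }
        return out;
    }

    int main()
    {
        auto bytes = burst_u16_to_u8({ 1, 0, 3, 2, 5, 4, 7, 6 });
        for (auto b : bytes)
            std::printf("%u ", b); // 2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13
        std::printf("\n");
    }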
@@ -1315,106 +1396,106 @@ namespace xsimd
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
- inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self,
- batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self,
+ batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
+ requires_arch<neon64>) noexcept
{
return vqtbl1q_u8(self, batch<uint8_t, A>(idx));
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
- inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self,
- batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self,
+ batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
+ requires_arch<neon64>) noexcept
{
return vqtbl1q_s8(self, batch<uint8_t, A>(idx));
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
- inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
- batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
+ batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
return vreinterpretq_u16_u8(swizzle<A>(batch_type(vreinterpretq_u8_u16(self)), detail::burst_index<uint8_t>(idx), A()));
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
- inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
- batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
+ batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<int8_t, A>;
return vreinterpretq_s16_s8(swizzle<A>(batch_type(vreinterpretq_s8_s16(self)), detail::burst_index<uint8_t>(idx), A()));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
- inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
- batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
return vreinterpretq_u32_u8(swizzle<A>(batch_type(vreinterpretq_u8_u32(self)), detail::burst_index<uint8_t>(idx), A()));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
- inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
- batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<int8_t, A>;
return vreinterpretq_s32_s8(swizzle<A>(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index<uint8_t>(idx), A()));
}
template <class A, uint64_t V0, uint64_t V1>
- inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
- batch_constant<batch<uint64_t, A>, V0, V1> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
+ batch_constant<uint64_t, A, V0, V1> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
return vreinterpretq_u64_u8(swizzle<A>(batch_type(vreinterpretq_u8_u64(self)), detail::burst_index<uint8_t>(idx), A()));
}
template <class A, uint64_t V0, uint64_t V1>
- inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
- batch_constant<batch<uint64_t, A>, V0, V1> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
+ batch_constant<uint64_t, A, V0, V1> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<int8_t, A>;
return vreinterpretq_s64_s8(swizzle<A>(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index<uint8_t>(idx), A()));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
- inline batch<float, A> swizzle(batch<float, A> const& self,
- batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
return vreinterpretq_f32_u8(swizzle<A>(batch_type(vreinterpretq_u8_f32(self)), detail::burst_index<uint8_t>(idx), A()));
}
template <class A, uint64_t V0, uint64_t V1>
- inline batch<double, A> swizzle(batch<double, A> const& self,
- batch_constant<batch<uint64_t, A>, V0, V1> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self,
+ batch_constant<uint64_t, A, V0, V1> idx,
+ requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
return vreinterpretq_f64_u8(swizzle<A>(batch_type(vreinterpretq_u8_f64(self)), detail::burst_index<uint8_t>(idx), A()));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
- inline batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self,
- batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
{
return batch<std::complex<float>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
}
template <class A, uint64_t V0, uint64_t V1>
- inline batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self,
- batch_constant<batch<uint64_t, A>, V0, V1> idx,
- requires_arch<neon64>) noexcept
+ XSIMD_INLINE batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self,
+ batch_constant<uint64_t, A, V0, V1> idx,
+ requires_arch<neon64>) noexcept
{
return batch<std::complex<double>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
}
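A usage sketch for the constant-index swizzle overloads above, assuming the xsimd headers, an aarch64 (neon64) target, and that the public xsimd::swizzle overload taking a batch_constant forwards to these kernels; it reverses the four float lanes:

    #include <cstdint>
    #include <xsimd/xsimd.hpp>

    using A = xsimd::neon64;

    // Reverse the four float lanes with the constant-index swizzle overload.
    xsimd::batch<float, A> reverse(xsimd::batch<float, A> const& x)
    {
        auto idx = xsimd::batch_constant<std::uint32_t, A, 3, 2, 1, 0> {};
        return xsimd::swizzle(x, idx);
    }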
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_rvv.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_rvv.hpp
index 98d1de9ce34..75f1145cdb8 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_rvv.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_rvv.hpp
@@ -284,7 +284,7 @@
namespace xsimd
{
- template <class batch_type, typename batch_type::value_type... Values>
+ template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@@ -384,7 +384,7 @@ namespace xsimd
}
template <class A, class T, class U = as_unsigned_integer_t<T>>
- inline batch<U, A> rvv_to_unsigned_batch(batch<T, A> const& arg) noexcept
+ XSIMD_INLINE batch<U, A> rvv_to_unsigned_batch(batch<T, A> const& arg) noexcept
{
return rvvreinterpret<U>(arg.data);
}
@@ -413,18 +413,18 @@ namespace xsimd
, size_t(bvec));
template <class T, size_t Width>
- inline rvv_bool_t<T, Width> pmask8(uint8_t mask) noexcept
+ XSIMD_INLINE rvv_bool_t<T, Width> pmask8(uint8_t mask) noexcept
{
return rvv_bool_t<T, Width>(mask);
}
template <class T, size_t Width>
- inline rvv_bool_t<T, Width> pmask(uint64_t mask) noexcept
+ XSIMD_INLINE rvv_bool_t<T, Width> pmask(uint64_t mask) noexcept
{
return rvv_bool_t<T, Width>(mask);
}
template <class A, class T, size_t offset = 0, int shift = 0>
- inline rvv_reg_t<T, A::width> vindex() noexcept
+ XSIMD_INLINE rvv_reg_t<T, A::width> vindex() noexcept
{
auto index = rvvid(T {});
if (shift < 0)
@@ -462,7 +462,7 @@ namespace xsimd
namespace detail
{
template <class T, size_t Width>
- inline detail::rvv_reg_t<T, Width> broadcast(T arg) noexcept
+ XSIMD_INLINE detail::rvv_reg_t<T, Width> broadcast(T arg) noexcept
{
// A bit of a dance, here, because rvvmv_splat has no other
// argument from which to deduce type, and T=char is not
@@ -475,7 +475,7 @@ namespace xsimd
// broadcast
template <class A, class T>
- inline batch<T, A> broadcast(T arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<rvv>) noexcept
{
return detail::broadcast<T, A::width>(arg);
}
@@ -491,13 +491,13 @@ namespace xsimd
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<rvv>) noexcept
{
return detail::rvvle(reinterpret_cast<detail::rvv_fix_char_t<T> const*>(src));
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<rvv>) noexcept
{
return load_aligned<A>(src, convert<T>(), rvv {});
}
@@ -506,14 +506,14 @@ namespace xsimd
namespace detail
{
template <class T, size_t W, typename std::enable_if<W >= types::detail::rvv_width_m1, int>::type = 0>
- inline rvv_reg_t<T, W * 2> rvvabut(rvv_reg_t<T, W> const& lo, rvv_reg_t<T, W> const& hi) noexcept
+ XSIMD_INLINE rvv_reg_t<T, W * 2> rvvabut(rvv_reg_t<T, W> const& lo, rvv_reg_t<T, W> const& hi) noexcept
{
typename rvv_reg_t<T, W * 2>::register_type tmp;
tmp = __riscv_vset(tmp, 0, lo);
return __riscv_vset(tmp, 1, hi);
}
- template <class T, size_t W, typename std::enable_if<W<types::detail::rvv_width_m1, int>::type = 0> inline rvv_reg_t<T, W * 2> rvvabut(rvv_reg_t<T, W> const& lo, rvv_reg_t<T, W> const& hi) noexcept
+ template <class T, size_t W, typename std::enable_if<W<types::detail::rvv_width_m1, int>::type = 0> XSIMD_INLINE rvv_reg_t<T, W * 2> rvvabut(rvv_reg_t<T, W> const& lo, rvv_reg_t<T, W> const& hi) noexcept
{
return __riscv_vslideup(lo, hi, lo.vl, lo.vl * 2);
}
@@ -544,7 +544,7 @@ namespace xsimd
}
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<std::complex<T>, A> load_complex(batch<T, A> const& lo, batch<T, A> const& hi, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> load_complex(batch<T, A> const& lo, batch<T, A> const& hi, requires_arch<rvv>) noexcept
{
const auto real_index = vindex<A, as_unsigned_integer_t<T>, 0, 1>();
const auto imag_index = vindex<A, as_unsigned_integer_t<T>, 1, 1>();
@@ -561,13 +561,13 @@ namespace xsimd
*********/
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<rvv>) noexcept
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<rvv>) noexcept
{
detail::rvvse(reinterpret_cast<detail::rvv_fix_char_t<T>*>(dst), src);
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<rvv>) noexcept
+ XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<rvv>) noexcept
{
store_aligned<A>(dst, src, rvv {});
}
@@ -590,7 +590,7 @@ namespace xsimd
// scatter
template <class A, class T, class U, detail::rvv_enable_sg_t<T, U> = 0>
- inline void scatter(batch<T, A> const& vals, T* dst, batch<U, A> const& index, kernel::requires_arch<rvv>) noexcept
+ XSIMD_INLINE void scatter(batch<T, A> const& vals, T* dst, batch<U, A> const& index, kernel::requires_arch<rvv>) noexcept
{
using UU = as_unsigned_integer_t<U>;
const auto uindex = detail::rvv_to_unsigned_batch(index);
@@ -602,7 +602,7 @@ namespace xsimd
// gather
template <class A, class T, class U, detail::rvv_enable_sg_t<T, U> = 0>
- inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index, kernel::requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index, kernel::requires_arch<rvv>) noexcept
{
using UU = as_unsigned_integer_t<U>;
const auto uindex = detail::rvv_to_unsigned_batch(index);
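The gather/scatter kernels convert the index batch to its unsigned counterpart and then use the target's indexed load/store intrinsics; per lane i, gather reads src[index[i]] and scatter writes vals[i] to dst[index[i]]. A scalar model of that contract (gather_ref/scatter_ref are illustrative names):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Per-lane contract of gather: out[i] = src[index[i]].
    template <class T, std::size_t N>
    std::array<T, N> gather_ref(T const* src, std::array<std::uint32_t, N> const& index)
    {
        std::array<T, N> out {};
        for (std::size_t i = 0; i < N; ++i)
            out[i] = src[index[i]];
        return out;
    }

    // Per-lane contract of scatter: dst[index[i]] = vals[i].
    template <class T, std::size_t N>
    void scatter_ref(std::array<T, N> const& vals, T* dst,
                     std::array<std::uint32_t, N> const& index)
    {
        for (std::size_t i = 0; i < N; ++i)
            dst[index[i]] = vals[i];
    }

    int main()
    {
        const float src[4] = { 0.f, 10.f, 20.f, 30.f };
        auto g = gather_ref<float, 4>(src, { 3, 1, 2, 0 }); // {30, 10, 20, 0}
        float dst[4] = {};
        scatter_ref<float, 4>(g, dst, { 3, 1, 2, 0 });      // dst == src
        std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
    }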
@@ -698,63 +698,63 @@ namespace xsimd
// add
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvadd(lhs, rhs);
}
// sadd
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvsadd(lhs, rhs);
}
// sub
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvsub(lhs, rhs);
}
// ssub
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvssub(lhs, rhs);
}
// mul
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmul(lhs, rhs);
}
// div
template <class A, class T, typename detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvdiv(lhs, rhs);
}
// max
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmax(lhs, rhs);
}
// min
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmin(lhs, rhs);
}
// neg
template <class A, class T, detail::rvv_enable_unsigned_int_t<T> = 0>
- inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
using S = as_signed_integer_t<T>;
const auto as_signed = detail::rvvreinterpret<S>(arg);
@@ -763,27 +763,27 @@ namespace xsimd
}
template <class A, class T, detail::rvv_enable_signed_int_or_floating_point_t<T> = 0>
- inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
return detail::rvvneg(arg);
}
// abs
template <class A, class T, detail::rvv_enable_unsigned_int_t<T> = 0>
- inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
return arg;
}
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
return detail::rvvabs(arg);
}
// fma: x * y + z
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
{
// also detail::rvvmadd(x, y, z);
return detail::rvvmacc(z, x, y);
@@ -791,7 +791,7 @@ namespace xsimd
// fnma: z - x * y
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
{
// also detail::rvvnmsub(x, y, z);
return detail::rvvnmsac(z, x, y);
@@ -799,7 +799,7 @@ namespace xsimd
// fms: x * y - z
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
{
// also vfmsac(z, x, y), but lacking integer version
// also vfmsub(x, y, z), but lacking integer version
@@ -808,7 +808,7 @@ namespace xsimd
// fnms: - x * y - z
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
{
// also vfnmacc(z, x, y), but lacking integer version
// also vfnmadd(x, y, z), but lacking integer version
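The comments above fix the sign conventions these kernels implement: fma is x*y + z, fnma is z - x*y, fms is x*y - z, and fnms is -x*y - z. A scalar reference, useful as a quick sanity check against the vector results (the *_ref names are illustrative only):

    #include <cstdio>

    // Scalar reference for the four fused-multiply conventions named above.
    double fma_ref(double x, double y, double z) { return x * y + z; }
    double fnma_ref(double x, double y, double z) { return z - x * y; }
    double fms_ref(double x, double y, double z) { return x * y - z; }
    double fnms_ref(double x, double y, double z) { return -x * y - z; }

    int main()
    {
        std::printf("%g %g %g %g\n",
                    fma_ref(2, 3, 1),   //  7
                    fnma_ref(2, 3, 1),  // -5
                    fms_ref(2, 3, 1),   //  5
                    fnms_ref(2, 3, 1)); // -7
    }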
@@ -835,13 +835,13 @@ namespace xsimd
// bitwise_and
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvand(lhs, rhs);
}
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
@@ -850,21 +850,21 @@ namespace xsimd
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmand(lhs, rhs);
}
// bitwise_andnot
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
const auto not_rhs = detail::rvvnot(rhs);
return detail::rvvand(lhs, not_rhs);
}
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
@@ -874,20 +874,20 @@ namespace xsimd
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmandn(lhs, rhs);
}
// bitwise_or
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvor(lhs, rhs);
}
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
@@ -896,20 +896,20 @@ namespace xsimd
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmor(lhs, rhs);
}
// bitwise_xor
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvxor(lhs, rhs);
}
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
@@ -918,20 +918,20 @@ namespace xsimd
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmxor(lhs, rhs);
}
// bitwise_not
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
return detail::rvvnot(arg);
}
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
const auto arg_bits = detail::rvv_to_unsigned_batch(arg);
const auto result_bits = detail::rvvnot(arg_bits);
@@ -939,7 +939,7 @@ namespace xsimd
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
{
return detail::rvvmnot(arg);
}
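The floating-point bitwise kernels all follow the same pattern: reinterpret the operands as unsigned integer bits, apply the integer operation, and reinterpret the result back. A scalar model using std::bit_cast (requires C++20; bitwise_and_f64 is an illustrative name):

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    // Reinterpret to unsigned bits, do the integer op, reinterpret back.
    double bitwise_and_f64(double a, double b)
    {
        const auto bits = std::bit_cast<std::uint64_t>(a) & std::bit_cast<std::uint64_t>(b);
        return std::bit_cast<double>(bits);
    }

    int main()
    {
        // ANDing a value with itself is the identity, a quick sanity check.
        std::printf("%g\n", bitwise_and_f64(1.5, 1.5)); // 1.5
    }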
@@ -962,7 +962,7 @@ namespace xsimd
// bitwise_lshift
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& arg, int n, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& arg, int n, requires_arch<rvv>) noexcept
{
constexpr size_t size = sizeof(typename batch<T, A>::value_type) * 8;
assert(0 <= n && static_cast<size_t>(n) < size && "index in bounds");
@@ -970,14 +970,14 @@ namespace xsimd
}
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvsll(lhs, detail::rvv_to_unsigned_batch<A, T>(rhs));
}
// bitwise_rshift
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<rvv>) noexcept
{
constexpr size_t size = sizeof(typename batch<T, A>::value_type) * 8;
assert(0 <= n && static_cast<size_t>(n) < size && "index in bounds");
@@ -985,7 +985,7 @@ namespace xsimd
}
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvsr(lhs, detail::rvv_to_unsigned_batch<A, T>(rhs));
}
@@ -1019,14 +1019,14 @@ namespace xsimd
(__riscv_vfslide1down), , vec(vec, T))
template <class A, class T>
- inline T reduce_scalar(rvv_reg_t<T, types::detail::rvv_width_m1> const& arg)
+ XSIMD_INLINE T reduce_scalar(rvv_reg_t<T, types::detail::rvv_width_m1> const& arg)
{
return detail::rvvmv_lane0(rvv_reg_t<T, A::width>(arg.get_bytes(), types::detail::XSIMD_RVV_BITCAST));
}
}
// reduce_add
template <class A, class T, class V = typename batch<T, A>::value_type, detail::rvv_enable_all_t<T> = 0>
- inline V reduce_add(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE V reduce_add(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
const auto zero = detail::broadcast<T, types::detail::rvv_width_m1>(T(0));
const auto r = detail::rvvredsum(arg, zero);
@@ -1035,7 +1035,7 @@ namespace xsimd
// reduce_max
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline T reduce_max(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE T reduce_max(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
const auto lowest = detail::broadcast<T, types::detail::rvv_width_m1>(std::numeric_limits<T>::lowest());
const auto r = detail::rvvredmax(arg, lowest);
@@ -1044,7 +1044,7 @@ namespace xsimd
// reduce_min
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline T reduce_min(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE T reduce_min(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
const auto max = detail::broadcast<T, types::detail::rvv_width_m1>(std::numeric_limits<T>::max());
const auto r = detail::rvvredmin(arg, max);
@@ -1053,7 +1053,7 @@ namespace xsimd
// haddp
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> haddp(const batch<T, A>* row, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> haddp(const batch<T, A>* row, requires_arch<rvv>) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
T sums[size];
@@ -1071,13 +1071,13 @@ namespace xsimd
// eq
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmseq(lhs, rhs);
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
{
const auto neq_result = detail::rvvmxor(lhs, rhs);
return detail::rvvmnot(neq_result);
@@ -1085,41 +1085,41 @@ namespace xsimd
// neq
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> neq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmsne(lhs, rhs);
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmxor(lhs, rhs);
}
// lt
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmslt(lhs, rhs);
}
// le
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmsle(lhs, rhs);
}
// gt
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmsgt(lhs, rhs);
}
// ge
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
return detail::rvvmsge(lhs, rhs);
}
@@ -1133,7 +1133,7 @@ namespace xsimd
}
// compress
template <class A, class T>
- inline batch<T, A> compress(batch<T, A> const& x, batch_bool<T, A> const& mask, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> compress(batch<T, A> const& x, batch_bool<T, A> const& mask, requires_arch<rvv>) noexcept
{
return detail::rvvcompress(x, mask);
}
@@ -1150,7 +1150,7 @@ namespace xsimd
// swizzle
template <class A, class T, class I, I... idx>
- inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<batch<I, A>, idx...>, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<I, A, idx...>, requires_arch<rvv>) noexcept
{
static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices");
const batch<I, A> indices { idx... };
@@ -1158,12 +1158,12 @@ namespace xsimd
}
template <class A, class T, class I, I... idx>
- inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self,
- batch_constant<batch<I, A>, idx...>,
- requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self,
+ batch_constant<I, A, idx...>,
+ requires_arch<rvv>) noexcept
{
- const auto real = swizzle(self.real(), batch_constant<batch<I, A>, idx...> {}, rvv {});
- const auto imag = swizzle(self.imag(), batch_constant<batch<I, A>, idx...> {}, rvv {});
+ const auto real = swizzle(self.real(), batch_constant<I, A, idx...> {}, rvv {});
+ const auto imag = swizzle(self.imag(), batch_constant<I, A, idx...> {}, rvv {});
return batch<std::complex<T>>(real, imag);
}
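
    This hunk tracks the batch_constant API change: the index pack is now carried as batch_constant<I, A, idx...> (value type, architecture, values) rather than wrapping a batch type. As a standalone illustration of compile-time index packs driving a permutation, the sketch below swizzles a std::array with indices held as non-type template parameters; the index_constant name is made up for the example and nothing here uses xsimd itself.

    #include <array>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    // Hypothetical analog of batch_constant<I, A, idx...>: the lane indices are
    // non-type template parameters, so the permutation is fixed at compile time.
    template <class I, I... idx>
    struct index_constant
    {
    };

    template <class T, std::size_t N, class I, I... idx>
    std::array<T, sizeof...(idx)> swizzle_ref(std::array<T, N> const& src, index_constant<I, idx...>)
    {
        static_assert(sizeof...(idx) == N, "invalid swizzle indices");
        return { src[static_cast<std::size_t>(idx)]... };
    }

    int main()
    {
        std::array<float, 4> v { 1.f, 2.f, 3.f, 4.f };
        // Reverse the lanes; the index pack plays the role of batch_constant.
        auto r = swizzle_ref(v, index_constant<uint32_t, 3, 2, 1, 0> {});
        for (float x : r)
            std::cout << x << ' '; // 4 3 2 1
        std::cout << '\n';
    }
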
@@ -1174,7 +1174,7 @@ namespace xsimd
// extract_pair
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, size_t n, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, size_t n, requires_arch<rvv>) noexcept
{
const auto tmp = detail::rvvslidedown(rhs, n);
return detail::rvvslideup(tmp, lhs, lhs.size - n);
@@ -1182,20 +1182,20 @@ namespace xsimd
// select
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<rvv>) noexcept
{
return detail::rvvmerge(b, a, cond);
}
template <class A, class T, bool... b>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<rvv>) noexcept
{
return select(batch_bool<T, A> { b... }, true_br, false_br, rvv {});
}
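
    The constant-mask select above undergoes the matching signature change, batch_bool_constant<T, A, b...>. A minimal standalone sketch of a compile-time blend, assuming nothing from xsimd, looks like this; each lane picks the true or false source based on a bool carried in the template parameter pack.

    #include <array>
    #include <cstddef>
    #include <iostream>

    // Stand-in for batch_bool_constant<T, A, b...>: one compile-time bool per lane.
    template <bool... b>
    struct mask_constant
    {
    };

    template <class T, std::size_t N, bool... b>
    std::array<T, sizeof...(b)> select_ref(mask_constant<b...>,
                                           std::array<T, N> const& true_br,
                                           std::array<T, N> const& false_br)
    {
        static_assert(sizeof...(b) == N, "mask size must match lane count");
        std::array<T, N> out {};
        std::size_t i = 0;
        // Walk the bool pack lane by lane; exactly one branch is taken per lane.
        for (bool cond : { b... })
        {
            out[i] = cond ? true_br[i] : false_br[i];
            ++i;
        }
        return out;
    }

    int main()
    {
        std::array<int, 4> a { 1, 2, 3, 4 };
        std::array<int, 4> c { 10, 20, 30, 40 };
        auto r = select_ref(mask_constant<true, false, true, false> {}, a, c);
        for (int x : r)
            std::cout << x << ' '; // 1 20 3 40
        std::cout << '\n';
    }
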
// zip_lo
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
const auto index = detail::vindex<A, as_unsigned_integer_t<T>, 0, -1>();
const auto mask = detail::pmask8<T, A::width>(0xaa);
@@ -1206,7 +1206,7 @@ namespace xsimd
// zip_hi
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
const auto index = detail::vindex<A, as_unsigned_integer_t<T>, batch<T, A>::size / 2, -1>();
const auto mask = detail::pmask8<T, A::width>(0xaa);
@@ -1217,7 +1217,7 @@ namespace xsimd
// store_complex
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<rvv>) noexcept
+ XSIMD_INLINE void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<rvv>) noexcept
{
const auto lo = zip_lo(src.real(), src.imag());
const auto hi = zip_hi(src.real(), src.imag());
@@ -1227,7 +1227,7 @@ namespace xsimd
}
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<rvv>) noexcept
+ XSIMD_INLINE void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<rvv>) noexcept
{
store_complex_aligned(dst, src, rvv {});
}
@@ -1245,7 +1245,7 @@ namespace xsimd
// rsqrt
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
auto approx = detail::rvvfrsqrt7(arg);
approx = approx * (1.5 - (0.5 * arg * approx * approx));
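
    The rsqrt kernel refines the hardware 7-bit estimate with one Newton-Raphson step, y <- y * (1.5 - 0.5 * x * y * y). The standalone sketch below uses the well-known bit-level initial guess as a stand-in for rvvfrsqrt7 (an assumption made only for this example) and applies the same refinement; it is not part of the patch.

    #include <cmath>
    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Rough 1/sqrt(x) estimate used as a stand-in for the hardware 7-bit
    // approximation; the classic bit trick here is only an initial guess.
    float rsqrt_estimate(float x)
    {
        uint32_t i;
        std::memcpy(&i, &x, sizeof(float));
        i = 0x5f3759dfu - (i >> 1);
        float y;
        std::memcpy(&y, &i, sizeof(float));
        return y;
    }

    // One Newton-Raphson refinement, same formula as in the kernel above.
    float rsqrt_refined(float x)
    {
        float y = rsqrt_estimate(x);
        y = y * (1.5f - 0.5f * x * y * y);
        return y;
    }

    int main()
    {
        for (float x : { 0.25f, 2.0f, 9.0f, 1234.5f })
            std::cout << x << ": approx " << rsqrt_refined(x)
                      << "  exact " << 1.0f / std::sqrt(x) << '\n';
    }
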
@@ -1254,14 +1254,14 @@ namespace xsimd
// sqrt
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
return detail::rvvfsqrt(arg);
}
// reciprocal
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<rvv>) noexcept
{
return detail::rvvfrec7(arg);
}
@@ -1293,12 +1293,12 @@ namespace xsimd
using rvv_enable_itof_t = typename std::enable_if<(sizeof(T) == sizeof(U) && !std::is_floating_point<T>::value && std::is_floating_point<U>::value), int>::type;
template <class A, class T, class U, rvv_enable_ftoi_t<T, U> = 0>
- inline batch<U, A> fast_cast(batch<T, A> const& arg, batch<U, A> const&, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<U, A> fast_cast(batch<T, A> const& arg, batch<U, A> const&, requires_arch<rvv>) noexcept
{
return rvvfcvt_rtz(U {}, arg);
}
template <class A, class T, class U, rvv_enable_itof_t<T, U> = 0>
- inline batch<U, A> fast_cast(batch<T, A> const& arg, batch<U, A> const&, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<U, A> fast_cast(batch<T, A> const& arg, batch<U, A> const&, requires_arch<rvv>) noexcept
{
return rvvfcvt_f(arg);
}
@@ -1310,22 +1310,22 @@ namespace xsimd
// set
template <class A, class T, class... Args>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<rvv>, Args... args) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<rvv>, Args... args) noexcept
{
const std::array<T, batch<T, A>::size> tmp { args... };
return load_unaligned<A>(tmp.data(), convert<T>(), rvv {});
}
template <class A, class T, class... Args>
- inline batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<rvv>,
- Args... args_complex) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<rvv>,
+ Args... args_complex) noexcept
{
return batch<std::complex<T>>(set(batch<T, rvv> {}, rvv {}, args_complex.real()...),
set(batch<T, rvv> {}, rvv {}, args_complex.imag()...));
}
template <class A, class T, class... Args>
- inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<rvv>, Args... args) noexcept
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<rvv>, Args... args) noexcept
{
using U = as_unsigned_integer_t<T>;
const auto values = set(batch<U, rvv> {}, rvv {}, static_cast<U>(args)...);
@@ -1336,7 +1336,7 @@ namespace xsimd
// insert
template <class A, class T, size_t I, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<rvv>) noexcept
{
const auto mask = detail::pmask<T, A::width>(uint64_t(1) << I);
return detail::rvvmerge_splat(arg, val, mask);
@@ -1344,14 +1344,14 @@ namespace xsimd
// get
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline T get(batch<T, A> const& arg, size_t i, requires_arch<rvv>) noexcept
+ XSIMD_INLINE T get(batch<T, A> const& arg, size_t i, requires_arch<rvv>) noexcept
{
const auto tmp = detail::rvvslidedown(arg, i);
return detail::rvvmv_lane0(tmp);
}
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline std::complex<T> get(batch<std::complex<T>, A> const& arg, size_t i, requires_arch<rvv>) noexcept
+ XSIMD_INLINE std::complex<T> get(batch<std::complex<T>, A> const& arg, size_t i, requires_arch<rvv>) noexcept
{
const auto tmpr = detail::rvvslidedown(arg.real(), i);
const auto tmpi = detail::rvvslidedown(arg.imag(), i);
@@ -1360,28 +1360,28 @@ namespace xsimd
// all
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline bool all(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
{
return detail::rvvcpop(arg) == batch_bool<T, A>::size;
}
// any
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline bool any(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
{
return detail::rvvcpop(arg) > 0;
}
// bitwise_cast
template <class A, class T, class R, detail::rvv_enable_all_t<T> = 0, detail::rvv_enable_all_t<R> = 0>
- inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<rvv>) noexcept
{
return detail::rvv_reg_t<R, A::width>(arg.data.get_bytes(), types::detail::XSIMD_RVV_BITCAST);
}
// batch_bool_cast
template <class A, class T_out, class T_in, detail::rvv_enable_all_t<T_in> = 0>
- inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<rvv>) noexcept
{
using intermediate_t = typename detail::rvv_bool_t<T_out>;
return intermediate_t(arg.data);
@@ -1389,7 +1389,7 @@ namespace xsimd
// from_bool
template <class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
{
const auto zero = broadcast<A>(T(0), rvv {});
return detail::rvvmerge_splat(zero, T(1), arg);
@@ -1398,26 +1398,26 @@ namespace xsimd
namespace detail
{
template <size_t Width>
- inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i)
+ XSIMD_INLINE vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i)
{
return __riscv_vslidedown(arg, i, types::detail::rvv_width_m1 / 8);
}
template <>
- inline vuint8m1_t rvvslidedownbytes<types::detail::rvv_width_mf2>(vuint8m1_t arg, size_t i)
+ XSIMD_INLINE vuint8m1_t rvvslidedownbytes<types::detail::rvv_width_mf2>(vuint8m1_t arg, size_t i)
{
const auto bytes = __riscv_vlmul_trunc_u8mf2(arg);
const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf2 / 8);
return __riscv_vlmul_ext_u8m1(result);
}
template <>
- inline vuint8m1_t rvvslidedownbytes<types::detail::rvv_width_mf4>(vuint8m1_t arg, size_t i)
+ XSIMD_INLINE vuint8m1_t rvvslidedownbytes<types::detail::rvv_width_mf4>(vuint8m1_t arg, size_t i)
{
const auto bytes = __riscv_vlmul_trunc_u8mf4(arg);
const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf4 / 8);
return __riscv_vlmul_ext_u8m1(result);
}
template <>
- inline vuint8m1_t rvvslidedownbytes<types::detail::rvv_width_mf8>(vuint8m1_t arg, size_t i)
+ XSIMD_INLINE vuint8m1_t rvvslidedownbytes<types::detail::rvv_width_mf8>(vuint8m1_t arg, size_t i)
{
const auto bytes = __riscv_vlmul_trunc_u8mf8(arg);
const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf8 / 8);
@@ -1427,7 +1427,7 @@ namespace xsimd
// slide_left
template <size_t N, class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
const auto zero = broadcast<A>(uint8_t(0), rvv {});
const auto bytes = arg.data.get_bytes();
@@ -1436,7 +1436,7 @@ namespace xsimd
// slide_right
template <size_t N, class A, class T, detail::rvv_enable_all_t<T> = 0>
- inline batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
using reg_t = detail::rvv_reg_t<T, A::width>;
const auto bytes = arg.data.get_bytes();
@@ -1445,7 +1445,7 @@ namespace xsimd
// isnan
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
return !(arg == arg);
}
@@ -1456,13 +1456,13 @@ namespace xsimd
using rvv_as_signed_integer_t = as_signed_integer_t<as_unsigned_integer_t<T>>;
template <class A, class T, class U = rvv_as_signed_integer_t<T>>
- inline batch<U, A> rvvfcvt_default(batch<T, A> const& arg) noexcept
+ XSIMD_INLINE batch<U, A> rvvfcvt_default(batch<T, A> const& arg) noexcept
{
return rvvfcvt_rne(U {}, arg);
}
template <class A, class T, class U = rvv_as_signed_integer_t<T>>
- inline batch<U, A> rvvfcvt_afz(batch<T, A> const& arg) noexcept
+ XSIMD_INLINE batch<U, A> rvvfcvt_afz(batch<T, A> const& arg) noexcept
{
return rvvfcvt_rmm(U {}, arg);
}
@@ -1470,7 +1470,7 @@ namespace xsimd
// nearbyint_as_int
template <class A, class T, class U = detail::rvv_as_signed_integer_t<T>>
- inline batch<U, A> nearbyint_as_int(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<U, A> nearbyint_as_int(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
// Reference rounds ties to nearest even
return detail::rvvfcvt_default(arg);
@@ -1478,7 +1478,7 @@ namespace xsimd
// round
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> round(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> round(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
// Round ties away from zero.
const auto mask = abs(arg) < constants::maxflint<batch<T, A>>();
@@ -1487,7 +1487,7 @@ namespace xsimd
// nearbyint
template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
- inline batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+ XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<rvv>) noexcept
{
// Round according to current rounding mode.
const auto mask = abs(arg) < constants::maxflint<batch<T, A>>();
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_scalar.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_scalar.hpp
index 1f766ee73a3..c75a8d39efd 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_scalar.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_scalar.hpp
@@ -20,6 +20,8 @@
#include <limits>
#include <type_traits>
+#include "xsimd/config/xsimd_inline.hpp"
+
#ifdef XSIMD_ENABLE_XTL_COMPLEX
#error #include "xtl/xcomplex.hpp"
#endif
@@ -86,6 +88,57 @@ namespace xsimd
using std::tgamma;
using std::trunc;
+ XSIMD_INLINE signed char abs(signed char v)
+ {
+ return v < 0 ? -v : v;
+ }
+
+ namespace detail
+ {
+ // Use templated type here to prevent automatic instantiation that may
+ // end up in a warning
+ template <typename char_type>
+ XSIMD_INLINE char abs(char_type v, std::true_type)
+ {
+ return v;
+ }
+ template <typename char_type>
+ XSIMD_INLINE char abs(char_type v, std::false_type)
+ {
+ return v < 0 ? -v : v;
+ }
+ }
+
+ XSIMD_INLINE char abs(char v)
+ {
+ return detail::abs(v, std::is_unsigned<char>::type {});
+ }
+
+ XSIMD_INLINE short abs(short v)
+ {
+ return v < 0 ? -v : v;
+ }
+ XSIMD_INLINE unsigned char abs(unsigned char v)
+ {
+ return v;
+ }
+ XSIMD_INLINE unsigned short abs(unsigned short v)
+ {
+ return v;
+ }
+ XSIMD_INLINE unsigned int abs(unsigned int v)
+ {
+ return v;
+ }
+ XSIMD_INLINE unsigned long abs(unsigned long v)
+ {
+ return v;
+ }
+ XSIMD_INLINE unsigned long long abs(unsigned long long v)
+ {
+ return v;
+ }
+
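
    Whether plain char is signed is implementation-defined, so abs(char) above dispatches on std::is_unsigned<char> to avoid negating an unsigned value (and the warning that would come with it). A standalone sketch of the same tag-dispatch idiom, independent of xsimd:

    #include <iostream>
    #include <type_traits>

    namespace demo
    {
        // Selected when the character type is unsigned: nothing to do.
        template <class C>
        C abs_impl(C v, std::true_type /*is_unsigned*/)
        {
            return v;
        }
        // Selected when the character type is signed.
        template <class C>
        C abs_impl(C v, std::false_type /*is_unsigned*/)
        {
            return v < 0 ? -v : v;
        }

        char abs(char v)
        {
            // The tag picks the overload at compile time, so the signed branch is
            // never instantiated on platforms where char is unsigned.
            return abs_impl(v, typename std::is_unsigned<char>::type {});
        }
    }

    int main()
    {
        std::cout << static_cast<int>(demo::abs(char(-5))) << '\n'; // 5 where char is signed
        std::cout << std::boolalpha << std::is_signed<char>::value << '\n';
    }
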
#ifndef _WIN32
using std::isfinite;
using std::isinf;
@@ -94,42 +147,42 @@ namespace xsimd
// Windows defines catch all templates
template <class T>
- inline typename std::enable_if<std::is_floating_point<T>::value, bool>::type
+ XSIMD_INLINE typename std::enable_if<std::is_floating_point<T>::value, bool>::type
isfinite(T var) noexcept
{
return std::isfinite(var);
}
template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, bool>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, bool>::type
isfinite(T var) noexcept
{
return isfinite(double(var));
}
template <class T>
- inline typename std::enable_if<std::is_floating_point<T>::value, bool>::type
+ XSIMD_INLINE typename std::enable_if<std::is_floating_point<T>::value, bool>::type
isinf(T var) noexcept
{
return std::isinf(var);
}
template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, bool>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, bool>::type
isinf(T var) noexcept
{
return isinf(double(var));
}
template <class T>
- inline typename std::enable_if<std::is_floating_point<T>::value, bool>::type
+ XSIMD_INLINE typename std::enable_if<std::is_floating_point<T>::value, bool>::type
isnan(T var) noexcept
{
return std::isnan(var);
}
template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, bool>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, bool>::type
isnan(T var) noexcept
{
return isnan(double(var));
@@ -137,118 +190,136 @@ namespace xsimd
#endif
template <class T, class Tp>
- inline auto add(T const& x, Tp const& y) noexcept -> decltype(x + y)
+ XSIMD_INLINE typename std::common_type<T, Tp>::type add(T const& x, Tp const& y) noexcept
{
return x + y;
}
+ template <class T, class Tp>
+ XSIMD_INLINE typename std::common_type<T, Tp>::type avg(T const& x, Tp const& y) noexcept
+ {
+ using common_type = typename std::common_type<T, Tp>::type;
+ if (std::is_floating_point<common_type>::value)
+ return (x + y) / 2;
+ else if (std::is_unsigned<common_type>::value)
+ {
+ return (x & y) + ((x ^ y) >> 1);
+ }
+ else
+ {
+ // Inspired by
+ // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c
+ auto t = (x & y) + ((x ^ y) >> 1);
+ auto t_u = static_cast<typename std::make_unsigned<common_type>::type>(t);
+ auto avg = t + (static_cast<T>(t_u >> (8 * sizeof(T) - 1)) & (x ^ y));
+ return avg;
+ }
+ }
+
+ template <class T, class Tp>
+ XSIMD_INLINE typename std::common_type<T, Tp>::type avgr(T const& x, Tp const& y) noexcept
+ {
+ using common_type = typename std::common_type<T, Tp>::type;
+ if (std::is_floating_point<common_type>::value)
+ return avg(x, y);
+ else
+ {
+ return avg(x, y) + ((x ^ y) & 1);
+ }
+ }
+
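
    The integer path of avg relies on the overflow-free identity (x & y) + ((x ^ y) >> 1) == floor((x + y) / 2), and avgr rounds up by adding back the bit (x ^ y) & 1 dropped by the shift. The sketch below checks both identities for uint8_t against a widened reference; it is a verification aid, not part of the patch.

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    // floor((x + y) / 2) without intermediate overflow.
    uint8_t avg_floor(uint8_t x, uint8_t y)
    {
        return static_cast<uint8_t>((x & y) + ((x ^ y) >> 1));
    }

    // Round-half-up average: add back the bit lost to the shift when x and y
    // have different parities.
    uint8_t avg_round(uint8_t x, uint8_t y)
    {
        return static_cast<uint8_t>(avg_floor(x, y) + ((x ^ y) & 1));
    }

    int main()
    {
        for (unsigned x = 0; x < 256; ++x)
            for (unsigned y = 0; y < 256; ++y)
            {
                unsigned lo = (x + y) / 2;     // widened reference, cannot overflow
                unsigned hi = (x + y + 1) / 2;
                assert(avg_floor(uint8_t(x), uint8_t(y)) == lo);
                assert(avg_round(uint8_t(x), uint8_t(y)) == hi);
            }
        std::cout << "all 65536 pairs match\n";
    }
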
template <class T>
- inline T incr(T const& x) noexcept
+ XSIMD_INLINE T incr(T const& x) noexcept
{
return x + T(1);
}
template <class T>
- inline T incr_if(T const& x, bool mask) noexcept
+ XSIMD_INLINE T incr_if(T const& x, bool mask) noexcept
{
return x + T(mask ? 1 : 0);
}
- inline bool all(bool mask)
+ XSIMD_INLINE bool all(bool mask)
{
return mask;
}
- inline bool any(bool mask)
+ XSIMD_INLINE bool any(bool mask)
{
return mask;
}
- inline bool none(bool mask)
+ XSIMD_INLINE bool none(bool mask)
{
return !mask;
}
template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, T>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type
bitwise_and(T x, T y) noexcept
{
return x & y;
}
- inline float bitwise_and(float x, float y) noexcept
- {
- uint32_t ix, iy;
- std::memcpy((void*)&ix, (void*)&x, sizeof(float));
- std::memcpy((void*)&iy, (void*)&y, sizeof(float));
- uint32_t ir = bitwise_and(ix, iy);
- float r;
- std::memcpy((void*)&r, (void*)&ir, sizeof(float));
- return r;
- }
-
- inline double bitwise_and(double x, double y) noexcept
+ template <class T_out, class T_in>
+ XSIMD_INLINE T_out bitwise_cast(T_in x) noexcept
{
- uint64_t ix, iy;
- std::memcpy((void*)&ix, (void*)&x, sizeof(double));
- std::memcpy((void*)&iy, (void*)&y, sizeof(double));
- uint64_t ir = bitwise_and(ix, iy);
- double r;
- std::memcpy((void*)&r, (void*)&ir, sizeof(double));
+ static_assert(sizeof(T_in) == sizeof(T_out), "bitwise_cast between types of the same size");
+ T_out r;
+ std::memcpy((void*)&r, (void*)&x, sizeof(T_in));
return r;
}
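
    The float/double bitwise operations now funnel through a single memcpy-based bitwise_cast, the portable, defined-behavior way to reinterpret an object representation (unlike dereferencing a reinterpret_cast pointer). A small standalone sketch of the same idiom, with a hypothetical bit_cast_ref helper name:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <type_traits>

    // memcpy-based type punning: well-defined for trivially copyable types of
    // equal size, which is exactly what the kernel's bitwise_cast relies on.
    template <class To, class From>
    To bit_cast_ref(From const& src)
    {
        static_assert(sizeof(To) == sizeof(From), "types must have the same size");
        static_assert(std::is_trivially_copyable<From>::value && std::is_trivially_copyable<To>::value,
                      "types must be trivially copyable");
        To dst;
        std::memcpy(&dst, &src, sizeof(To));
        return dst;
    }

    int main()
    {
        float x = -1.5f;
        uint32_t bits = bit_cast_ref<uint32_t>(x);
        std::cout << std::hex << bits << '\n';           // bfc00000
        // Clear the sign bit and go back to float: a scalar bitwise_andnot.
        float y = bit_cast_ref<float>(bits & 0x7fffffffu);
        std::cout << y << '\n';                           // 1.5
    }
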
- template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, T>::type
- bitwise_andnot(T x, T y) noexcept
- {
- return x & ~y;
- }
-
- inline float bitwise_andnot(float x, float y) noexcept
+ XSIMD_INLINE float bitwise_and(float x, float y) noexcept
{
uint32_t ix, iy;
std::memcpy((void*)&ix, (void*)&x, sizeof(float));
std::memcpy((void*)&iy, (void*)&y, sizeof(float));
- uint32_t ir = bitwise_andnot(ix, iy);
+ uint32_t ir = bitwise_and(ix, iy);
float r;
std::memcpy((void*)&r, (void*)&ir, sizeof(float));
return r;
}
- inline double bitwise_andnot(double x, double y) noexcept
+ XSIMD_INLINE double bitwise_and(double x, double y) noexcept
{
uint64_t ix, iy;
std::memcpy((void*)&ix, (void*)&x, sizeof(double));
std::memcpy((void*)&iy, (void*)&y, sizeof(double));
- uint64_t ir = bitwise_andnot(ix, iy);
+ uint64_t ir = bitwise_and(ix, iy);
double r;
std::memcpy((void*)&r, (void*)&ir, sizeof(double));
return r;
}
template <class T0, class T1>
- inline typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
bitwise_lshift(T0 x, T1 shift) noexcept
{
return x << shift;
}
template <class T0, class T1>
- inline typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
bitwise_rshift(T0 x, T1 shift) noexcept
{
return x >> shift;
}
template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, T>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type
bitwise_not(T x) noexcept
{
return ~x;
}
- inline float bitwise_not(float x) noexcept
+ XSIMD_INLINE bool bitwise_not(bool x) noexcept
+ {
+ return !x;
+ }
+
+ XSIMD_INLINE float bitwise_not(float x) noexcept
{
uint32_t ix;
std::memcpy((void*)&ix, (void*)&x, sizeof(float));
@@ -258,7 +329,7 @@ namespace xsimd
return r;
}
- inline double bitwise_not(double x) noexcept
+ XSIMD_INLINE double bitwise_not(double x) noexcept
{
uint64_t ix;
std::memcpy((void*)&ix, (void*)&x, sizeof(double));
@@ -269,13 +340,19 @@ namespace xsimd
}
template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, T>::type
+ XSIMD_INLINE typename std::enable_if<std::is_scalar<T>::value, T>::type bitwise_andnot(T x, T y) noexcept
+ {
+ return bitwise_and(x, bitwise_not(y));
+ }
+
+ template <class T>
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type
bitwise_or(T x, T y) noexcept
{
return x | y;
}
- inline float bitwise_or(float x, float y) noexcept
+ XSIMD_INLINE float bitwise_or(float x, float y) noexcept
{
uint32_t ix, iy;
std::memcpy((void*)&ix, (void*)&x, sizeof(float));
@@ -286,7 +363,7 @@ namespace xsimd
return r;
}
- inline double bitwise_or(double x, double y) noexcept
+ XSIMD_INLINE double bitwise_or(double x, double y) noexcept
{
uint64_t ix, iy;
std::memcpy((void*)&ix, (void*)&x, sizeof(double));
@@ -298,13 +375,13 @@ namespace xsimd
}
template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, T>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type
bitwise_xor(T x, T y) noexcept
{
return x ^ y;
}
- inline float bitwise_xor(float x, float y) noexcept
+ XSIMD_INLINE float bitwise_xor(float x, float y) noexcept
{
uint32_t ix, iy;
std::memcpy((void*)&ix, (void*)&x, sizeof(float));
@@ -315,7 +392,7 @@ namespace xsimd
return r;
}
- inline double bitwise_xor(double x, double y) noexcept
+ XSIMD_INLINE double bitwise_xor(double x, double y) noexcept
{
uint64_t ix, iy;
std::memcpy((void*)&ix, (void*)&x, sizeof(double));
@@ -327,47 +404,47 @@ namespace xsimd
}
template <class T, class Tp>
- inline auto div(T const& x, Tp const& y) noexcept -> decltype(x / y)
+ XSIMD_INLINE typename std::common_type<T, Tp>::type div(T const& x, Tp const& y) noexcept
{
return x / y;
}
template <class T, class Tp>
- inline auto mod(T const& x, Tp const& y) noexcept -> decltype(x % y)
+ XSIMD_INLINE auto mod(T const& x, Tp const& y) noexcept -> decltype(x % y)
{
return x % y;
}
template <class T, class Tp>
- inline auto mul(T const& x, Tp const& y) noexcept -> decltype(x * y)
+ XSIMD_INLINE typename std::common_type<T, Tp>::type mul(T const& x, Tp const& y) noexcept
{
return x * y;
}
template <class T>
- inline auto neg(T const& x) noexcept -> decltype(-x)
+ XSIMD_INLINE T neg(T const& x) noexcept
{
return -x;
}
template <class T>
- inline auto pos(T const& x) noexcept -> decltype(+x)
+ XSIMD_INLINE auto pos(T const& x) noexcept -> decltype(+x)
{
return +x;
}
- inline float reciprocal(float const& x) noexcept
+ XSIMD_INLINE float reciprocal(float const& x) noexcept
{
return 1.f / x;
}
- inline double reciprocal(double const& x) noexcept
+ XSIMD_INLINE double reciprocal(double const& x) noexcept
{
return 1. / x;
}
template <class T0, class T1>
- inline typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
rotl(T0 x, T1 shift) noexcept
{
constexpr auto N = std::numeric_limits<T0>::digits;
@@ -375,7 +452,7 @@ namespace xsimd
}
template <class T0, class T1>
- inline typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
rotr(T0 x, T1 shift) noexcept
{
constexpr auto N = std::numeric_limits<T0>::digits;
@@ -383,19 +460,19 @@ namespace xsimd
}
template <class T>
- inline bool isnan(std::complex<T> var) noexcept
+ XSIMD_INLINE bool isnan(std::complex<T> var) noexcept
{
return std::isnan(std::real(var)) || std::isnan(std::imag(var));
}
template <class T>
- inline bool isinf(std::complex<T> var) noexcept
+ XSIMD_INLINE bool isinf(std::complex<T> var) noexcept
{
return std::isinf(std::real(var)) || std::isinf(std::imag(var));
}
template <class T>
- inline bool isfinite(std::complex<T> var) noexcept
+ XSIMD_INLINE bool isfinite(std::complex<T> var) noexcept
{
return std::isfinite(std::real(var)) && std::isfinite(std::imag(var));
}
@@ -424,130 +501,130 @@ namespace xsimd
#endif
template <typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline T clip(const T& val, const T& low, const T& hi) noexcept
+ XSIMD_INLINE T clip(const T& val, const T& low, const T& hi) noexcept
{
assert(low <= hi && "ordered clipping bounds");
return low > val ? low : (hi < val ? hi : val);
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline bool is_flint(const T& x) noexcept
+ XSIMD_INLINE bool is_flint(const T& x) noexcept
{
return std::isnan(x - x) ? false : (x - std::trunc(x)) == T(0);
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline bool is_even(const T& x) noexcept
+ XSIMD_INLINE bool is_even(const T& x) noexcept
{
return is_flint(x * T(0.5));
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline bool is_odd(const T& x) noexcept
+ XSIMD_INLINE bool is_odd(const T& x) noexcept
{
return is_even(x - 1.);
}
- inline int32_t nearbyint_as_int(float var) noexcept
+ XSIMD_INLINE int32_t nearbyint_as_int(float var) noexcept
{
return static_cast<int32_t>(std::nearbyint(var));
}
- inline int64_t nearbyint_as_int(double var) noexcept
+ XSIMD_INLINE int64_t nearbyint_as_int(double var) noexcept
{
return static_cast<int64_t>(std::nearbyint(var));
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline bool eq(const T& x0, const T& x1) noexcept
+ XSIMD_INLINE bool eq(const T& x0, const T& x1) noexcept
{
return x0 == x1;
}
template <class T>
- inline bool eq(const std::complex<T>& x0, const std::complex<T>& x1) noexcept
+ XSIMD_INLINE bool eq(const std::complex<T>& x0, const std::complex<T>& x1) noexcept
{
return x0 == x1;
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline bool ge(const T& x0, const T& x1) noexcept
+ XSIMD_INLINE bool ge(const T& x0, const T& x1) noexcept
{
return x0 >= x1;
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline bool gt(const T& x0, const T& x1) noexcept
+ XSIMD_INLINE bool gt(const T& x0, const T& x1) noexcept
{
return x0 > x1;
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline bool le(const T& x0, const T& x1) noexcept
+ XSIMD_INLINE bool le(const T& x0, const T& x1) noexcept
{
return x0 <= x1;
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline bool lt(const T& x0, const T& x1) noexcept
+ XSIMD_INLINE bool lt(const T& x0, const T& x1) noexcept
{
return x0 < x1;
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline bool neq(const T& x0, const T& x1) noexcept
+ XSIMD_INLINE bool neq(const T& x0, const T& x1) noexcept
{
return x0 != x1;
}
template <class T>
- inline bool neq(const std::complex<T>& x0, const std::complex<T>& x1) noexcept
+ XSIMD_INLINE bool neq(const std::complex<T>& x0, const std::complex<T>& x1) noexcept
{
return !(x0 == x1);
}
#if defined(__APPLE__) && (MAC_OS_X_VERSION_MIN_REQUIRED > 1080)
- inline float exp10(const float& x) noexcept
+ XSIMD_INLINE float exp10(const float& x) noexcept
{
return __exp10f(x);
}
- inline double exp10(const double& x) noexcept
+ XSIMD_INLINE double exp10(const double& x) noexcept
{
return __exp10(x);
}
#elif defined(__GLIBC__)
- inline float exp10(const float& x) noexcept
+ XSIMD_INLINE float exp10(const float& x) noexcept
{
return ::exp10f(x);
}
- inline double exp10(const double& x) noexcept
+ XSIMD_INLINE double exp10(const double& x) noexcept
{
return ::exp10(x);
}
#elif !defined(__clang__) && defined(__GNUC__) && (__GNUC__ >= 5)
- inline float exp10(const float& x) noexcept
+ XSIMD_INLINE float exp10(const float& x) noexcept
{
return __builtin_exp10f(x);
}
- inline double exp10(const double& x) noexcept
+ XSIMD_INLINE double exp10(const double& x) noexcept
{
return __builtin_exp10(x);
}
#elif defined(_WIN32)
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline T exp10(const T& x) noexcept
+ XSIMD_INLINE T exp10(const T& x) noexcept
{
// Very inefficient but other implementations give incorrect results
// on Windows
return std::pow(T(10), x);
}
#else
- inline float exp10(const float& x) noexcept
+ XSIMD_INLINE float exp10(const float& x) noexcept
{
const float ln10 = std::log(10.f);
return std::exp(ln10 * x);
}
- inline double exp10(const double& x) noexcept
+ XSIMD_INLINE double exp10(const double& x) noexcept
{
const double ln10 = std::log(10.);
return std::exp(ln10 * x);
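
    The final fallback implements exp10 through the identity 10^x = e^(x * ln 10). A quick standalone check against std::pow, assuming only the standard library:

    #include <cmath>
    #include <iostream>

    // Fallback identity used above: 10^x = exp(x * ln(10)).
    double exp10_ref(double x)
    {
        const double ln10 = std::log(10.0);
        return std::exp(ln10 * x);
    }

    int main()
    {
        for (double x : { -3.0, -0.5, 0.0, 0.5, 2.0, 7.25 })
        {
            double a = exp10_ref(x);
            double b = std::pow(10.0, x);
            std::cout << x << ": " << a << " vs " << b
                      << " (rel err " << std::abs(a - b) / b << ")\n";
        }
    }
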
@@ -555,7 +632,7 @@ namespace xsimd
#endif
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline auto rsqrt(const T& x) noexcept -> decltype(std::sqrt(x))
+ XSIMD_INLINE auto rsqrt(const T& x) noexcept -> decltype(std::sqrt(x))
{
using float_type = decltype(std::sqrt(x));
return static_cast<float_type>(1) / std::sqrt(x);
@@ -564,7 +641,7 @@ namespace xsimd
namespace detail
{
template <class C>
- inline C expm1_complex_scalar_impl(const C& val) noexcept
+ XSIMD_INLINE C expm1_complex_scalar_impl(const C& val) noexcept
{
using T = typename C::value_type;
T isin = std::sin(val.imag());
@@ -576,14 +653,14 @@ namespace xsimd
}
template <class T>
- inline std::complex<T> expm1(const std::complex<T>& val) noexcept
+ XSIMD_INLINE std::complex<T> expm1(const std::complex<T>& val) noexcept
{
return detail::expm1_complex_scalar_impl(val);
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
- inline xtl::xcomplex<T, T, i3ec> expm1(const xtl::xcomplex<T, T, i3ec>& val) noexcept
+ XSIMD_INLINE xtl::xcomplex<T, T, i3ec> expm1(const xtl::xcomplex<T, T, i3ec>& val) noexcept
{
return detail::expm1_complex_scalar_impl(val);
}
@@ -592,7 +669,7 @@ namespace xsimd
namespace detail
{
template <class C>
- inline C log1p_complex_scalar_impl(const C& val) noexcept
+ XSIMD_INLINE C log1p_complex_scalar_impl(const C& val) noexcept
{
using T = typename C::value_type;
C u = C(1.) + val;
@@ -601,19 +678,19 @@ namespace xsimd
}
template <class T>
- inline std::complex<T> log1p(const std::complex<T>& val) noexcept
+ XSIMD_INLINE std::complex<T> log1p(const std::complex<T>& val) noexcept
{
return detail::log1p_complex_scalar_impl(val);
}
template <class T>
- inline std::complex<T> log2(const std::complex<T>& val) noexcept
+ XSIMD_INLINE std::complex<T> log2(const std::complex<T>& val) noexcept
{
return log(val) / std::log(T(2));
}
template <typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline T sadd(const T& lhs, const T& rhs) noexcept
+ XSIMD_INLINE T sadd(const T& lhs, const T& rhs) noexcept
{
if (std::numeric_limits<T>::is_signed)
{
@@ -644,7 +721,7 @@ namespace xsimd
}
template <typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline T ssub(const T& lhs, const T& rhs) noexcept
+ XSIMD_INLINE T ssub(const T& lhs, const T& rhs) noexcept
{
if (std::numeric_limits<T>::is_signed)
{
@@ -680,7 +757,7 @@ namespace xsimd
using value_type_or_type = typename value_type_or_type_helper<T>::type;
template <class T0, class T1>
- inline typename std::enable_if<std::is_integral<T1>::value, T0>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T1>::value, T0>::type
ipow(const T0& x, const T1& n) noexcept
{
static_assert(std::is_integral<T1>::value, "second argument must be an integer");
@@ -706,14 +783,14 @@ namespace xsimd
}
template <class T0, class T1>
- inline typename std::enable_if<std::is_integral<T1>::value, T0>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T1>::value, T0>::type
pow(const T0& x, const T1& n) noexcept
{
return detail::ipow(x, n);
}
template <class T0, class T1>
- inline auto
+ XSIMD_INLINE auto
pow(const T0& t0, const T1& t1) noexcept
-> typename std::enable_if<std::is_scalar<T0>::value && std::is_floating_point<T1>::value, decltype(std::pow(t0, t1))>::type
{
@@ -721,21 +798,21 @@ namespace xsimd
}
template <class T0, class T1>
- inline typename std::enable_if<std::is_integral<T1>::value, std::complex<T0>>::type
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T1>::value, std::complex<T0>>::type
pow(const std::complex<T0>& t0, const T1& t1) noexcept
{
return detail::ipow(t0, t1);
}
template <class T0, class T1>
- inline typename std::enable_if<!std::is_integral<T1>::value, std::complex<T0>>::type
+ XSIMD_INLINE typename std::enable_if<!std::is_integral<T1>::value, std::complex<T0>>::type
pow(const std::complex<T0>& t0, const T1& t1) noexcept
{
return std::pow(t0, t1);
}
template <class T0, class T1>
- inline auto
+ XSIMD_INLINE auto
pow(const T0& t0, const std::complex<T1>& t1) noexcept
-> typename std::enable_if<std::is_scalar<T0>::value, decltype(std::pow(t0, t1))>::type
{
@@ -743,24 +820,39 @@ namespace xsimd
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline bool bitofsign(T const& x) noexcept
+ XSIMD_INLINE T bitofsign(T const& x) noexcept
+ {
+ return T(x < T(0));
+ }
+
+ XSIMD_INLINE float bitofsign(float const& x) noexcept
+ {
+ return float(std::signbit(x));
+ }
+
+ XSIMD_INLINE double bitofsign(double const& x) noexcept
+ {
+ return double(std::signbit(x));
+ }
+
+ XSIMD_INLINE long double bitofsign(long double const& x) noexcept
{
- return x < T(0);
+ return static_cast<long double>(std::signbit(x));
}
template <class T>
- inline auto signbit(T const& v) noexcept -> decltype(bitofsign(v))
+ XSIMD_INLINE auto signbit(T const& v) noexcept -> decltype(bitofsign(v))
{
return bitofsign(v);
}
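
    bitofsign now returns a value of the argument's own type (0 or 1), and the floating-point overloads go through std::signbit, which, unlike x < 0, also reports the sign of negative zero. A short standalone check, not part of the patch:

    #include <cmath>
    #include <iostream>

    // Mirrors the scalar kernels above: integers fall back to a comparison,
    // floating point uses std::signbit so that -0.0 is reported as negative.
    template <class T>
    T bitofsign_ref(T x)
    {
        return T(x < T(0));
    }

    float bitofsign_ref(float x) { return float(std::signbit(x)); }
    double bitofsign_ref(double x) { return double(std::signbit(x)); }

    int main()
    {
        std::cout << bitofsign_ref(-3) << ' '      // 1
                  << bitofsign_ref(3) << ' '       // 0
                  << bitofsign_ref(-0.0) << ' '    // 1  (x < 0 would say 0)
                  << bitofsign_ref(0.0) << '\n';   // 0
    }
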
- inline double sign(bool const& v) noexcept
+ XSIMD_INLINE double sign(bool const& v) noexcept
{
return v;
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline T sign(const T& v) noexcept
+ XSIMD_INLINE T sign(const T& v) noexcept
{
return v < T(0) ? T(-1.) : v == T(0) ? T(0.)
: T(1.);
@@ -769,7 +861,7 @@ namespace xsimd
namespace detail
{
template <class C>
- inline C sign_complex_scalar_impl(const C& v) noexcept
+ XSIMD_INLINE C sign_complex_scalar_impl(const C& v) noexcept
{
using value_type = typename C::value_type;
if (v.real())
@@ -784,51 +876,51 @@ namespace xsimd
}
template <class T>
- inline std::complex<T> sign(const std::complex<T>& v) noexcept
+ XSIMD_INLINE std::complex<T> sign(const std::complex<T>& v) noexcept
{
return detail::sign_complex_scalar_impl(v);
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
- inline xtl::xcomplex<T, T, i3ec> sign(const xtl::xcomplex<T, T, i3ec>& v) noexcept
+ XSIMD_INLINE xtl::xcomplex<T, T, i3ec> sign(const xtl::xcomplex<T, T, i3ec>& v) noexcept
{
return detail::sign_complex_scalar_impl(v);
}
#endif
- inline double signnz(bool const&) noexcept
+ XSIMD_INLINE double signnz(bool const&) noexcept
{
return 1;
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
- inline T signnz(const T& v) noexcept
+ XSIMD_INLINE T signnz(const T& v) noexcept
{
return v < T(0) ? T(-1.) : T(1.);
}
template <class T, class Tp>
- inline auto sub(T const& x, Tp const& y) noexcept -> decltype(x - y)
+ XSIMD_INLINE typename std::common_type<T, Tp>::type sub(T const& x, Tp const& y) noexcept
{
return x - y;
}
template <class T>
- inline T decr(T const& x) noexcept
+ XSIMD_INLINE T decr(T const& x) noexcept
{
return x - T(1);
}
template <class T>
- inline T decr_if(T const& x, bool mask) noexcept
+ XSIMD_INLINE T decr_if(T const& x, bool mask) noexcept
{
return x - T(mask ? 1 : 0);
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
- inline xtl::xcomplex<T, T, i3ec> log2(const xtl::xcomplex<T, T, i3ec>& val) noexcept
+ XSIMD_INLINE xtl::xcomplex<T, T, i3ec> log2(const xtl::xcomplex<T, T, i3ec>& val) noexcept
{
return log(val) / log(T(2));
}
@@ -836,14 +928,14 @@ namespace xsimd
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
- inline xtl::xcomplex<T, T, i3ec> log1p(const xtl::xcomplex<T, T, i3ec>& val) noexcept
+ XSIMD_INLINE xtl::xcomplex<T, T, i3ec> log1p(const xtl::xcomplex<T, T, i3ec>& val) noexcept
{
return detail::log1p_complex_scalar_impl(val);
}
#endif
template <class T0, class T1>
- inline auto min(T0 const& self, T1 const& other) noexcept
+ XSIMD_INLINE auto min(T0 const& self, T1 const& other) noexcept
-> typename std::enable_if<std::is_scalar<T0>::value && std::is_scalar<T1>::value,
typename std::decay<decltype(self > other ? other : self)>::type>::type
{
@@ -852,14 +944,14 @@ namespace xsimd
// numpy defines minimum operator on complex using lexical comparison
template <class T0, class T1>
- inline std::complex<typename std::common_type<T0, T1>::type>
+ XSIMD_INLINE std::complex<typename std::common_type<T0, T1>::type>
min(std::complex<T0> const& self, std::complex<T1> const& other) noexcept
{
return (self.real() < other.real()) ? (self) : (self.real() == other.real() ? (self.imag() < other.imag() ? self : other) : other);
}
template <class T0, class T1>
- inline auto max(T0 const& self, T1 const& other) noexcept
+ XSIMD_INLINE auto max(T0 const& self, T1 const& other) noexcept
-> typename std::enable_if<std::is_scalar<T0>::value && std::is_scalar<T1>::value,
typename std::decay<decltype(self > other ? other : self)>::type>::type
{
@@ -868,26 +960,26 @@ namespace xsimd
// numpy defines maximum operator on complex using lexical comparison
template <class T0, class T1>
- inline std::complex<typename std::common_type<T0, T1>::type>
+ XSIMD_INLINE std::complex<typename std::common_type<T0, T1>::type>
max(std::complex<T0> const& self, std::complex<T1> const& other) noexcept
{
return (self.real() > other.real()) ? (self) : (self.real() == other.real() ? (self.imag() > other.imag() ? self : other) : other);
}
template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, T>::type fma(const T& a, const T& b, const T& c) noexcept
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type fma(const T& a, const T& b, const T& c) noexcept
{
return a * b + c;
}
template <class T>
- inline typename std::enable_if<std::is_floating_point<T>::value, T>::type fma(const T& a, const T& b, const T& c) noexcept
+ XSIMD_INLINE typename std::enable_if<std::is_floating_point<T>::value, T>::type fma(const T& a, const T& b, const T& c) noexcept
{
return std::fma(a, b, c);
}
template <class T>
- inline typename std::enable_if<std::is_scalar<T>::value, T>::type fms(const T& a, const T& b, const T& c) noexcept
+ XSIMD_INLINE typename std::enable_if<std::is_scalar<T>::value, T>::type fms(const T& a, const T& b, const T& c) noexcept
{
return a * b - c;
}
@@ -895,7 +987,7 @@ namespace xsimd
namespace detail
{
template <class C>
- inline C fma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+ XSIMD_INLINE C fma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
{
return { fms(a.real(), b.real(), fms(a.imag(), b.imag(), c.real())),
fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag())) };
@@ -903,14 +995,14 @@ namespace xsimd
}
template <class T>
- inline std::complex<T> fma(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+ XSIMD_INLINE std::complex<T> fma(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
{
return detail::fma_complex_scalar_impl(a, b, c);
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
- inline xtl::xcomplex<T, T, i3ec> fma(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+ XSIMD_INLINE xtl::xcomplex<T, T, i3ec> fma(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
{
return detail::fma_complex_scalar_impl(a, b, c);
}
@@ -919,7 +1011,7 @@ namespace xsimd
namespace detail
{
template <class C>
- inline C fms_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+ XSIMD_INLINE C fms_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
{
return { fms(a.real(), b.real(), fma(a.imag(), b.imag(), c.real())),
fma(a.real(), b.imag(), fms(a.imag(), b.real(), c.imag())) };
@@ -927,27 +1019,27 @@ namespace xsimd
}
template <class T>
- inline std::complex<T> fms(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+ XSIMD_INLINE std::complex<T> fms(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
{
return detail::fms_complex_scalar_impl(a, b, c);
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
- inline xtl::xcomplex<T, T, i3ec> fms(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+ XSIMD_INLINE xtl::xcomplex<T, T, i3ec> fms(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
{
return detail::fms_complex_scalar_impl(a, b, c);
}
#endif
template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, T>::type fnma(const T& a, const T& b, const T& c) noexcept
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type fnma(const T& a, const T& b, const T& c) noexcept
{
return -(a * b) + c;
}
template <class T>
- inline typename std::enable_if<std::is_floating_point<T>::value, T>::type fnma(const T& a, const T& b, const T& c) noexcept
+ XSIMD_INLINE typename std::enable_if<std::is_floating_point<T>::value, T>::type fnma(const T& a, const T& b, const T& c) noexcept
{
return std::fma(-a, b, c);
}
@@ -955,7 +1047,7 @@ namespace xsimd
namespace detail
{
template <class C>
- inline C fnma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+ XSIMD_INLINE C fnma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
{
return { fms(a.imag(), b.imag(), fms(a.real(), b.real(), c.real())),
-fma(a.real(), b.imag(), fms(a.imag(), b.real(), c.imag())) };
@@ -963,27 +1055,27 @@ namespace xsimd
}
template <class T>
- inline std::complex<T> fnma(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+ XSIMD_INLINE std::complex<T> fnma(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
{
return detail::fnma_complex_scalar_impl(a, b, c);
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
- inline xtl::xcomplex<T, T, i3ec> fnma(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+ XSIMD_INLINE xtl::xcomplex<T, T, i3ec> fnma(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
{
return detail::fnma_complex_scalar_impl(a, b, c);
}
#endif
template <class T>
- inline typename std::enable_if<std::is_integral<T>::value, T>::type fnms(const T& a, const T& b, const T& c) noexcept
+ XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type fnms(const T& a, const T& b, const T& c) noexcept
{
return -(a * b) - c;
}
template <class T>
- inline typename std::enable_if<std::is_floating_point<T>::value, T>::type fnms(const T& a, const T& b, const T& c) noexcept
+ XSIMD_INLINE typename std::enable_if<std::is_floating_point<T>::value, T>::type fnms(const T& a, const T& b, const T& c) noexcept
{
return -std::fma(a, b, c);
}
@@ -991,7 +1083,7 @@ namespace xsimd
namespace detail
{
template <class C>
- inline C fnms_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+ XSIMD_INLINE C fnms_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
{
return { fms(a.imag(), b.imag(), fma(a.real(), b.real(), c.real())),
-fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag())) };
@@ -999,14 +1091,14 @@ namespace xsimd
}
template <class T>
- inline std::complex<T> fnms(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+ XSIMD_INLINE std::complex<T> fnms(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
{
return detail::fnms_complex_scalar_impl(a, b, c);
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
- inline xtl::xcomplex<T, T, i3ec> fnms(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+ XSIMD_INLINE xtl::xcomplex<T, T, i3ec> fnms(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
{
return detail::fnms_complex_scalar_impl(a, b, c);
}
@@ -1014,14 +1106,14 @@ namespace xsimd
namespace detail
{
-#define XSIMD_HASSINCOS_TRAIT(func) \
- template <class S> \
- struct has##func \
- { \
- template <class T> \
- static inline auto get(T* ptr) -> decltype(func(std::declval<T>(), std::declval<T*>(), std::declval<T*>()), std::true_type {}); \
- static inline std::false_type get(...); \
- static constexpr bool value = decltype(get((S*)nullptr))::value; \
+#define XSIMD_HASSINCOS_TRAIT(func) \
+ template <class S> \
+ struct has##func \
+ { \
+ template <class T> \
+ static XSIMD_INLINE auto get(T* ptr) -> decltype(func(std::declval<T>(), std::declval<T*>(), std::declval<T*>()), std::true_type {}); \
+ static XSIMD_INLINE std::false_type get(...); \
+ static constexpr bool value = decltype(get((S*)nullptr))::value; \
}
#define XSIMD_HASSINCOS(func, T) has##func<T>::value
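
    The macro builds, per function name, an expression-SFINAE detector: has##func<T>::value is true exactly when func(T, T*, T*) is callable. The standalone sketch below spells out the same detection idiom for a single user-defined function, my_sincos, which exists only in this example:

    #include <cmath>
    #include <iostream>
    #include <type_traits>
    #include <utility>

    // A function that exists only for this example.
    void my_sincos(float val, float* s, float* c)
    {
        *s = std::sin(val);
        *c = std::cos(val);
    }

    // Expression-SFINAE detector, same shape as the has##func trait generated by
    // the macro: the first get() is viable only if my_sincos(T, T*, T*) compiles.
    template <class S>
    struct has_my_sincos
    {
        template <class T>
        static auto get(T*) -> decltype(my_sincos(std::declval<T>(), std::declval<T*>(), std::declval<T*>()), std::true_type {});
        static std::false_type get(...);
        static constexpr bool value = decltype(get(static_cast<S*>(nullptr)))::value;
    };

    int main()
    {
        std::cout << std::boolalpha
                  << has_my_sincos<float>::value << ' '    // true
                  << has_my_sincos<double>::value << '\n'; // false: double* does not convert to float*
    }
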
@@ -1034,21 +1126,21 @@ namespace xsimd
struct generic_sincosf
{
template <class T>
- inline typename std::enable_if<XSIMD_HASSINCOS(sincosf, T), void>::type
+ XSIMD_INLINE typename std::enable_if<XSIMD_HASSINCOS(sincosf, T), void>::type
operator()(float val, T& s, T& c)
{
sincosf(val, &s, &c);
}
template <class T>
- inline typename std::enable_if<!XSIMD_HASSINCOS(sincosf, T) && XSIMD_HASSINCOS(__sincosf, T), void>::type
+ XSIMD_INLINE typename std::enable_if<!XSIMD_HASSINCOS(sincosf, T) && XSIMD_HASSINCOS(__sincosf, T), void>::type
operator()(float val, T& s, T& c)
{
__sincosf(val, &s, &c);
}
template <class T>
- inline typename std::enable_if<!XSIMD_HASSINCOS(sincosf, T) && !XSIMD_HASSINCOS(__sincosf, T), void>::type
+ XSIMD_INLINE typename std::enable_if<!XSIMD_HASSINCOS(sincosf, T) && !XSIMD_HASSINCOS(__sincosf, T), void>::type
operator()(float val, T& s, T& c)
{
s = std::sin(val);
@@ -1059,21 +1151,21 @@ namespace xsimd
struct generic_sincos
{
template <class T>
- inline typename std::enable_if<XSIMD_HASSINCOS(sincos, T), void>::type
+ XSIMD_INLINE typename std::enable_if<XSIMD_HASSINCOS(sincos, T), void>::type
operator()(double val, T& s, T& c)
{
sincos(val, &s, &c);
}
template <class T>
- inline typename std::enable_if<!XSIMD_HASSINCOS(sincos, T) && XSIMD_HASSINCOS(__sincos, T), void>::type
+ XSIMD_INLINE typename std::enable_if<!XSIMD_HASSINCOS(sincos, T) && XSIMD_HASSINCOS(__sincos, T), void>::type
operator()(double val, T& s, T& c)
{
__sincos(val, &s, &c);
}
template <class T>
- inline typename std::enable_if<!XSIMD_HASSINCOS(sincos, T) && !XSIMD_HASSINCOS(__sincos, T), void>::type
+ XSIMD_INLINE typename std::enable_if<!XSIMD_HASSINCOS(sincos, T) && !XSIMD_HASSINCOS(__sincos, T), void>::type
operator()(double val, T& s, T& c)
{
s = std::sin(val);
@@ -1085,14 +1177,14 @@ namespace xsimd
#undef XSIMD_HASSINCOS
}
- inline std::pair<float, float> sincos(float val) noexcept
+ XSIMD_INLINE std::pair<float, float> sincos(float val) noexcept
{
float s, c;
detail::generic_sincosf {}(val, s, c);
return std::make_pair(s, c);
}
- inline std::pair<double, double> sincos(double val) noexcept
+ XSIMD_INLINE std::pair<double, double> sincos(double val) noexcept
{
double s, c;
detail::generic_sincos {}(val, s, c);
@@ -1100,7 +1192,7 @@ namespace xsimd
}
template <class T>
- inline std::pair<std::complex<T>, std::complex<T>>
+ XSIMD_INLINE std::pair<std::complex<T>, std::complex<T>>
sincos(const std::complex<T>& val) noexcept
{
return std::make_pair(std::sin(val), std::cos(val));
@@ -1108,20 +1200,20 @@ namespace xsimd
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T>
- inline std::pair<xtl::xcomplex<T>, xtl::xcomplex<T>> sincos(const xtl::xcomplex<T>& val) noexcept
+ XSIMD_INLINE std::pair<xtl::xcomplex<T>, xtl::xcomplex<T>> sincos(const xtl::xcomplex<T>& val) noexcept
{
return std::make_pair(sin(val), cos(val));
}
#endif
template <class T, class _ = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
- inline T frexp(T const& val, int& exp) noexcept
+ XSIMD_INLINE T frexp(T const& val, int& exp) noexcept
{
return std::frexp(val, &exp);
}
template <class T>
- inline T select(bool cond, T const& true_br, T const& false_br) noexcept
+ XSIMD_INLINE T select(bool cond, T const& true_br, T const& false_br) noexcept
{
return cond ? true_br : false_br;
}
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse2.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse2.hpp
index 1639ba2bfab..67b74f54825 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse2.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse2.hpp
@@ -20,13 +20,13 @@
namespace xsimd
{
- template <class batch_type, bool... Values>
+ template <typename T, class A, bool... Values>
struct batch_bool_constant;
template <class T_out, class T_in, class A>
- inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
+ XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
- template <class batch_type, typename batch_type::value_type... Values>
+ template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@@ -57,19 +57,23 @@ namespace xsimd
// fwd
template <class A, class T, size_t I>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
template <class A, typename T, typename ITy, ITy... Indices>
- inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
// abs
template <class A>
- inline batch<double, A> abs(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<sse2>) noexcept
{
__m128d sign_mask = _mm_set1_pd(-0.f); // -0.f = 1 << 31
return _mm_andnot_pd(sign_mask, self);
}
template <class A>
- inline batch<float, A> abs(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<sse2>) noexcept
{
__m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
return _mm_andnot_ps(sign_mask, self);
@@ -77,7 +81,7 @@ namespace xsimd
// add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -103,130 +107,168 @@ namespace xsimd
}
template <class A>
- inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_add_ps(self, other);
}
template <class A>
- inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_add_pd(self, other);
}
// all
template <class A>
- inline bool all(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE bool all(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_movemask_ps(self) == 0x0F;
}
template <class A>
- inline bool all(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE bool all(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_movemask_pd(self) == 0x03;
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline bool all(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_movemask_epi8(self) == 0xFFFF;
}
// any
template <class A>
- inline bool any(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE bool any(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_movemask_ps(self) != 0;
}
template <class A>
- inline bool any(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE bool any(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_movemask_pd(self) != 0;
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline bool any(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_movemask_epi8(self) != 0;
}
+ // avgr
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_avg_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_avg_epu16(self, other);
+ }
+ else
+ {
+ return avgr(self, other, generic {});
+ }
+ }
+
+ // avg
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto adj = ((self ^ other) << 7) >> 7;
+ return avgr(self, other, A {}) - adj;
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto adj = ((self ^ other) << 15) >> 15;
+ return avgr(self, other, A {}) - adj;
+ }
+ else
+ {
+ return avg(self, other, generic {});
+ }
+ }
+
// batch_bool_cast
template <class A, class T_out, class T_in>
- inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept
{
return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
}
// bitwise_and
template <class A>
- inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_and_ps(self, other);
}
template <class A>
- inline batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_and_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_and_si128(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_and_si128(self, other);
}
template <class A>
- batch<double, A> inline bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ batch<double, A> XSIMD_INLINE bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_and_pd(self, other);
}
template <class A>
- inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_and_pd(self, other);
}
// bitwise_andnot
template <class A>
- inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_andnot_ps(other, self);
}
template <class A>
- inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_andnot_ps(other, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_andnot_si128(other, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_andnot_si128(other, self);
}
template <class A>
- inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_andnot_pd(other, self);
}
template <class A>
- inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_andnot_pd(other, self);
}
// bitwise_lshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -253,73 +295,73 @@ namespace xsimd
// bitwise_not
template <class A>
- inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
}
template <class A>
- inline batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_xor_si128(self, _mm_set1_epi32(-1));
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_xor_si128(self, _mm_set1_epi32(-1));
}
template <class A>
- inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
}
template <class A>
- inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
}
// bitwise_or
template <class A>
- inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_or_ps(self, other);
}
template <class A>
- inline batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_or_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_or_si128(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_or_si128(self, other);
}
template <class A>
- inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_or_pd(self, other);
}
template <class A>
- inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_or_pd(self, other);
}
// bitwise_rshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -381,81 +423,81 @@ namespace xsimd
// bitwise_xor
template <class A>
- inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_xor_ps(self, other);
}
template <class A>
- inline batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_xor_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_xor_si128(self, other);
}
template <class A>
- inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_xor_pd(self, other);
}
template <class A>
- inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_xor_pd(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_xor_si128(self, other);
}
// bitwise_cast
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
{
return _mm_castsi128_ps(self);
}
template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
- inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<sse2>) noexcept
{
return batch<Tp, A>(self.data);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
{
return _mm_castps_si128(self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
{
return _mm_castsi128_pd(self);
}
template <class A>
- inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
{
return _mm_castps_pd(self);
}
template <class A>
- inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
{
return _mm_castpd_ps(self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
{
return _mm_castpd_si128(self);
}
// broadcast
template <class A>
- batch<float, A> inline broadcast(float val, requires_arch<sse2>) noexcept
+ batch<float, A> XSIMD_INLINE broadcast(float val, requires_arch<sse2>) noexcept
{
return _mm_set1_ps(val);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> broadcast(T val, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -480,7 +522,7 @@ namespace xsimd
}
}
template <class A>
- inline batch<double, A> broadcast(double val, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<sse2>) noexcept
{
return _mm_set1_pd(val);
}
@@ -491,23 +533,23 @@ namespace xsimd
// Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned
// complex_low
template <class A>
- inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_unpacklo_ps(self.real(), self.imag());
}
// complex_high
template <class A>
- inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_unpackhi_ps(self.real(), self.imag());
}
template <class A>
- inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_unpacklo_pd(self.real(), self.imag());
}
template <class A>
- inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_unpackhi_pd(self.real(), self.imag());
}
@@ -515,19 +557,19 @@ namespace xsimd
// decr_if
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
{
return self + batch<T, A>(mask.data);
}
// div
template <class A>
- inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_div_ps(self, other);
}
template <class A>
- inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_div_pd(self, other);
}
@@ -536,13 +578,13 @@ namespace xsimd
namespace detail
{
template <class A>
- inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
{
return _mm_cvtepi32_ps(self);
}
template <class A>
- inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
// adapted to sse2
@@ -555,7 +597,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
// adapted to sse2
@@ -569,7 +611,7 @@ namespace xsimd
}
template <class A>
- inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<sse2>) noexcept
{
return _mm_cvttps_epi32(self);
}
@@ -577,17 +619,17 @@ namespace xsimd
// eq
template <class A>
- inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmpeq_ps(self, other);
}
template <class A>
- inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other)));
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -616,24 +658,24 @@ namespace xsimd
}
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
{
return ~(self != other);
}
template <class A>
- inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmpeq_pd(self, other);
}
template <class A>
- inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other)));
}
// from_mask
template <class A>
- inline batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
{
alignas(A::alignment()) static const uint32_t lut[][4] = {
{ 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
@@ -657,7 +699,7 @@ namespace xsimd
return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask]));
}
template <class A>
- inline batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
{
alignas(A::alignment()) static const uint64_t lut[][4] = {
{ 0x0000000000000000ul, 0x0000000000000000ul },
@@ -669,7 +711,7 @@ namespace xsimd
return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask]));
}
template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
{
alignas(A::alignment()) static const uint64_t lut64[] = {
0x0000000000000000,
@@ -729,24 +771,24 @@ namespace xsimd
// ge
template <class A>
- inline batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmpge_ps(self, other);
}
template <class A>
- inline batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmpge_pd(self, other);
}
// gt
template <class A>
- inline batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmpgt_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -774,14 +816,14 @@ namespace xsimd
}
template <class A>
- inline batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmpgt_pd(self, other);
}
// haddp
template <class A>
- inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse2>) noexcept
{
__m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
__m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
@@ -794,7 +836,7 @@ namespace xsimd
return _mm_add_ps(tmp0, tmp2);
}
template <class A>
- inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse2>) noexcept
{
return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]),
_mm_unpackhi_pd(row[0], row[1]));
@@ -802,14 +844,14 @@ namespace xsimd
// incr_if
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
{
return self - batch<T, A>(mask.data);
}
// insert
template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
@@ -823,46 +865,46 @@ namespace xsimd
// isnan
template <class A>
- inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_cmpunord_ps(self, self);
}
template <class A>
- inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_cmpunord_pd(self, self);
}
// load_aligned
template <class A>
- inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
{
return _mm_load_ps(mem);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
{
return _mm_load_si128((__m128i const*)mem);
}
template <class A>
- inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
{
return _mm_load_pd(mem);
}
// load_unaligned
template <class A>
- inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
{
return _mm_loadu_ps(mem);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
{
return _mm_loadu_si128((__m128i const*)mem);
}
template <class A>
- inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
{
return _mm_loadu_pd(mem);
}
@@ -872,12 +914,12 @@ namespace xsimd
{
// Redefine these methods in the SSE-based archs if required
template <class A>
- inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<sse2>) noexcept
{
return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) };
}
template <class A>
- inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<sse2>) noexcept
{
return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) };
}
@@ -885,24 +927,24 @@ namespace xsimd
// le
template <class A>
- inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmple_ps(self, other);
}
template <class A>
- inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmple_pd(self, other);
}
// lt
template <class A>
- inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmplt_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -969,7 +1011,7 @@ namespace xsimd
}
template <class A>
- inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmplt_pd(self, other);
}
@@ -979,7 +1021,7 @@ namespace xsimd
*/
namespace detail
{
- inline int mask_lut(int mask)
+ XSIMD_INLINE int mask_lut(int mask)
{
// clang-format off
static const int mask_lut[256] = {
@@ -1007,7 +1049,7 @@ namespace xsimd
// mask
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -1033,85 +1075,92 @@ namespace xsimd
}
}
template <class A>
- inline uint64_t mask(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_movemask_ps(self);
}
template <class A>
- inline uint64_t mask(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_movemask_pd(self);
}
// max
template <class A>
- inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_max_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
return select(self > other, self, other);
}
template <class A>
- inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_max_pd(self, other);
}
// min
template <class A>
- inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_min_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
return select(self <= other, self, other);
}
template <class A>
- inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_min_pd(self, other);
}
// mul
template <class A>
- inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_mul_ps(self, other);
}
template <class A>
- inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_mul_pd(self, other);
}
+ // mul
+ template <class A>
+ XSIMD_INLINE batch<int16_t, A> mul(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_mullo_epi16(self, other);
+ }
+
// nearbyint_as_int
template <class A>
- inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
- requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<sse2>) noexcept
{
return _mm_cvtps_epi32(self);
}
// neg
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> neg(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<sse2>) noexcept
{
return 0 - self;
}
template <class A>
- inline batch<float, A> neg(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> neg(batch<float, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
}
template <class A>
- inline batch<double, A> neg(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> neg(batch<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_xor_pd(
self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000)));
@@ -1119,57 +1168,94 @@ namespace xsimd
// neq
template <class A>
- inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmpneq_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
return ~(self == other);
}
template <class A>
- inline batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_xor_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data)));
}
template <class A>
- inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_cmpneq_pd(self, other);
}
template <class A>
- inline batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_xor_pd(self, other);
}
// reciprocal
template <class A>
- inline batch<float, A> reciprocal(batch<float, A> const& self,
- kernel::requires_arch<sse2>)
+ XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self,
+ kernel::requires_arch<sse2>)
{
return _mm_rcp_ps(self);
}
// reduce_add
template <class A>
- inline float reduce_add(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE float reduce_add(batch<float, A> const& self, requires_arch<sse2>) noexcept
{
__m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self));
__m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
return _mm_cvtss_f32(tmp1);
}
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
+ __m128i tmp2 = _mm_add_epi32(self, tmp1);
+ __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
+ __m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
+ return _mm_cvtsi128_si32(tmp4);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
+ __m128i tmp2 = _mm_add_epi64(self, tmp1);
+#if defined(__x86_64__)
+ return _mm_cvtsi128_si64(tmp2);
+#else
+ __m128i m;
+ _mm_storel_epi64(&m, tmp2);
+ int64_t i;
+ std::memcpy(&i, &m, sizeof(i));
+ return i;
+#endif
+ }
+ else
+ {
+ return hadd(self, generic {});
+ }
+ }
+
+ template <class A>
+ XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
+ }
+
// reduce_max
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
- inline T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
{
constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
@@ -1191,7 +1277,7 @@ namespace xsimd
// reduce_min
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
- inline T reduce_min(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<sse2>) noexcept
{
constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
@@ -1211,80 +1297,44 @@ namespace xsimd
return acc3.get(0);
}
- template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
- {
- XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
- {
- __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
- __m128i tmp2 = _mm_add_epi32(self, tmp1);
- __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
- __m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
- return _mm_cvtsi128_si32(tmp4);
- }
- else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
- {
- __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
- __m128i tmp2 = _mm_add_epi64(self, tmp1);
-#if defined(__x86_64__)
- return _mm_cvtsi128_si64(tmp2);
-#else
- __m128i m;
- _mm_storel_epi64(&m, tmp2);
- int64_t i;
- std::memcpy(&i, &m, sizeof(i));
- return i;
-#endif
- }
- else
- {
- return hadd(self, generic {});
- }
- }
- template <class A>
- inline double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
- {
- return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
- }
-
// rsqrt
template <class A>
- inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
{
return _mm_rsqrt_ps(val);
}
template <class A>
- inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
{
return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val)));
}
// select
template <class A>
- inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse2>) noexcept
{
return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br));
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
{
return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br));
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, sse2 {});
}
template <class A>
- inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse2>) noexcept
{
return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br));
}
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
- inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
{
constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
// shuffle within lane
@@ -1298,7 +1348,7 @@ namespace xsimd
}
template <class A, class ITy, ITy I0, ITy I1>
- inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1> mask, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1> mask, requires_arch<sse2>) noexcept
{
constexpr uint32_t smask = detail::mod_shuffle(I0, I1);
// shuffle within lane
@@ -1313,26 +1363,26 @@ namespace xsimd
// sqrt
template <class A>
- inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
{
return _mm_sqrt_ps(val);
}
template <class A>
- inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
{
return _mm_sqrt_pd(val);
}
// slide_left
template <size_t N, class A, class T>
- inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<sse2>) noexcept
{
return _mm_slli_si128(x, N);
}
// slide_right
template <size_t N, class A, class T>
- inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<sse2>) noexcept
{
return _mm_srli_si128(x, N);
}
@@ -1340,7 +1390,7 @@ namespace xsimd
// sadd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -1376,55 +1426,55 @@ namespace xsimd
// set
template <class A, class... Values>
- inline batch<float, A> set(batch<float, A> const&, requires_arch<sse2>, Values... values) noexcept
+ XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<sse2>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
return _mm_setr_ps(values...);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1) noexcept
{
return _mm_set_epi64x(v1, v0);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3) noexcept
{
return _mm_setr_epi32(v0, v1, v2, v3);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
{
return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
{
return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
}
template <class A, class... Values>
- inline batch<double, A> set(batch<double, A> const&, requires_arch<sse2>, Values... values) noexcept
+ XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<sse2>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
return _mm_setr_pd(values...);
}
template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sse2>, Values... values) noexcept
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sse2>, Values... values) noexcept
{
return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
}
template <class A, class... Values>
- inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<sse2>, Values... values) noexcept
+ XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<sse2>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
return _mm_castsi128_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
}
template <class A, class... Values>
- inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<sse2>, Values... values) noexcept
+ XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<sse2>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
return _mm_castsi128_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
@@ -1433,7 +1483,7 @@ namespace xsimd
// ssub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -1469,56 +1519,56 @@ namespace xsimd
// store_aligned
template <class A>
- inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_store_ps(mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_store_si128((__m128i*)mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_store_si128((__m128i*)mem, self);
}
template <class A>
- inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_store_pd(mem, self);
}
// store_unaligned
template <class A>
- inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_storeu_ps(mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_storeu_si128((__m128i*)mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_storeu_si128((__m128i*)mem, self);
}
template <class A>
- inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+ XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_storeu_pd(mem, self);
}
// sub
template <class A>
- inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_sub_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -1543,7 +1593,7 @@ namespace xsimd
}
}
template <class A>
- inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_sub_pd(self, other);
}
@@ -1551,53 +1601,53 @@ namespace xsimd
// swizzle
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
- inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
return _mm_shuffle_ps(self, self, index);
}
template <class A, uint64_t V0, uint64_t V1>
- inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(V0, V1);
return _mm_shuffle_pd(self, self, index);
}
template <class A, uint64_t V0, uint64_t V1>
- inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
return _mm_shuffle_epi32(self, index);
}
template <class A, uint64_t V0, uint64_t V1>
- inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<sse2>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
- inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
return _mm_shuffle_epi32(self, index);
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
- inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
}
// zip_hi
template <class A>
- inline batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_unpackhi_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -1622,19 +1672,19 @@ namespace xsimd
}
}
template <class A>
- inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_unpackhi_pd(self, other);
}
// zip_lo
template <class A>
- inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_unpacklo_ps(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -1659,7 +1709,7 @@ namespace xsimd
}
}
template <class A>
- inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
{
return _mm_unpacklo_pd(self, other);
}
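
Across every file in this series the kernels drop the plain `inline` specifier in favour of the `XSIMD_INLINE` macro. The macro's definition is not part of these hunks; the snippet below is only a guess at the usual shape of such a macro (force inlining where the compiler supports it, fall back to plain `inline` otherwise), not xsimd's actual header:

// Hypothetical definition, for illustration only -- the real one lives in
// xsimd's configuration headers and may differ.
#if defined(__GNUC__) || defined(__clang__)
#define XSIMD_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
#define XSIMD_INLINE __forceinline
#else
#define XSIMD_INLINE inline
#endif
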
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse3.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse3.hpp
index ccc049795ca..ffdc5bc9fab 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse3.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse3.hpp
@@ -24,34 +24,34 @@ namespace xsimd
// haddp
template <class A>
- inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
+ XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
{
return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
_mm_hadd_ps(row[2], row[3]));
}
template <class A>
- inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
+ XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
{
return _mm_hadd_pd(row[0], row[1]);
}
// load_unaligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
{
return _mm_lddqu_si128((__m128i const*)mem);
}
// reduce_add
template <class A>
- inline float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
+ XSIMD_INLINE float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
{
__m128 tmp0 = _mm_hadd_ps(self, self);
__m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
return _mm_cvtss_f32(tmp1);
}
template <class A>
- inline double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
+ XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
{
__m128d tmp0 = _mm_hadd_pd(self, self);
return _mm_cvtsd_f64(tmp0);
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse4_1.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse4_1.hpp
index 165a191e426..7fce2c31472 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse4_1.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse4_1.hpp
@@ -24,18 +24,18 @@ namespace xsimd
using namespace types;
// any
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
{
return !_mm_testz_si128(self, self);
}
// ceil
template <class A>
- inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_ceil_ps(self);
}
template <class A>
- inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_ceil_pd(self);
}
@@ -44,7 +44,7 @@ namespace xsimd
namespace detail
{
template <class A>
- inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
__m128i xH = _mm_srai_epi32(x, 16);
@@ -56,7 +56,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
__m128i xH = _mm_srli_epi64(x, 32);
@@ -69,7 +69,7 @@ namespace xsimd
// eq
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
@@ -83,19 +83,19 @@ namespace xsimd
// floor
template <class A>
- inline batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_floor_ps(self);
}
template <class A>
- inline batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_floor_pd(self);
}
// insert
template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -124,7 +124,7 @@ namespace xsimd
// max
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -168,7 +168,7 @@ namespace xsimd
// min
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -212,7 +212,7 @@ namespace xsimd
// mul
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -247,12 +247,12 @@ namespace xsimd
// nearbyint
template <class A>
- inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
}
template <class A>
- inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
}
@@ -261,32 +261,32 @@ namespace xsimd
namespace detail
{
template <class T>
- inline constexpr T interleave(T const& cond) noexcept
+ XSIMD_INLINE constexpr T interleave(T const& cond) noexcept
{
return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555) | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA);
}
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
{
return _mm_blendv_epi8(false_br, true_br, cond);
}
template <class A>
- inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
{
return _mm_blendv_ps(false_br, true_br, cond);
}
template <class A>
- inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
{
return _mm_blendv_pd(false_br, true_br, cond);
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
{
- constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
+ constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm_blend_epi16(false_br, true_br, mask);
@@ -304,30 +304,30 @@ namespace xsimd
}
else
{
- return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
+ return select(batch_bool_constant<T, A, Values...>(), true_br, false_br, ssse3 {});
}
}
template <class A, bool... Values>
- inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
{
- constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+ constexpr int mask = batch_bool_constant<float, A, Values...>::mask();
return _mm_blend_ps(false_br, true_br, mask);
}
template <class A, bool... Values>
- inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
{
- constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+ constexpr int mask = batch_bool_constant<double, A, Values...>::mask();
return _mm_blend_pd(false_br, true_br, mask);
}
// trunc
template <class A>
- inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
}
template <class A>
- inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+ XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
}
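
The sse4_1 select() overloads above turn a compile-time boolean pack into the immediate operand of _mm_blend_ps / _mm_blend_pd / _mm_blend_epi16 through batch_bool_constant<T, A, Values...>::mask(). The following is a minimal, self-contained sketch of that packing, inferred from the call sites rather than copied from xsimd (an assumption, not the library's code):

#include <cstddef>
#include <cstdio>

// Hypothetical re-statement of mask(): lane i of the constant condition
// becomes bit i of the blend immediate; a set bit selects the true branch,
// matching _mm_blend_ps(false_br, true_br, mask) above.
template <bool... Values>
constexpr int blend_mask() noexcept
{
    constexpr bool lanes[] = { Values... };
    int mask = 0;
    for (std::size_t i = 0; i < sizeof...(Values); ++i)
        mask |= (lanes[i] ? 1 : 0) << i;
    return mask;
}

int main()
{
    // select(batch_bool_constant<float, A, false, true, false, true>{}, t, f)
    // would then lower to _mm_blend_ps(f, t, 0b1010) under this packing.
    static_assert(blend_mask<false, true, false, true>() == 0b1010, "lane i -> bit i");
    std::printf("0x%x\n", blend_mask<false, true, false, true>());
    return 0;
}
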
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse4_2.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse4_2.hpp
index 8f9b7a76e69..5265182f9d2 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse4_2.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_sse4_2.hpp
@@ -25,12 +25,12 @@ namespace xsimd
// lt
template <class A>
- inline batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) noexcept
+ XSIMD_INLINE batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) noexcept
{
return _mm_cmpgt_epi64(other, self);
}
template <class A>
- inline batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) noexcept
+ XSIMD_INLINE batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) noexcept
{
auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_ssse3.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_ssse3.hpp
index b6ea1192136..9424d4ada57 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_ssse3.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_ssse3.hpp
@@ -27,7 +27,7 @@ namespace xsimd
// abs
template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
- inline batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -57,13 +57,13 @@ namespace xsimd
{
template <class T, class A>
- inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
{
return other;
}
template <class T, class A, std::size_t I, std::size_t... Is>
- inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (i == I)
{
@@ -75,7 +75,7 @@ namespace xsimd
}
template <class A, class T, class _ = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
assert(0 <= i && i < size && "index in bounds");
@@ -84,7 +84,7 @@ namespace xsimd
// reduce_add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
@@ -107,30 +107,30 @@ namespace xsimd
// rotate_right
template <size_t N, class A>
- inline batch<uint16_t, A> rotate_right(batch<uint16_t, A> const& self, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> rotate_right(batch<uint16_t, A> const& self, requires_arch<ssse3>) noexcept
{
return _mm_alignr_epi8(self, self, N);
}
template <size_t N, class A>
- inline batch<int16_t, A> rotate_right(batch<int16_t, A> const& self, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE batch<int16_t, A> rotate_right(batch<int16_t, A> const& self, requires_arch<ssse3>) noexcept
{
return bitwise_cast<int16_t>(rotate_right<N, A>(bitwise_cast<uint16_t>(self), ssse3 {}));
}
// swizzle (dynamic mask)
template <class A>
- inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<ssse3>) noexcept
{
return _mm_shuffle_epi8(self, mask);
}
template <class A>
- inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<ssse3>) noexcept
{
return _mm_shuffle_epi8(self, mask);
}
template <class A, class T, class IT>
- inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+ XSIMD_INLINE typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
swizzle(batch<T, A> const& self, batch<IT, A> mask, requires_arch<ssse3>) noexcept
{
constexpr auto pikes = static_cast<as_unsigned_integer_t<T>>(0x0706050403020100ul);
@@ -140,32 +140,32 @@ namespace xsimd
// swizzle (constant mask)
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
- inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
{
- constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
+ constexpr batch_constant<uint8_t, A, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
mask8;
- return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask8);
+ return _mm_shuffle_epi8(self, mask8.as_batch());
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
- inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {}));
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
- inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
{
- return swizzle(self, (batch<uint8_t, A>)mask, ssse3 {});
+ return swizzle(self, mask.as_batch(), ssse3 {});
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
- inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
+ XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
{
- return swizzle(self, (batch<uint8_t, A>)mask, ssse3 {});
+ return swizzle(self, mask.as_batch(), ssse3 {});
}
}
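
Besides the XSIMD_INLINE substitution, the ssse3 hunks above (and the sve forward declaration below) show the second recurring change: batch_constant is now parameterized on the value type and architecture directly, and constant masks are materialized with .as_batch() instead of a C-style cast to batch<T, A>. The toy sketch below only mirrors the shape of that interface change; the stand-in types and the placeholder body are assumptions, not xsimd's definitions:

#include <cstdint>

// Toy stand-ins, only to show the parameterization change.
template <class T, class A>
struct batch { /* SIMD register wrapper */ };
struct ssse3 {};

// Post-patch spelling: value type T and architecture A are separate template
// parameters (pre-patch the first parameter was the batch type itself,
// i.e. batch_constant<batch<T, A>, Values...>).
template <class T, class A, T... Values>
struct batch_constant
{
    // Materializes the compile-time constant as a runtime batch; the patch
    // replaces casts like (batch<uint8_t, A>)mask with this call.
    constexpr batch<T, A> as_batch() const noexcept { return {}; } // placeholder body
};

// A byte-swizzle mask written against the new interface, mirroring the
// ssse3 swizzle overloads above:
using even_odd_mask = batch_constant<uint8_t, ssse3,
                                     0, 2, 4, 6, 8, 10, 12, 14,
                                     1, 3, 5, 7, 9, 11, 13, 15>;
// usage sketch: auto y = swizzle(x, even_odd_mask{}.as_batch(), ssse3{});

int main()
{
    (void)even_odd_mask{}.as_batch();
    return 0;
}
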
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_sve.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_sve.hpp
index 3177c97b28b..1586b8e0bed 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_sve.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_sve.hpp
@@ -20,7 +20,7 @@
namespace xsimd
{
- template <class batch_type, typename batch_type::value_type... Values>
+ template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@@ -31,22 +31,22 @@ namespace xsimd
using xsimd::types::detail::sve_vector_type;
// predicate creation
- inline svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); }
- inline svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); }
- inline svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); }
- inline svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); }
+ XSIMD_INLINE svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); }
+ XSIMD_INLINE svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); }
+ XSIMD_INLINE svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); }
+ XSIMD_INLINE svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); }
template <class T>
svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index<sizeof(T)> {}); }
// count active lanes in a predicate
- inline uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); }
- inline uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); }
- inline uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); }
- inline uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); }
+ XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); }
+ XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); }
+ XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); }
+ XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); }
template <class T>
- inline uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index<sizeof(T)> {}); }
+ XSIMD_INLINE uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index<sizeof(T)> {}); }
// enable for signed integers
template <class T>
@@ -84,20 +84,20 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<sve>) noexcept
{
return svld1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T> const*>(src));
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<sve>) noexcept
{
return load_aligned<A>(src, convert<T>(), sve {});
}
// load_complex
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline batch<std::complex<T>, A> load_complex_aligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> load_complex_aligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
{
const T* buf = reinterpret_cast<const T*>(mem);
const auto tmp = svld2(detail::sve_ptrue<T>(), buf);
@@ -107,7 +107,7 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline batch<std::complex<T>, A> load_complex_unaligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> load_complex_unaligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
{
return load_complex_aligned<A>(mem, convert<std::complex<T>> {}, sve {});
}
@@ -117,20 +117,20 @@ namespace xsimd
*********/
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
+ XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
{
svst1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T>*>(dst), src);
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
+ XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
{
store_aligned<A>(dst, src, sve {});
}
// store_complex
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
+ XSIMD_INLINE void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
{
using v2type = typename std::conditional<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>::type;
v2type tmp {};
@@ -141,7 +141,7 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
+ XSIMD_INLINE void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
{
store_complex_aligned(dst, src, sve {});
}
@@ -158,14 +158,14 @@ namespace xsimd
// scatter
template <class A, class T, class U, detail::sve_enable_sg_t<T, U> = 0>
- inline void scatter(batch<T, A> const& src, T* dst, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
+ XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
{
svst1_scatter_index(detail::sve_ptrue<T>(), dst, index.data, src.data);
}
// gather
template <class A, class T, class U, detail::sve_enable_sg_t<T, U> = 0>
- inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
{
return svld1_gather_index(detail::sve_ptrue<T>(), src, index.data);
}
@@ -176,67 +176,67 @@ namespace xsimd
// broadcast
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
{
return svdup_n_u8(uint8_t(arg));
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
- inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
{
return svdup_n_s8(int8_t(arg));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
{
return svdup_n_u16(uint16_t(arg));
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
- inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
{
return svdup_n_s16(int16_t(arg));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
{
return svdup_n_u32(uint32_t(arg));
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
- inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
{
return svdup_n_s32(int32_t(arg));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
{
return svdup_n_u64(uint64_t(arg));
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
- inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
{
return svdup_n_s64(int64_t(arg));
}
template <class A>
- inline batch<float, A> broadcast(float arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<float, A> broadcast(float arg, requires_arch<sve>) noexcept
{
return svdup_n_f32(arg);
}
template <class A>
- inline batch<double, A> broadcast(double arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<double, A> broadcast(double arg, requires_arch<sve>) noexcept
{
return svdup_n_f64(arg);
}
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline batch<T, A> broadcast(T val, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<sve>) noexcept
{
return broadcast<sve>(val, sve {});
}
@@ -247,128 +247,128 @@ namespace xsimd
// add
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svadd_x(detail::sve_ptrue<T>(), lhs, rhs);
}
// sadd
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svqadd(lhs, rhs);
}
// sub
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svsub_x(detail::sve_ptrue<T>(), lhs, rhs);
}
// ssub
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svqsub(lhs, rhs);
}
// mul
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svmul_x(detail::sve_ptrue<T>(), lhs, rhs);
}
// div
template <class A, class T, typename std::enable_if<sizeof(T) >= 4, int>::type = 0>
- inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svdiv_x(detail::sve_ptrue<T>(), lhs, rhs);
}
// max
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svmax_x(detail::sve_ptrue<T>(), lhs, rhs);
}
// min
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svmin_x(detail::sve_ptrue<T>(), lhs, rhs);
}
// neg
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
- inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svreinterpret_u8(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s8(arg)));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
- inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svreinterpret_u16(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s16(arg)));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
- inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svreinterpret_u32(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s32(arg)));
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
- inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svreinterpret_u64(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s64(arg)));
}
template <class A, class T, detail::sve_enable_signed_int_or_floating_point_t<T> = 0>
- inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svneg_x(detail::sve_ptrue<T>(), arg);
}
// abs
template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
- inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return arg;
}
template <class A, class T, detail::sve_enable_signed_int_or_floating_point_t<T> = 0>
- inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svabs_x(detail::sve_ptrue<T>(), arg);
}
// fma: x * y + z
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
{
return svmad_x(detail::sve_ptrue<T>(), x, y, z);
}
// fnma: z - x * y
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
{
return svmsb_x(detail::sve_ptrue<T>(), x, y, z);
}
// fms: x * y - z
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
{
return -fnma(x, y, z, sve {});
}
// fnms: - x * y - z
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
{
return -fma(x, y, z, sve {});
}
@@ -379,13 +379,13 @@ namespace xsimd
// bitwise_and
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svand_x(detail::sve_ptrue<T>(), lhs, rhs);
}
template <class A>
- inline batch<float, A> bitwise_and(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
{
const auto lhs_bits = svreinterpret_u32(lhs);
const auto rhs_bits = svreinterpret_u32(rhs);
@@ -394,7 +394,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
{
const auto lhs_bits = svreinterpret_u64(lhs);
const auto rhs_bits = svreinterpret_u64(rhs);
@@ -403,20 +403,20 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svand_z(detail::sve_ptrue<T>(), lhs, rhs);
}
// bitwise_andnot
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svbic_x(detail::sve_ptrue<T>(), lhs, rhs);
}
template <class A>
- inline batch<float, A> bitwise_andnot(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
{
const auto lhs_bits = svreinterpret_u32(lhs);
const auto rhs_bits = svreinterpret_u32(rhs);
@@ -425,7 +425,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
{
const auto lhs_bits = svreinterpret_u64(lhs);
const auto rhs_bits = svreinterpret_u64(rhs);
@@ -434,20 +434,20 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svbic_z(detail::sve_ptrue<T>(), lhs, rhs);
}
// bitwise_or
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svorr_x(detail::sve_ptrue<T>(), lhs, rhs);
}
template <class A>
- inline batch<float, A> bitwise_or(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
{
const auto lhs_bits = svreinterpret_u32(lhs);
const auto rhs_bits = svreinterpret_u32(rhs);
@@ -456,7 +456,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
{
const auto lhs_bits = svreinterpret_u64(lhs);
const auto rhs_bits = svreinterpret_u64(rhs);
@@ -465,20 +465,20 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svorr_z(detail::sve_ptrue<T>(), lhs, rhs);
}
// bitwise_xor
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return sveor_x(detail::sve_ptrue<T>(), lhs, rhs);
}
template <class A>
- inline batch<float, A> bitwise_xor(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
{
const auto lhs_bits = svreinterpret_u32(lhs);
const auto rhs_bits = svreinterpret_u32(rhs);
@@ -487,7 +487,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
{
const auto lhs_bits = svreinterpret_u64(lhs);
const auto rhs_bits = svreinterpret_u64(rhs);
@@ -496,20 +496,20 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
{
return sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
}
// bitwise_not
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svnot_x(detail::sve_ptrue<T>(), arg);
}
template <class A>
- inline batch<float, A> bitwise_not(batch<float, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& arg, requires_arch<sve>) noexcept
{
const auto arg_bits = svreinterpret_u32(arg);
const auto result_bits = svnot_x(detail::sve_ptrue<float>(), arg_bits);
@@ -517,7 +517,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> bitwise_not(batch<double, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& arg, requires_arch<sve>) noexcept
{
const auto arg_bits = svreinterpret_u64(arg);
const auto result_bits = svnot_x(detail::sve_ptrue<double>(), arg_bits);
@@ -525,7 +525,7 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
{
return svnot_z(detail::sve_ptrue<T>(), arg);
}
@@ -537,31 +537,31 @@ namespace xsimd
namespace detail
{
template <class A, class T, class U>
- inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<1>) noexcept
+ XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<1>) noexcept
{
return svreinterpret_u8(arg);
}
template <class A, class T, class U>
- inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<2>) noexcept
+ XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<2>) noexcept
{
return svreinterpret_u16(arg);
}
template <class A, class T, class U>
- inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<4>) noexcept
+ XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<4>) noexcept
{
return svreinterpret_u32(arg);
}
template <class A, class T, class U>
- inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<8>) noexcept
+ XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<8>) noexcept
{
return svreinterpret_u64(arg);
}
template <class A, class T, class U = as_unsigned_integer_t<T>>
- inline batch<U, A> sve_to_unsigned_batch(batch<T, A> const& arg) noexcept
+ XSIMD_INLINE batch<U, A> sve_to_unsigned_batch(batch<T, A> const& arg) noexcept
{
return sve_to_unsigned_batch_impl<A, T, U>(arg, index<sizeof(T)> {});
}
@@ -569,7 +569,7 @@ namespace xsimd
// bitwise_lshift
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
{
constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds");
@@ -577,14 +577,14 @@ namespace xsimd
}
template <class A, class T, detail::enable_integral_t<T> = 0>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svlsl_x(detail::sve_ptrue<T>(), lhs, detail::sve_to_unsigned_batch<A, T>(rhs));
}
// bitwise_rshift
template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
{
constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds");
@@ -592,13 +592,13 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svlsr_x(detail::sve_ptrue<T>(), lhs, rhs);
}
template <class A, class T, detail::sve_enable_signed_int_t<T> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
{
constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds");
@@ -606,7 +606,7 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_signed_int_t<T> = 0>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svasr_x(detail::sve_ptrue<T>(), lhs, detail::sve_to_unsigned_batch<A, T>(rhs));
}
@@ -617,7 +617,7 @@ namespace xsimd
// reduce_add
template <class A, class T, class V = typename batch<T, A>::value_type, detail::sve_enable_all_t<T> = 0>
- inline V reduce_add(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE V reduce_add(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
// sve integer reduction results are promoted to 64 bits
return static_cast<V>(svaddv(detail::sve_ptrue<T>(), arg));
@@ -625,21 +625,21 @@ namespace xsimd
// reduce_max
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline T reduce_max(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE T reduce_max(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svmaxv(detail::sve_ptrue<T>(), arg);
}
// reduce_min
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline T reduce_min(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE T reduce_min(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svminv(detail::sve_ptrue<T>(), arg);
}
// haddp
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline batch<T, A> haddp(const batch<T, A>* row, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> haddp(const batch<T, A>* row, requires_arch<sve>) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
T sums[size];
@@ -656,13 +656,13 @@ namespace xsimd
// eq
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svcmpeq(detail::sve_ptrue<T>(), lhs, rhs);
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
{
const auto neq_result = sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
return svnot_z(detail::sve_ptrue<T>(), neq_result);
@@ -670,41 +670,41 @@ namespace xsimd
// neq
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> neq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svcmpne(detail::sve_ptrue<T>(), lhs, rhs);
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
{
return sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
}
// lt
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svcmplt(detail::sve_ptrue<T>(), lhs, rhs);
}
// le
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svcmple(detail::sve_ptrue<T>(), lhs, rhs);
}
// gt
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svcmpgt(detail::sve_ptrue<T>(), lhs, rhs);
}
// ge
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svcmpge(detail::sve_ptrue<T>(), lhs, rhs);
}
@@ -715,22 +715,22 @@ namespace xsimd
// rotate_right
template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> rotate_right(batch<T, A> const& a, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& a, requires_arch<sve>) noexcept
{
return svext(a, a, N);
}
// swizzle (dynamic)
template <class A, class T, class I>
- inline batch<T, A> swizzle(batch<T, A> const& arg, batch<I, A> indices, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& arg, batch<I, A> indices, requires_arch<sve>) noexcept
{
return svtbl(arg, indices);
}
template <class A, class T, class I>
- inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self,
- batch<I, A> indices,
- requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self,
+ batch<I, A> indices,
+ requires_arch<sve>) noexcept
{
const auto real = swizzle(self.real(), indices, sve {});
const auto imag = swizzle(self.imag(), indices, sve {});
@@ -739,19 +739,19 @@ namespace xsimd
// swizzle (static)
template <class A, class T, class I, I... idx>
- inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<batch<I, A>, idx...> indices, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<I, A, idx...> indices, requires_arch<sve>) noexcept
{
static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices");
- return swizzle(arg, (batch<I, A>)indices, sve {});
+ return swizzle(arg, indices.as_batch(), sve {});
}
template <class A, class T, class I, I... idx>
- inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& arg,
- batch_constant<batch<I, A>, idx...> indices,
- requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& arg,
+ batch_constant<I, A, idx...> indices,
+ requires_arch<sve>) noexcept
{
static_assert(batch<std::complex<T>, A>::size == sizeof...(idx), "invalid swizzle indices");
- return swizzle(arg, (batch<I, A>)indices, sve {});
+ return swizzle(arg, indices.as_batch(), sve {});
}
/*************
@@ -762,14 +762,14 @@ namespace xsimd
namespace detail
{
template <class A, class T>
- inline batch<T, A> sve_extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
+ XSIMD_INLINE batch<T, A> sve_extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
{
assert(false && "extract_pair out of bounds");
return batch<T, A> {};
}
template <class A, class T, size_t I, size_t... Is>
- inline batch<T, A> sve_extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> sve_extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (n == I)
{
@@ -782,7 +782,7 @@ namespace xsimd
}
template <class A, class T, size_t... Is>
- inline batch<T, A> sve_extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
+ XSIMD_INLINE batch<T, A> sve_extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
{
if (n == 0)
{
@@ -796,7 +796,7 @@ namespace xsimd
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<sve>) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
assert(n < size && "index in bounds");
@@ -805,27 +805,27 @@ namespace xsimd
// select
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<sve>) noexcept
{
return svsel(cond, a, b);
}
template <class A, class T, bool... b>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept
{
return select(batch_bool<T, A> { b... }, true_br, false_br, sve {});
}
// zip_lo
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svzip1(lhs, rhs);
}
// zip_hi
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
{
return svzip2(lhs, rhs);
}
@@ -836,21 +836,21 @@ namespace xsimd
// rsqrt
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svrsqrte(arg);
}
// sqrt
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svsqrt_x(detail::sve_ptrue<T>(), arg);
}
// reciprocal
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<sve>) noexcept
{
return svrecpe(arg);
}
@@ -863,37 +863,37 @@ namespace xsimd
namespace detail
{
template <class A, class T, detail::enable_sized_integral_t<T, 4> = 0>
- inline batch<float, A> fast_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<float, A> fast_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
{
return svcvt_f32_x(detail::sve_ptrue<T>(), arg);
}
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
- inline batch<double, A> fast_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
{
return svcvt_f64_x(detail::sve_ptrue<T>(), arg);
}
template <class A>
- inline batch<int32_t, A> fast_cast(batch<float, A> const& arg, batch<int32_t, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& arg, batch<int32_t, A> const&, requires_arch<sve>) noexcept
{
return svcvt_s32_x(detail::sve_ptrue<float>(), arg);
}
template <class A>
- inline batch<uint32_t, A> fast_cast(batch<float, A> const& arg, batch<uint32_t, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& arg, batch<uint32_t, A> const&, requires_arch<sve>) noexcept
{
return svcvt_u32_x(detail::sve_ptrue<float>(), arg);
}
template <class A>
- inline batch<int64_t, A> fast_cast(batch<double, A> const& arg, batch<int64_t, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& arg, batch<int64_t, A> const&, requires_arch<sve>) noexcept
{
return svcvt_s64_x(detail::sve_ptrue<double>(), arg);
}
template <class A>
- inline batch<uint64_t, A> fast_cast(batch<double, A> const& arg, batch<uint64_t, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<double, A> const& arg, batch<uint64_t, A> const&, requires_arch<sve>) noexcept
{
return svcvt_u64_x(detail::sve_ptrue<double>(), arg);
}
@@ -905,21 +905,21 @@ namespace xsimd
// set
template <class A, class T, class... Args>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<sve>, Args... args) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sve>, Args... args) noexcept
{
return detail::sve_vector_type<T> { args... };
}
template <class A, class T, class... Args>
- inline batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<sve>,
- Args... args_complex) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<sve>,
+ Args... args_complex) noexcept
{
return batch<std::complex<T>>(detail::sve_vector_type<T> { args_complex.real()... },
detail::sve_vector_type<T> { args_complex.imag()... });
}
template <class A, class T, class... Args>
- inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sve>, Args... args) noexcept
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sve>, Args... args) noexcept
{
using U = as_unsigned_integer_t<T>;
const auto values = detail::sve_vector_type<U> { static_cast<U>(args)... };
@@ -931,17 +931,17 @@ namespace xsimd
namespace detail
{
// generate index sequence (iota)
- inline svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); }
- inline svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); }
- inline svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); }
- inline svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); }
+ XSIMD_INLINE svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); }
+ XSIMD_INLINE svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); }
+ XSIMD_INLINE svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); }
+ XSIMD_INLINE svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); }
template <class T, class V = sve_vector_type<as_unsigned_integer_t<T>>>
- inline V sve_iota() noexcept { return sve_iota_impl(index<sizeof(T)> {}); }
+ XSIMD_INLINE V sve_iota() noexcept { return sve_iota_impl(index<sizeof(T)> {}); }
} // namespace detail
template <class A, class T, size_t I, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<sve>) noexcept
{
// create a predicate with only the I-th lane activated
const auto iota = detail::sve_iota<T>();
@@ -951,89 +951,89 @@ namespace xsimd
// all
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
{
return detail::sve_pcount<T>(arg) == batch_bool<T, A>::size;
}
// any
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline bool any(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
{
return svptest_any(arg, arg);
}
// bitwise_cast
template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 1> = 0>
- inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
{
return svreinterpret_u8(arg);
}
template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 1> = 0>
- inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
{
return svreinterpret_s8(arg);
}
template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 2> = 0>
- inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
{
return svreinterpret_u16(arg);
}
template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 2> = 0>
- inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
{
return svreinterpret_s16(arg);
}
template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 4> = 0>
- inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
{
return svreinterpret_u32(arg);
}
template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 4> = 0>
- inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
{
return svreinterpret_s32(arg);
}
template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 8> = 0>
- inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
{
return svreinterpret_u64(arg);
}
template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 8> = 0>
- inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
{
return svreinterpret_s64(arg);
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<float, A> bitwise_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
{
return svreinterpret_f32(arg);
}
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
{
return svreinterpret_f64(arg);
}
// batch_bool_cast
template <class A, class T_out, class T_in, detail::sve_enable_all_t<T_in> = 0>
- inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<sve>) noexcept
{
return arg.data;
}
// from_bool
template <class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
{
return select(arg, batch<T, A>(1), batch<T, A>(0));
}
@@ -1045,7 +1045,7 @@ namespace xsimd
struct sve_slider_left
{
template <class A, class T>
- inline batch<T, A> operator()(batch<T, A> const& arg) noexcept
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
{
using u8_vector = batch<uint8_t, A>;
const auto left = svdup_n_u8(0);
@@ -1059,7 +1059,7 @@ namespace xsimd
struct sve_slider_left<0>
{
template <class A, class T>
- inline batch<T, A> operator()(batch<T, A> const& arg) noexcept
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
{
return arg;
}
@@ -1067,7 +1067,7 @@ namespace xsimd
} // namespace detail
template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return detail::sve_slider_left<N>()(arg);
}
@@ -1079,7 +1079,7 @@ namespace xsimd
struct sve_slider_right
{
template <class A, class T>
- inline batch<T, A> operator()(batch<T, A> const& arg) noexcept
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
{
using u8_vector = batch<uint8_t, A>;
const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data;
@@ -1093,7 +1093,7 @@ namespace xsimd
struct sve_slider_right<batch<uint8_t, sve>::size>
{
template <class A, class T>
- inline batch<T, A> operator()(batch<T, A> const&) noexcept
+ XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&) noexcept
{
return batch<T, A> {};
}
@@ -1101,35 +1101,35 @@ namespace xsimd
} // namespace detail
template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
- inline batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return detail::sve_slider_right<N>()(arg);
}
// isnan
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return !(arg == arg);
}
// nearbyint
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<sve>) noexcept
{
return svrintx_x(detail::sve_ptrue<T>(), arg);
}
// nearbyint_as_int
template <class A>
- inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& arg, requires_arch<sve>) noexcept
{
const auto nearest = svrintx_x(detail::sve_ptrue<float>(), arg);
return svcvt_s32_x(detail::sve_ptrue<float>(), nearest);
}
template <class A>
- inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& arg, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& arg, requires_arch<sve>) noexcept
{
const auto nearest = svrintx_x(detail::sve_ptrue<double>(), arg);
return svcvt_s64_x(detail::sve_ptrue<double>(), nearest);
@@ -1137,7 +1137,7 @@ namespace xsimd
// ldexp
template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
- inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& exp, requires_arch<sve>) noexcept
+ XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& exp, requires_arch<sve>) noexcept
{
return svscale_x(detail::sve_ptrue<T>(), x, exp);
}
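
Aside (not part of the patch): the two mechanical changes repeated throughout the hunks above and below are (1) replacing the plain `inline` specifier with the XSIMD_INLINE macro and (2) reworking batch_constant / batch_bool_constant so the batch-typed first template parameter becomes separate value-type and architecture parameters, with call sites moving from an implicit cast to an explicit as_batch(). A minimal stand-alone sketch of both patterns follows; the names are hypothetical and the macro body is an assumption (not shown in this diff), expected to resolve to a force-inline attribute where the compiler offers one and to plain inline otherwise:

// Stand-alone illustration only; none of these names exist in the patch.
// (1) analogue of the XSIMD_INLINE macro introduced in these hunks
#if defined(__GNUC__) || defined(__clang__)
#define DEMO_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
#define DEMO_INLINE __forceinline
#else
#define DEMO_INLINE inline
#endif

#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

struct demo_arch {};                       // stand-in for an xsimd architecture tag

template <class T, class A, std::size_t N = 4>
struct demo_batch {                        // stand-in for batch<T, A>
    std::array<T, N> data {};
};

// (2) was: template <class batch_type, typename batch_type::value_type... Values>
//     now: element type and arch tag are separate parameters, and the
//     conversion to a runtime batch is spelled explicitly via as_batch().
template <class T, class A, T... Values>
struct demo_batch_constant {
    static constexpr std::size_t size = sizeof...(Values);
    DEMO_INLINE demo_batch<T, A, size> as_batch() const noexcept {
        return demo_batch<T, A, size> { { Values... } };
    }
};

int main() {
    demo_batch_constant<uint32_t, demo_arch, 3, 2, 1, 0> idx;
    auto b = idx.as_batch();               // call sites now use as_batch() instead of a cast
    for (auto v : b.data) std::cout << v << ' ';
    std::cout << '\n';                     // prints: 3 2 1 0
}
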
diff --git a/contrib/python/pythran/pythran/xsimd/arch/xsimd_wasm.hpp b/contrib/python/pythran/pythran/xsimd/arch/xsimd_wasm.hpp
index 8160b2423bb..5316cce35f0 100644
--- a/contrib/python/pythran/pythran/xsimd/arch/xsimd_wasm.hpp
+++ b/contrib/python/pythran/pythran/xsimd/arch/xsimd_wasm.hpp
@@ -19,13 +19,13 @@
namespace xsimd
{
- template <class batch_type, bool... Values>
+ template <typename T, class A, bool... Values>
struct batch_bool_constant;
template <class T_out, class T_in, class A>
- inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
+ XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
- template <class batch_type, typename batch_type::value_type... Values>
+ template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@@ -34,13 +34,15 @@ namespace xsimd
// fwd
template <class A, class T, size_t I>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
template <class A, typename T, typename ITy, ITy... Indices>
- inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
+ XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
+ template <class A, class T>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
// abs
template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
- inline batch<T, A> abs(batch<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -66,20 +68,20 @@ namespace xsimd
}
template <class A>
- inline batch<float, A> abs(batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_f32x4_abs(self);
}
template <class A>
- inline batch<double, A> abs(batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_f64x2_abs(self);
}
// add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -105,107 +107,145 @@ namespace xsimd
}
template <class A>
- inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_add(self, other);
}
template <class A>
- inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_add(self, other);
}
+ // avgr
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return wasm_u8x16_avgr(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return wasm_u16x8_avgr(self, other);
+ }
+ else
+ {
+ return avgr(self, other, generic {});
+ }
+ }
+
+ // avg
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto adj = ((self ^ other) << 7) >> 7;
+ return avgr(self, other, A {}) - adj;
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto adj = ((self ^ other) << 15) >> 15;
+ return avgr(self, other, A {}) - adj;
+ }
+ else
+ {
+ return avg(self, other, generic {});
+ }
+ }
+
// all
template <class A>
- inline bool all(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE bool all(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i32x4_bitmask(self) == 0x0F;
}
template <class A>
- inline bool all(batch_bool<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE bool all(batch_bool<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i64x2_bitmask(self) == 0x03;
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline bool all(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i8x16_bitmask(self) == 0xFFFF;
}
// any
template <class A>
- inline bool any(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE bool any(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i32x4_bitmask(self) != 0;
}
template <class A>
- inline bool any(batch_bool<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE bool any(batch_bool<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i64x2_bitmask(self) != 0;
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline bool any(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i8x16_bitmask(self) != 0;
}
// batch_bool_cast
template <class A, class T_out, class T_in>
- inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<wasm>) noexcept
{
return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
}
// bitwise_and
template <class A, class T>
- inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_v128_and(self, other);
}
template <class A, class T>
- inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_v128_and(self, other);
}
// bitwise_andnot
template <class A, class T>
- inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_v128_andnot(self, other);
}
template <class A, class T>
- inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_v128_andnot(self, other);
}
// bitwise_cast
template <class A, class T, class Tp>
- inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<wasm>) noexcept
{
return batch<Tp, A>(self.data);
}
// bitwise_or
template <class A, class T>
- inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_v128_or(self, other);
}
template <class A, class T>
- inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_v128_or(self, other);
}
// bitwise_lshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -232,7 +272,7 @@ namespace xsimd
// bitwise_rshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<wasm>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -286,38 +326,38 @@ namespace xsimd
// bitwise_not
template <class A, class T>
- inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_not(self);
}
template <class A, class T>
- inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_not(self);
}
// bitwise_xor
template <class A, class T>
- inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_v128_xor(self, other);
}
template <class A, class T>
- inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_v128_xor(self, other);
}
// broadcast
template <class A>
- batch<float, A> inline broadcast(float val, requires_arch<wasm>) noexcept
+ batch<float, A> XSIMD_INLINE broadcast(float val, requires_arch<wasm>) noexcept
{
return wasm_f32x4_splat(val);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> broadcast(T val, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -342,48 +382,48 @@ namespace xsimd
}
}
template <class A>
- inline batch<double, A> broadcast(double val, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<wasm>) noexcept
{
return wasm_f64x2_splat(val);
}
// ceil
template <class A>
- inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_f32x4_ceil(self);
}
template <class A>
- inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_f64x2_ceil(self);
}
// div
template <class A>
- inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_div(self, other);
}
template <class A>
- inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_div(self, other);
}
// eq
template <class A>
- inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_eq(self, other);
}
template <class A>
- inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_i32x4_eq(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -408,7 +448,7 @@ namespace xsimd
}
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -433,12 +473,12 @@ namespace xsimd
}
}
template <class A>
- inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_eq(self, other);
}
template <class A>
- inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_i64x2_eq(self, other);
}
@@ -447,13 +487,13 @@ namespace xsimd
namespace detail
{
template <class A>
- inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<wasm>) noexcept
{
return wasm_f32x4_convert_i32x4(self);
}
template <class A>
- inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
// adapted to wasm
@@ -466,7 +506,7 @@ namespace xsimd
}
template <class A>
- inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
// adapted to wasm
@@ -480,7 +520,7 @@ namespace xsimd
}
template <class A>
- inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<wasm>) noexcept
{
return wasm_i32x4_make(
static_cast<int32_t>(wasm_f32x4_extract_lane(self, 0)),
@@ -492,20 +532,20 @@ namespace xsimd
// floor
template <class A>
- inline batch<float, A> floor(batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_f32x4_floor(self);
}
template <class A>
- inline batch<double, A> floor(batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_f64x2_floor(self);
}
// from_mask
template <class A>
- inline batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<wasm>) noexcept
{
alignas(A::alignment()) static const uint32_t lut[][4] = {
{ 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
@@ -529,7 +569,7 @@ namespace xsimd
return wasm_v128_load((const v128_t*)lut[mask]);
}
template <class A>
- inline batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<wasm>) noexcept
{
alignas(A::alignment()) static const uint64_t lut[][4] = {
{ 0x0000000000000000ul, 0x0000000000000000ul },
@@ -541,7 +581,7 @@ namespace xsimd
return wasm_v128_load((const v128_t*)lut[mask]);
}
template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<wasm>) noexcept
{
alignas(A::alignment()) static const uint64_t lut64[] = {
0x0000000000000000,
@@ -627,24 +667,24 @@ namespace xsimd
// ge
template <class A>
- inline batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_ge(self, other);
}
template <class A>
- inline batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_ge(self, other);
}
// gt
template <class A>
- inline batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_gt(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -692,14 +732,14 @@ namespace xsimd
}
template <class A>
- inline batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_gt(self, other);
}
// haddp
template <class A>
- inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<wasm>) noexcept
{
v128_t tmp0 = wasm_i32x4_shuffle(row[0], row[1], 0, 4, 1, 5);
v128_t tmp1 = wasm_i32x4_shuffle(row[0], row[1], 2, 6, 3, 7);
@@ -712,7 +752,7 @@ namespace xsimd
return wasm_f32x4_add(tmp0, tmp2);
}
template <class A>
- inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<wasm>) noexcept
{
return wasm_f64x2_add(wasm_i64x2_shuffle(row[0], row[1], 0, 2),
wasm_i64x2_shuffle(row[0], row[1], 1, 3));
@@ -720,12 +760,12 @@ namespace xsimd
// insert
template <class A, size_t I>
- inline batch<float, A> insert(batch<float, A> const& self, float val, index<I> pos, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I> pos, requires_arch<wasm>) noexcept
{
return wasm_f32x4_replace_lane(self, pos, val);
}
template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<wasm>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -778,48 +818,48 @@ namespace xsimd
}
template <class A, size_t I>
- inline batch<double, A> insert(batch<double, A> const& self, double val, index<I> pos, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> insert(batch<double, A> const& self, double val, index<I> pos, requires_arch<wasm>) noexcept
{
return wasm_f64x2_replace_lane(self, pos, val);
}
// isnan
template <class A>
- inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_or(wasm_f32x4_ne(self, self), wasm_f32x4_ne(self, self));
}
template <class A>
- inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_or(wasm_f64x2_ne(self, self), wasm_f64x2_ne(self, self));
}
// le
template <class A>
- inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_le(self, other);
}
template <class A>
- inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_le(self, other);
}
// load_aligned
template <class A>
- inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<wasm>) noexcept
{
return wasm_v128_load(mem);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<wasm>) noexcept
{
return wasm_v128_load((v128_t const*)mem);
}
template <class A>
- inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<wasm>) noexcept
{
return wasm_v128_load(mem);
}
@@ -828,12 +868,12 @@ namespace xsimd
namespace detail
{
template <class A>
- inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<wasm>) noexcept
{
return { wasm_i32x4_shuffle(hi, lo, 0, 2, 4, 6), wasm_i32x4_shuffle(hi, lo, 1, 3, 5, 7) };
}
template <class A>
- inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<wasm>) noexcept
{
return { wasm_i64x2_shuffle(hi, lo, 0, 2), wasm_i64x2_shuffle(hi, lo, 1, 3) };
}
@@ -841,29 +881,29 @@ namespace xsimd
// load_unaligned
template <class A>
- inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<wasm>) noexcept
{
return wasm_v128_load(mem);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<wasm>) noexcept
{
return wasm_v128_load((v128_t const*)mem);
}
template <class A>
- inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<wasm>) noexcept
{
return wasm_v128_load(mem);
}
// lt
template <class A>
- inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_lt(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -924,14 +964,14 @@ namespace xsimd
}
template <class A>
- inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_lt(self, other);
}
// mask
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -956,66 +996,66 @@ namespace xsimd
}
}
template <class A>
- inline uint64_t mask(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i32x4_bitmask(self);
}
template <class A>
- inline uint64_t mask(batch_bool<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE uint64_t mask(batch_bool<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i64x2_bitmask(self);
}
// max
template <class A>
- inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_pmax(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
return select(self > other, self, other);
}
template <class A>
- inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_pmax(self, other);
}
// min
template <class A>
- inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_pmin(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
return select(self <= other, self, other);
}
template <class A>
- inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_pmin(self, other);
}
// mul
template <class A>
- inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_mul(self, other);
}
template <class A>
- inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_mul(self, other);
}
// neg
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> neg(batch<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -1041,59 +1081,59 @@ namespace xsimd
}
template <class A>
- inline batch<float, A> neg(batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> neg(batch<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_f32x4_neg(self);
}
template <class A>
- inline batch<double, A> neg(batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> neg(batch<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_f64x2_neg(self);
}
// neq
template <class A>
- inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_ne(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
return ~(self == other);
}
template <class A>
- inline batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_ne(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
{
return ~(self == other);
}
template <class A>
- inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_ne(self, other);
}
template <class A>
- inline batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_ne(self, other);
}
// reciprocal
template <class A>
- inline batch<float, A> reciprocal(batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self, requires_arch<wasm>) noexcept
{
v128_t one = wasm_f32x4_splat(1.0f);
return wasm_f32x4_div(one, self);
}
template <class A>
- inline batch<double, A> reciprocal(batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> reciprocal(batch<double, A> const& self, requires_arch<wasm>) noexcept
{
v128_t one = wasm_f64x2_splat(1.0);
return wasm_f64x2_div(one, self);
@@ -1101,7 +1141,7 @@ namespace xsimd
// reduce_add
template <class A>
- inline float reduce_add(batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE float reduce_add(batch<float, A> const& self, requires_arch<wasm>) noexcept
{
v128_t tmp0 = wasm_f32x4_add(self, wasm_i32x4_shuffle(self, self, 6, 7, 2, 3));
v128_t tmp1 = wasm_i32x4_shuffle(tmp0, tmp0, 1, 0, 4, 4);
@@ -1110,7 +1150,7 @@ namespace xsimd
return wasm_f32x4_extract_lane(tmp3, 0);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline T reduce_add(batch<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
@@ -1132,7 +1172,7 @@ namespace xsimd
}
}
template <class A>
- inline double reduce_add(batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<wasm>) noexcept
{
v128_t tmp0 = wasm_i64x2_shuffle(self, self, 1, 3);
v128_t tmp1 = wasm_f64x2_add(self, tmp0);
@@ -1142,13 +1182,13 @@ namespace xsimd
// rsqrt
template <class A>
- inline batch<float, A> rsqrt(batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& self, requires_arch<wasm>) noexcept
{
v128_t one = wasm_f32x4_splat(1.0f);
return wasm_f32x4_div(one, wasm_f32x4_sqrt(self));
}
template <class A>
- inline batch<double, A> rsqrt(batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& self, requires_arch<wasm>) noexcept
{
v128_t one = wasm_f64x2_splat(1.0);
return wasm_f64x2_div(one, wasm_f64x2_sqrt(self));
@@ -1156,7 +1196,7 @@ namespace xsimd
// slide_left
template <size_t N, class A, class T>
- inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<wasm>) noexcept
{
return wasm_i8x16_shuffle(
wasm_i64x2_const(0, 0), x, ((N) & 0xF0) ? 0 : 16 - ((N) & 0xF),
@@ -1172,7 +1212,7 @@ namespace xsimd
// slide_right
template <size_t N, class A, class T>
- inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<wasm>) noexcept
{
return wasm_i8x16_shuffle(
x, wasm_i64x2_const(0, 0), ((N) & 0xF0) ? 16 : ((N) & 0xF) + 0,
@@ -1188,7 +1228,7 @@ namespace xsimd
// sadd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -1224,94 +1264,94 @@ namespace xsimd
// select
template <class A>
- inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<wasm>) noexcept
{
return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept
{
return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, wasm {});
}
template <class A>
- inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<wasm>) noexcept
{
return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
}
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
- inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3>, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3);
}
template <class A, class ITy, ITy I0, ITy I1>
- inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1>, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(x, y, I0, I1);
}
// set
template <class A, class... Values>
- inline batch<float, A> set(batch<float, A> const&, requires_arch<wasm>, Values... values) noexcept
+ XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<wasm>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
return wasm_f32x4_make(values...);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1) noexcept
{
return wasm_i64x2_make(v0, v1);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1, T v2, T v3) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1, T v2, T v3) noexcept
{
return wasm_i32x4_make(v0, v1, v2, v3);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
{
return wasm_i16x8_make(v0, v1, v2, v3, v4, v5, v6, v7);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
{
return wasm_i8x16_make(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
}
template <class A, class... Values>
- inline batch<double, A> set(batch<double, A> const&, requires_arch<wasm>, Values... values) noexcept
+ XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<wasm>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
return wasm_f64x2_make(values...);
}
template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<wasm>, Values... values) noexcept
+ XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<wasm>, Values... values) noexcept
{
return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
}
template <class A, class... Values>
- inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<wasm>, Values... values) noexcept
+ XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<wasm>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
return set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data;
}
template <class A, class... Values>
- inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<wasm>, Values... values) noexcept
+ XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<wasm>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
return set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data;
@@ -1319,7 +1359,7 @@ namespace xsimd
// ssub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
if (std::is_signed<T>::value)
{
@@ -1355,22 +1395,22 @@ namespace xsimd
// store_aligned
template <class A>
- inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_store(mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_store((v128_t*)mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_store((v128_t*)mem, self);
}
template <class A>
- inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_store(mem, self);
}
@@ -1380,23 +1420,23 @@ namespace xsimd
{
// complex_low
template <class A>
- inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(self.real(), self.imag(), 0, 4, 1, 5);
}
// complex_high
template <class A>
- inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(self.real(), self.imag(), 2, 6, 3, 7);
}
template <class A>
- inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(self.real(), self.imag(), 0, 2);
}
template <class A>
- inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(self.real(), self.imag(), 1, 3);
}
@@ -1404,34 +1444,34 @@ namespace xsimd
// store_unaligned
template <class A>
- inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_store(mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_store((v128_t*)mem, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_store((v128_t*)mem, self);
}
template <class A>
- inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_v128_store(mem, self);
}
// sub
template <class A>
- inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f32x4_sub(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -1456,106 +1496,106 @@ namespace xsimd
}
}
template <class A>
- inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_f64x2_sub(self, other);
}
// sqrt
template <class A>
- inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<wasm>) noexcept
{
return wasm_f32x4_sqrt(val);
}
template <class A>
- inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<wasm>) noexcept
{
return wasm_f64x2_sqrt(val);
}
// swizzle
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
- inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
}
template <class A, uint64_t V0, uint64_t V1>
- inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(self, self, V0, V1);
}
template <class A, uint64_t V0, uint64_t V1>
- inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(self, self, V0, V1);
}
template <class A, uint64_t V0, uint64_t V1>
- inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, wasm {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
- inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
- inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, wasm {}));
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
- inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<wasm>) noexcept
{
return wasm_i16x8_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7);
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
- inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, wasm {}));
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
- inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch<wasm>) noexcept
{
return wasm_i8x16_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15);
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
- inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, wasm {}));
}
// trunc
template <class A>
- inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_f32x4_trunc(self);
}
template <class A>
- inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<wasm>) noexcept
{
return wasm_f64x2_trunc(self);
}
// zip_hi
template <class A>
- inline batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -1580,19 +1620,19 @@ namespace xsimd
}
}
template <class A>
- inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(self, other, 1, 3);
}
// zip_lo
template <class A>
- inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
- inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
@@ -1617,7 +1657,7 @@ namespace xsimd
}
}
template <class A>
- inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
+ XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(self, other, 0, 2);
}
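
The wasm backend changes above are mechanical: every kernel switches from plain `inline` to the new XSIMD_INLINE macro, and the compile-time mask types drop the wrapping batch, going from batch_constant<batch<T, A>, Vs...> to batch_constant<T, A, Vs...> (likewise for batch_bool_constant). A minimal caller-side sketch under the new spelling, assuming a 4-lane float batch such as the wasm one:

    #include "xsimd/xsimd.hpp"

    template <class A>
    xsimd::batch<float, A> reverse4(xsimd::batch<float, A> const& v)
    {
        // xsimd 13 spelling: scalar type and arch are passed directly
        using mask = xsimd::batch_constant<uint32_t, A, 3, 2, 1, 0>;
        static_assert(xsimd::batch<float, A>::size == 4, "4-lane batches only");
        return xsimd::swizzle(v, mask {});
    }
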
diff --git a/contrib/python/pythran/pythran/xsimd/config/xsimd_arch.hpp b/contrib/python/pythran/pythran/xsimd/config/xsimd_arch.hpp
index fe8c5416692..39d0d581de9 100644
--- a/contrib/python/pythran/pythran/xsimd/config/xsimd_arch.hpp
+++ b/contrib/python/pythran/pythran/xsimd/config/xsimd_arch.hpp
@@ -33,7 +33,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return false; }
static constexpr bool available() noexcept { return false; }
- static constexpr unsigned version() noexcept { return 0; }
static constexpr std::size_t alignment() noexcept { return 0; }
static constexpr bool requires_alignment() noexcept { return false; }
static constexpr char const* name() noexcept { return "<none>"; }
@@ -57,34 +56,14 @@ namespace xsimd
{
};
- template <unsigned... Vals>
- struct is_sorted;
-
- template <>
- struct is_sorted<> : std::true_type
- {
- };
-
- template <unsigned Val>
- struct is_sorted<Val> : std::true_type
- {
- };
-
- template <unsigned V0, unsigned V1, unsigned... Vals>
- struct is_sorted<V0, V1, Vals...>
- : std::conditional<(V0 >= V1), is_sorted<V1, Vals...>,
- std::false_type>::type
- {
- };
-
template <typename T>
- inline constexpr T max_of(T value) noexcept
+ XSIMD_INLINE constexpr T max_of(T value) noexcept
{
return value;
}
template <typename T, typename... Ts>
- inline constexpr T max_of(T head0, T head1, Ts... tail) noexcept
+ XSIMD_INLINE constexpr T max_of(T head0, T head1, Ts... tail) noexcept
{
return max_of((head0 > head1 ? head0 : head1), tail...);
}
@@ -106,15 +85,10 @@ namespace xsimd
} // namespace detail
- // An arch_list is a list of architectures, sorted by version number.
+ // An arch_list is a list of architectures.
template <class... Archs>
struct arch_list
{
-#ifndef NDEBUG
- static_assert(detail::is_sorted<Archs::version()...>::value,
- "architecture list must be sorted by version");
-#endif
-
using best = typename detail::head<Archs...>::type;
template <class Arch>
@@ -130,7 +104,7 @@ namespace xsimd
}
template <class F>
- static inline void for_each(F&& f) noexcept
+ static XSIMD_INLINE void for_each(F&& f) noexcept
{
(void)std::initializer_list<bool> { (f(Archs {}), true)... };
}
@@ -187,9 +161,6 @@ namespace xsimd
};
} // namespace detail
- struct unsupported
- {
- };
using all_x86_architectures = arch_list<
avx512vnni<avx512vbmi>, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512cd, avx512f,
avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>,
@@ -197,7 +168,7 @@ namespace xsimd
using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
using all_rvv_architectures = arch_list<detail::rvv<512>, detail::rvv<256>, detail::rvv<128>>;
- using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<neon64, neon>>::type;
+ using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<i8mm<neon64>, neon64, neon>>::type;
using all_riscv_architectures = all_rvv_architectures;
using all_wasm_architectures = arch_list<wasm>;
using all_architectures = typename detail::join<all_riscv_architectures, all_wasm_architectures, all_arm_architectures, all_x86_architectures>::type;
@@ -221,34 +192,34 @@ namespace xsimd
class dispatcher
{
- const unsigned best_arch_found;
+ const decltype(available_architectures()) availables_archs;
F functor;
template <class Arch, class... Tys>
- inline auto walk_archs(arch_list<Arch>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
+ XSIMD_INLINE auto walk_archs(arch_list<Arch>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
{
assert(Arch::available() && "At least one arch must be supported during dispatch");
return functor(Arch {}, std::forward<Tys>(args)...);
}
template <class Arch, class ArchNext, class... Archs, class... Tys>
- inline auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
+ XSIMD_INLINE auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
{
- if (Arch::version() <= best_arch_found)
+ if (availables_archs.has(Arch {}))
return functor(Arch {}, std::forward<Tys>(args)...);
else
return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
}
public:
- inline dispatcher(F f) noexcept
- : best_arch_found(available_architectures().best)
+ XSIMD_INLINE dispatcher(F f) noexcept
+ : availables_archs(available_architectures())
, functor(f)
{
}
template <class... Tys>
- inline auto operator()(Tys&&... args) noexcept -> decltype(functor(default_arch {}, std::forward<Tys>(args)...))
+ XSIMD_INLINE auto operator()(Tys&&... args) noexcept -> decltype(functor(default_arch {}, std::forward<Tys>(args)...))
{
return walk_archs(ArchList {}, std::forward<Tys>(args)...);
}
@@ -257,7 +228,7 @@ namespace xsimd
// Generic function dispatch, à la ifunc
template <class ArchList = supported_architectures, class F>
- inline detail::dispatcher<F, ArchList> dispatch(F&& f) noexcept
+ XSIMD_INLINE detail::dispatcher<F, ArchList> dispatch(F&& f) noexcept
{
return { std::forward<F>(f) };
}
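
The arch-list machinery no longer relies on a monotonically increasing version() per architecture: the is_sorted check and the unsupported placeholder are gone, and the dispatcher now keeps the whole availability record returned by available_architectures(), probing it with has(Arch {}) instead of comparing version numbers. The public entry point is unchanged; a usage sketch (the kernel body is illustrative, not taken from this diff):

    #include "xsimd/xsimd.hpp"
    #include <cstddef>

    struct sum_kernel
    {
        template <class Arch>
        float operator()(Arch, const float* data, std::size_t n) const
        {
            using b = xsimd::batch<float, Arch>;
            b acc(0.0f);
            std::size_t i = 0;
            for (; i + b::size <= n; i += b::size)
                acc += b::load_unaligned(data + i);
            float s = xsimd::reduce_add(acc);
            for (; i < n; ++i)
                s += data[i]; // scalar tail
            return s;
        }
    };

    float sum(const float* data, std::size_t n)
    {
        // walks the supported architectures and calls the first one reported available
        return xsimd::dispatch(sum_kernel {})(data, n);
    }
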
diff --git a/contrib/python/pythran/pythran/xsimd/config/xsimd_config.hpp b/contrib/python/pythran/pythran/xsimd/config/xsimd_config.hpp
index cf5163c37ef..c32a12012d4 100644
--- a/contrib/python/pythran/pythran/xsimd/config/xsimd_config.hpp
+++ b/contrib/python/pythran/pythran/xsimd/config/xsimd_config.hpp
@@ -12,9 +12,9 @@
#ifndef XSIMD_CONFIG_HPP
#define XSIMD_CONFIG_HPP
-#define XSIMD_VERSION_MAJOR 12
-#define XSIMD_VERSION_MINOR 1
-#define XSIMD_VERSION_PATCH 1
+#define XSIMD_VERSION_MAJOR 13
+#define XSIMD_VERSION_MINOR 0
+#define XSIMD_VERSION_PATCH 0
/**
* high level free functions
@@ -352,6 +352,17 @@
/**
* @ingroup xsimd_config_macro
*
+ * Set to 1 if i8mm neon64 extension is available at compile-time, to 0 otherwise.
+ */
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+#define XSIMD_WITH_I8MM_NEON64 1
+#else
+#define XSIMD_WITH_I8MM_NEON64 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
* Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
*/
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
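
Besides bumping the library from 12.1.1 to 13.0.0, the configuration header gains a feature macro for the AArch64 int8 matrix-multiply extension, derived from the compiler-provided __ARM_FEATURE_MATMUL_INT8. A sketch of how such a macro is typically consumed (the guard and alias below are hypothetical, not part of the diff):

    #include "xsimd/xsimd.hpp"

    #if XSIMD_WITH_I8MM_NEON64
    // Safe to name the i8mm<neon64> architecture tag here, e.g. to
    // instantiate kernels specialised for that extension.
    using best_int8_arch = xsimd::i8mm<xsimd::neon64>;
    #else
    using best_int8_arch = xsimd::default_arch; // fall back to whatever is enabled
    #endif
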
diff --git a/contrib/python/pythran/pythran/xsimd/config/xsimd_cpuid.hpp b/contrib/python/pythran/pythran/xsimd/config/xsimd_cpuid.hpp
index 62aca6132fd..f22089bac0a 100644
--- a/contrib/python/pythran/pythran/xsimd/config/xsimd_cpuid.hpp
+++ b/contrib/python/pythran/pythran/xsimd/config/xsimd_cpuid.hpp
@@ -18,6 +18,11 @@
#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector))
#include <asm/hwcap.h>
#include <sys/auxv.h>
+
+#ifndef HWCAP2_I8MM
+#define HWCAP2_I8MM (1 << 13)
+#endif
+
#endif
#if defined(_MSC_VER)
@@ -33,65 +38,71 @@ namespace xsimd
{
struct supported_arch
{
- unsigned sse2 : 1;
- unsigned sse3 : 1;
- unsigned ssse3 : 1;
- unsigned sse4_1 : 1;
- unsigned sse4_2 : 1;
- unsigned sse4a : 1;
- unsigned fma3_sse : 1;
- unsigned fma4 : 1;
- unsigned xop : 1;
- unsigned avx : 1;
- unsigned fma3_avx : 1;
- unsigned avx2 : 1;
- unsigned avxvnni : 1;
- unsigned fma3_avx2 : 1;
- unsigned avx512f : 1;
- unsigned avx512cd : 1;
- unsigned avx512dq : 1;
- unsigned avx512bw : 1;
- unsigned avx512er : 1;
- unsigned avx512pf : 1;
- unsigned avx512ifma : 1;
- unsigned avx512vbmi : 1;
- unsigned avx512vnni_bw : 1;
- unsigned avx512vnni_vbmi : 1;
- unsigned neon : 1;
- unsigned neon64 : 1;
- unsigned sve : 1;
- unsigned rvv : 1;
-
- // version number of the best arch available
- unsigned best;
-
- inline supported_arch() noexcept
+
+#define ARCH_FIELD_EX(arch, field_name) \
+ unsigned field_name; \
+ XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; }
+#define ARCH_FIELD(name) ARCH_FIELD_EX(name, name)
+
+ ARCH_FIELD(sse2)
+ ARCH_FIELD(sse3)
+
+ ARCH_FIELD(ssse3)
+ ARCH_FIELD(sse4_1)
+ ARCH_FIELD(sse4_2)
+ // ARCH_FIELD(sse4a)
+ ARCH_FIELD_EX(fma3<::xsimd::sse4_2>, fma3_sse42)
+ ARCH_FIELD(fma4)
+ // ARCH_FIELD(xop)
+ ARCH_FIELD(avx)
+ ARCH_FIELD_EX(fma3<::xsimd::avx>, fma3_avx)
+ ARCH_FIELD(avx2)
+ ARCH_FIELD(avxvnni)
+ ARCH_FIELD_EX(fma3<::xsimd::avx2>, fma3_avx2)
+ ARCH_FIELD(avx512f)
+ ARCH_FIELD(avx512cd)
+ ARCH_FIELD(avx512dq)
+ ARCH_FIELD(avx512bw)
+ ARCH_FIELD(avx512er)
+ ARCH_FIELD(avx512pf)
+ ARCH_FIELD(avx512ifma)
+ ARCH_FIELD(avx512vbmi)
+ ARCH_FIELD_EX(avx512vnni<::xsimd::avx512bw>, avx512vnni_bw)
+ ARCH_FIELD_EX(avx512vnni<::xsimd::avx512vbmi>, avx512vnni_vbmi)
+ ARCH_FIELD(neon)
+ ARCH_FIELD(neon64)
+ ARCH_FIELD_EX(i8mm<::xsimd::neon64>, i8mm_neon64)
+ ARCH_FIELD(sve)
+ ARCH_FIELD(rvv)
+ ARCH_FIELD(wasm)
+
+#undef ARCH_FIELD
+
+ XSIMD_INLINE supported_arch() noexcept
{
memset(this, 0, sizeof(supported_arch));
+#if XSIMD_WITH_WASM
+ wasm = 1;
+#endif
+
#if defined(__aarch64__) || defined(_M_ARM64)
neon = 1;
neon64 = 1;
- best = neon64::version();
+#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
+ i8mm_neon64 = bool(getauxval(AT_HWCAP2) & HWCAP2_I8MM);
+#endif
#elif defined(__ARM_NEON) || defined(_M_ARM)
#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON);
-#else
- // that's very conservative :-/
- neon = 0;
#endif
- neon64 = 0;
- best = neon::version() * neon;
#elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
sve = bool(getauxval(AT_HWCAP) & HWCAP_SVE);
-#else
- sve = 0;
#endif
- best = sve::version() * sve;
#elif defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0
@@ -100,11 +111,8 @@ namespace xsimd
#define HWCAP_V (1 << ('V' - 'A'))
#endif
rvv = bool(getauxval(AT_HWCAP) & HWCAP_V);
-#else
- rvv = 0;
#endif
- best = ::xsimd::rvv::version() * rvv;
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept
{
@@ -122,14 +130,12 @@ namespace xsimd
__asm__("xchg{l}\t{%%}ebx, %1\n\t"
"cpuid\n\t"
"xchg{l}\t{%%}ebx, %1\n\t"
- : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]),
- "=d"(reg[3])
+ : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
: "0"(level), "2"(count));
#else
__asm__("cpuid\n\t"
- : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]),
- "=d"(reg[3])
+ : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
: "0"(level), "2"(count));
#endif
@@ -143,89 +149,49 @@ namespace xsimd
get_cpuid(regs1, 0x1);
sse2 = regs1[3] >> 26 & 1;
- best = std::max(best, sse2::version() * sse2);
-
sse3 = regs1[2] >> 0 & 1;
- best = std::max(best, sse3::version() * sse3);
-
ssse3 = regs1[2] >> 9 & 1;
- best = std::max(best, ssse3::version() * ssse3);
-
sse4_1 = regs1[2] >> 19 & 1;
- best = std::max(best, sse4_1::version() * sse4_1);
-
sse4_2 = regs1[2] >> 20 & 1;
- best = std::max(best, sse4_2::version() * sse4_2);
-
- fma3_sse = regs1[2] >> 12 & 1;
- if (sse4_2)
- best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);
+ fma3_sse42 = regs1[2] >> 12 & 1;
avx = regs1[2] >> 28 & 1;
- best = std::max(best, avx::version() * avx);
-
- fma3_avx = avx && fma3_sse;
- best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);
+ fma3_avx = avx && fma3_sse42;
int regs8[4];
get_cpuid(regs8, 0x80000001);
fma4 = regs8[2] >> 16 & 1;
- best = std::max(best, fma4::version() * fma4);
// sse4a = regs[2] >> 6 & 1;
- // best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a);
// xop = regs[2] >> 11 & 1;
- // best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);
int regs7[4];
get_cpuid(regs7, 0x7);
avx2 = regs7[1] >> 5 & 1;
- best = std::max(best, avx2::version() * avx2);
int regs7a[4];
get_cpuid(regs7a, 0x7, 0x1);
avxvnni = regs7a[0] >> 4 & 1;
- best = std::max(best, avxvnni::version() * avxvnni * avx2);
- fma3_avx2 = avx2 && fma3_sse;
- best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);
+ fma3_avx2 = avx2 && fma3_sse42;
avx512f = regs7[1] >> 16 & 1;
- best = std::max(best, avx512f::version() * avx512f);
-
avx512cd = regs7[1] >> 28 & 1;
- best = std::max(best, avx512cd::version() * avx512cd * avx512f);
-
avx512dq = regs7[1] >> 17 & 1;
- best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);
-
avx512bw = regs7[1] >> 30 & 1;
- best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);
-
avx512er = regs7[1] >> 27 & 1;
- best = std::max(best, avx512er::version() * avx512er * avx512cd * avx512f);
-
avx512pf = regs7[1] >> 26 & 1;
- best = std::max(best, avx512pf::version() * avx512pf * avx512er * avx512cd * avx512f);
-
avx512ifma = regs7[1] >> 21 & 1;
- best = std::max(best, avx512ifma::version() * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f);
-
avx512vbmi = regs7[2] >> 1 & 1;
- best = std::max(best, avx512vbmi::version() * avx512vbmi * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f);
-
avx512vnni_bw = regs7[2] >> 11 & 1;
- best = std::max(best, avx512vnni<xsimd::avx512bw>::version() * avx512vnni_bw * avx512bw * avx512dq * avx512cd * avx512f);
-
avx512vnni_vbmi = avx512vbmi && avx512vnni_bw;
- best = std::max(best, avx512vnni<xsimd::avx512vbmi>::version() * avx512vnni_vbmi);
#endif
}
};
- }
+ } // namespace detail
- inline detail::supported_arch available_architectures() noexcept
+ XSIMD_INLINE detail::supported_arch available_architectures() noexcept
{
static detail::supported_arch supported;
return supported;
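
The detection struct is now generated by the ARCH_FIELD/ARCH_FIELD_EX macros: one unsigned flag per architecture plus an overloaded has() taking the corresponding tag type, which is exactly what the reworked dispatcher queries. The scalar best version number is gone altogether. A short consumer-side sketch (field names follow the macro expansion above):

    #include "xsimd/xsimd.hpp"
    #include <cstdio>

    void print_caps()
    {
        auto const archs = xsimd::available_architectures();
        std::printf("sse2:   %u\n", unsigned(archs.has(xsimd::sse2 {})));
        std::printf("avx2:   %u\n", unsigned(archs.has(xsimd::avx2 {})));
        std::printf("neon64: %u\n", unsigned(archs.has(xsimd::neon64 {})));
    }
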
diff --git a/contrib/python/pythran/pythran/xsimd/config/xsimd_inline.hpp b/contrib/python/pythran/pythran/xsimd/config/xsimd_inline.hpp
new file mode 100644
index 00000000000..88e9cbcd0d6
--- /dev/null
+++ b/contrib/python/pythran/pythran/xsimd/config/xsimd_inline.hpp
@@ -0,0 +1,23 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_INLINE_HPP
+#define XSIMD_INLINE_HPP
+
+#if defined(__GNUC__)
+#define XSIMD_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define XSIMD_INLINE inline __forceinline
+#else
+#define XSIMD_INLINE inline
+#endif
+
+#endif
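
This new header centralises the inlining annotation that all of the inline -> XSIMD_INLINE replacements in this change rely on. A trivial usage sketch:

    #include "xsimd/config/xsimd_inline.hpp"

    // Expands to `inline __attribute__((always_inline))` under GCC/Clang,
    // `inline __forceinline` under MSVC, and plain `inline` otherwise.
    XSIMD_INLINE int twice(int x) noexcept { return x * 2; }
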
diff --git a/contrib/python/pythran/pythran/xsimd/math/xsimd_rem_pio2.hpp b/contrib/python/pythran/pythran/xsimd/math/xsimd_rem_pio2.hpp
index 05371ee520f..eb232c56820 100644
--- a/contrib/python/pythran/pythran/xsimd/math/xsimd_rem_pio2.hpp
+++ b/contrib/python/pythran/pythran/xsimd/math/xsimd_rem_pio2.hpp
@@ -217,7 +217,7 @@ namespace xsimd
*
*/
- inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
+ XSIMD_INLINE int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
{
static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */
@@ -450,7 +450,7 @@ namespace xsimd
return n & 7;
}
- inline std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
+ XSIMD_INLINE std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
{
static const std::int32_t two_over_pi[] = {
0xA2F983,
diff --git a/contrib/python/pythran/pythran/xsimd/memory/xsimd_aligned_allocator.hpp b/contrib/python/pythran/pythran/xsimd/memory/xsimd_aligned_allocator.hpp
index 3918d68a704..51779f31caf 100644
--- a/contrib/python/pythran/pythran/xsimd/memory/xsimd_aligned_allocator.hpp
+++ b/contrib/python/pythran/pythran/xsimd/memory/xsimd_aligned_allocator.hpp
@@ -59,43 +59,43 @@ namespace xsimd
using other = aligned_allocator<U, Align>;
};
- inline aligned_allocator() noexcept;
- inline aligned_allocator(const aligned_allocator& rhs) noexcept;
+ XSIMD_INLINE aligned_allocator() noexcept;
+ XSIMD_INLINE aligned_allocator(const aligned_allocator& rhs) noexcept;
template <class U>
- inline aligned_allocator(const aligned_allocator<U, Align>& rhs) noexcept;
+ XSIMD_INLINE aligned_allocator(const aligned_allocator<U, Align>& rhs) noexcept;
- inline ~aligned_allocator();
+ XSIMD_INLINE ~aligned_allocator();
- inline pointer address(reference) noexcept;
- inline const_pointer address(const_reference) const noexcept;
+ XSIMD_INLINE pointer address(reference) noexcept;
+ XSIMD_INLINE const_pointer address(const_reference) const noexcept;
- inline pointer allocate(size_type n, const void* hint = 0);
- inline void deallocate(pointer p, size_type n);
+ XSIMD_INLINE pointer allocate(size_type n, const void* hint = 0);
+ XSIMD_INLINE void deallocate(pointer p, size_type n);
- inline size_type max_size() const noexcept;
- inline size_type size_max() const noexcept;
+ XSIMD_INLINE size_type max_size() const noexcept;
+ XSIMD_INLINE size_type size_max() const noexcept;
template <class U, class... Args>
- inline void construct(U* p, Args&&... args);
+ XSIMD_INLINE void construct(U* p, Args&&... args);
template <class U>
- inline void destroy(U* p);
+ XSIMD_INLINE void destroy(U* p);
};
template <class T1, size_t Align1, class T2, size_t Align2>
- inline bool operator==(const aligned_allocator<T1, Align1>& lhs,
- const aligned_allocator<T2, Align2>& rhs) noexcept;
+ XSIMD_INLINE bool operator==(const aligned_allocator<T1, Align1>& lhs,
+ const aligned_allocator<T2, Align2>& rhs) noexcept;
template <class T1, size_t Align1, class T2, size_t Align2>
- inline bool operator!=(const aligned_allocator<T1, Align1>& lhs,
- const aligned_allocator<T2, Align2>& rhs) noexcept;
+ XSIMD_INLINE bool operator!=(const aligned_allocator<T1, Align1>& lhs,
+ const aligned_allocator<T2, Align2>& rhs) noexcept;
- inline void* aligned_malloc(size_t size, size_t alignment);
- inline void aligned_free(void* ptr);
+ XSIMD_INLINE void* aligned_malloc(size_t size, size_t alignment);
+ XSIMD_INLINE void aligned_free(void* ptr);
template <class T>
- inline size_t get_alignment_offset(const T* p, size_t size, size_t block_size);
+ XSIMD_INLINE size_t get_alignment_offset(const T* p, size_t size, size_t block_size);
/************************************
* aligned_allocator implementation *
@@ -105,7 +105,7 @@ namespace xsimd
* Default constructor.
*/
template <class T, size_t A>
- inline aligned_allocator<T, A>::aligned_allocator() noexcept
+ XSIMD_INLINE aligned_allocator<T, A>::aligned_allocator() noexcept
{
}
@@ -113,7 +113,7 @@ namespace xsimd
* Copy constructor.
*/
template <class T, size_t A>
- inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator&) noexcept
+ XSIMD_INLINE aligned_allocator<T, A>::aligned_allocator(const aligned_allocator&) noexcept
{
}
@@ -122,7 +122,7 @@ namespace xsimd
*/
template <class T, size_t A>
template <class U>
- inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator<U, A>&) noexcept
+ XSIMD_INLINE aligned_allocator<T, A>::aligned_allocator(const aligned_allocator<U, A>&) noexcept
{
}
@@ -130,7 +130,7 @@ namespace xsimd
* Destructor.
*/
template <class T, size_t A>
- inline aligned_allocator<T, A>::~aligned_allocator()
+ XSIMD_INLINE aligned_allocator<T, A>::~aligned_allocator()
{
}
@@ -140,7 +140,7 @@ namespace xsimd
* @return the actual address of \c r.
*/
template <class T, size_t A>
- inline auto
+ XSIMD_INLINE auto
aligned_allocator<T, A>::address(reference r) noexcept -> pointer
{
return &r;
@@ -152,7 +152,7 @@ namespace xsimd
* @return the actual address of \c r.
*/
template <class T, size_t A>
- inline auto
+ XSIMD_INLINE auto
aligned_allocator<T, A>::address(const_reference r) const noexcept -> const_pointer
{
return &r;
@@ -167,7 +167,7 @@ namespace xsimd
* hold an array of \c n objects of type \c T.
*/
template <class T, size_t A>
- inline auto
+ XSIMD_INLINE auto
aligned_allocator<T, A>::allocate(size_type n, const void*) -> pointer
{
pointer res = reinterpret_cast<pointer>(aligned_malloc(sizeof(T) * n, A));
@@ -186,7 +186,7 @@ namespace xsimd
* @param n number of objects earlier passed to allocate().
*/
template <class T, size_t A>
- inline void aligned_allocator<T, A>::deallocate(pointer p, size_type)
+ XSIMD_INLINE void aligned_allocator<T, A>::deallocate(pointer p, size_type)
{
aligned_free(p);
}
@@ -197,7 +197,7 @@ namespace xsimd
* @return the maximum supported allocated size.
*/
template <class T, size_t A>
- inline auto
+ XSIMD_INLINE auto
aligned_allocator<T, A>::max_size() const noexcept -> size_type
{
return size_type(-1) / sizeof(T);
@@ -207,7 +207,7 @@ namespace xsimd
* This method is deprecated, use max_size() instead
*/
template <class T, size_t A>
- inline auto
+ XSIMD_INLINE auto
aligned_allocator<T, A>::size_max() const noexcept -> size_type
{
return size_type(-1) / sizeof(T);
@@ -221,7 +221,7 @@ namespace xsimd
*/
template <class T, size_t A>
template <class U, class... Args>
- inline void aligned_allocator<T, A>::construct(U* p, Args&&... args)
+ XSIMD_INLINE void aligned_allocator<T, A>::construct(U* p, Args&&... args)
{
new ((void*)p) U(std::forward<Args>(args)...);
}
@@ -232,7 +232,7 @@ namespace xsimd
*/
template <class T, size_t A>
template <class U>
- inline void aligned_allocator<T, A>::destroy(U* p)
+ XSIMD_INLINE void aligned_allocator<T, A>::destroy(U* p)
{
p->~U();
}
@@ -250,8 +250,8 @@ namespace xsimd
* @return true if the allocators have the same alignment.
*/
template <class T1, size_t A1, class T2, size_t A2>
- inline bool operator==(const aligned_allocator<T1, A1>& lhs,
- const aligned_allocator<T2, A2>& rhs) noexcept
+ XSIMD_INLINE bool operator==(const aligned_allocator<T1, A1>& lhs,
+ const aligned_allocator<T2, A2>& rhs) noexcept
{
return lhs.alignment == rhs.alignment;
}
@@ -265,8 +265,8 @@ namespace xsimd
* @return true if the allocators have different alignments.
*/
template <class T1, size_t A1, class T2, size_t A2>
- inline bool operator!=(const aligned_allocator<T1, A1>& lhs,
- const aligned_allocator<T2, A2>& rhs) noexcept
+ XSIMD_INLINE bool operator!=(const aligned_allocator<T1, A1>& lhs,
+ const aligned_allocator<T2, A2>& rhs) noexcept
{
return !(lhs == rhs);
}
@@ -277,7 +277,7 @@ namespace xsimd
namespace detail
{
- inline void* xaligned_malloc(size_t size, size_t alignment)
+ XSIMD_INLINE void* xaligned_malloc(size_t size, size_t alignment)
{
assert(((alignment & (alignment - 1)) == 0) && "alignment must be a power of two");
assert((alignment >= sizeof(void*)) && "alignment must be at least the size of a pointer");
@@ -293,7 +293,7 @@ namespace xsimd
return res;
}
- inline void xaligned_free(void* ptr)
+ XSIMD_INLINE void xaligned_free(void* ptr)
{
#ifdef _WIN32
_aligned_free(ptr);
@@ -303,18 +303,18 @@ namespace xsimd
}
}
- inline void* aligned_malloc(size_t size, size_t alignment)
+ XSIMD_INLINE void* aligned_malloc(size_t size, size_t alignment)
{
return detail::xaligned_malloc(size, alignment);
}
- inline void aligned_free(void* ptr)
+ XSIMD_INLINE void aligned_free(void* ptr)
{
detail::xaligned_free(ptr);
}
template <class T>
- inline size_t get_alignment_offset(const T* p, size_t size, size_t block_size)
+ XSIMD_INLINE size_t get_alignment_offset(const T* p, size_t size, size_t block_size)
{
// size_t block_size = simd_traits<T>::size;
if (block_size == 1)
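
Only the inlining annotations change in the allocator; its interface is untouched. For reference, a typical usage sketch of that interface:

    #include "xsimd/xsimd.hpp"
    #include <cstddef>
    #include <vector>

    // Element storage aligned for the default architecture, so aligned
    // loads/stores on v.data() are valid.
    using aligned_vec = std::vector<float, xsimd::aligned_allocator<float>>;

    void broadcast_ones(aligned_vec& v)
    {
        using b = xsimd::batch<float>;
        std::size_t i = 0;
        for (; i + b::size <= v.size(); i += b::size)
            b(1.0f).store_aligned(v.data() + i);
        for (; i < v.size(); ++i)
            v[i] = 1.0f; // scalar tail
    }
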
diff --git a/contrib/python/pythran/pythran/xsimd/memory/xsimd_alignment.hpp b/contrib/python/pythran/pythran/xsimd/memory/xsimd_alignment.hpp
index 2b3b350880e..2d59ac1fc48 100644
--- a/contrib/python/pythran/pythran/xsimd/memory/xsimd_alignment.hpp
+++ b/contrib/python/pythran/pythran/xsimd/memory/xsimd_alignment.hpp
@@ -81,7 +81,7 @@ namespace xsimd
* @return true if the alignment requirements are met
*/
template <class Arch = default_arch>
- inline bool is_aligned(void const* ptr)
+ XSIMD_INLINE bool is_aligned(void const* ptr)
{
return (reinterpret_cast<uintptr_t>(ptr) % static_cast<uintptr_t>(Arch::alignment())) == 0;
}
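
The same mechanical change applies to is_aligned; its semantics (pointer modulo the architecture's alignment) are unchanged. Sketch:

    #include "xsimd/xsimd.hpp"

    xsimd::batch<float> load_any(const float* p)
    {
        // choose aligned vs unaligned loads based on the runtime check
        return xsimd::is_aligned(p) ? xsimd::batch<float>::load_aligned(p)
                                    : xsimd::batch<float>::load_unaligned(p);
    }
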
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_all_registers.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_all_registers.hpp
index 4350ca0a281..6d024a16777 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_all_registers.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_all_registers.hpp
@@ -36,6 +36,8 @@
#include "xsimd_avx512dq_register.hpp"
#include "xsimd_avx512f_register.hpp"
+#include "xsimd_i8mm_neon64_register.hpp"
+
#include "xsimd_neon64_register.hpp"
#include "xsimd_neon_register.hpp"
@@ -44,3 +46,7 @@
#include "xsimd_rvv_register.hpp"
#include "xsimd_wasm_register.hpp"
+
+#if XSIMD_WITH_EMULATED
+#include "xsimd_emulated_register.hpp"
+#endif
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_api.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_api.hpp
index 0420f0a09d6..79be4d88d2a 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_api.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_api.hpp
@@ -53,7 +53,7 @@ namespace xsimd
* @return the absolute values of \c x.
*/
template <class T, class A>
- inline batch<T, A> abs(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::abs<A>(x, A {});
@@ -67,7 +67,7 @@ namespace xsimd
* @return the absolute values of \c z.
*/
template <class T, class A>
- inline batch<T, A> abs(batch<std::complex<T>, A> const& z) noexcept
+ XSIMD_INLINE batch<T, A> abs(batch<std::complex<T>, A> const& z) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::abs<A>(z, A {});
@@ -82,7 +82,7 @@ namespace xsimd
* @return the sum of \c x and \c y
*/
template <class T, class A>
- inline auto add(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x + y)
+ XSIMD_INLINE auto add(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x + y)
{
detail::static_check_supported_config<T, A>();
return x + y;
@@ -96,7 +96,7 @@ namespace xsimd
* @return the arc cosine of \c x.
*/
template <class T, class A>
- inline batch<T, A> acos(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> acos(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::acos<A>(x, A {});
@@ -110,7 +110,7 @@ namespace xsimd
* @return the inverse hyperbolic cosine of \c x.
*/
template <class T, class A>
- inline batch<T, A> acosh(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> acosh(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::acosh<A>(x, A {});
@@ -124,7 +124,7 @@ namespace xsimd
* @return the argument of \c z.
*/
template <class T, class A>
- inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& z) noexcept
+ XSIMD_INLINE real_batch_type_t<batch<T, A>> arg(batch<T, A> const& z) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::arg<A>(z, A {});
@@ -138,7 +138,7 @@ namespace xsimd
* @return the arc sine of \c x.
*/
template <class T, class A>
- inline batch<T, A> asin(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> asin(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::asin<A>(x, A {});
@@ -152,7 +152,7 @@ namespace xsimd
* @return the inverse hyperbolic sine of \c x.
*/
template <class T, class A>
- inline batch<T, A> asinh(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> asinh(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::asinh<A>(x, A {});
@@ -166,7 +166,7 @@ namespace xsimd
* @return the arc tangent of \c x.
*/
template <class T, class A>
- inline batch<T, A> atan(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> atan(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::atan<A>(x, A {});
@@ -182,7 +182,7 @@ namespace xsimd
* @return the arc tangent of \c x/y.
*/
template <class T, class A>
- inline batch<T, A> atan2(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> atan2(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::atan2<A>(x, y, A {});
@@ -196,13 +196,43 @@ namespace xsimd
* @return the inverse hyperbolic tangent of \c x.
*/
template <class T, class A>
- inline batch<T, A> atanh(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> atanh(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::atanh<A>(x, A {});
}
/**
+ * @ingroup batch_math
+ *
+ * Computes the average of batches \c x and \c y
+ * @param x batch of T
+ * @param y batch of T
+ * @return the average of elements between \c x and \c y.
+ */
+ template <class T, class A>
+ XSIMD_INLINE batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ {
+ detail::static_check_supported_config<T, A>();
+ return kernel::avg<A>(x, y, A {});
+ }
+
+ /**
+ * @ingroup batch_math
+ *
+ * Computes the rounded average of batches \c x and \c y
+ * @param x batch of T
+ * @param y batch of T
+ * @return the rounded average of elements between \c x and \c y.
+ */
+ template <class T, class A>
+ XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ {
+ detail::static_check_supported_config<T, A>();
+ return kernel::avgr<A>(x, y, A {});
+ }
+
+ /**
* @ingroup batch_conversion
*
* Perform a static_cast from \c T_in to \c T_out on \c \c x.
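
avg and avgr are new public entry points in this release; per the doc comments above, avg computes the plain element-wise average while avgr computes the rounded one (the behaviour of the hardware averaging instructions on e.g. NEON and wasm). A caller-side sketch:

    #include "xsimd/xsimd.hpp"
    #include <cstdint>

    xsimd::batch<uint8_t> blend(xsimd::batch<uint8_t> const& a,
                                xsimd::batch<uint8_t> const& b)
    {
        return xsimd::avgr(a, b); // per-lane rounded average, (a + b + 1) / 2 for integers
    }
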
@@ -210,7 +240,7 @@ namespace xsimd
* @return \c x cast to \c T_out
*/
template <class T_out, class T_in, class A>
- inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& x) noexcept
+ XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& x) noexcept
{
detail::static_check_supported_config<T_out, A>();
detail::static_check_supported_config<T_in, A>();
@@ -226,7 +256,7 @@ namespace xsimd
* @return \c x cast to \c T_out
*/
template <class T_out, class T_in, class A>
- inline batch<T_out, A> batch_cast(batch<T_in, A> const& x) noexcept
+ XSIMD_INLINE batch<T_out, A> batch_cast(batch<T_in, A> const& x) noexcept
{
detail::static_check_supported_config<T_out, A>();
detail::static_check_supported_config<T_in, A>();
@@ -241,7 +271,7 @@ namespace xsimd
* @return bit of sign of \c x
*/
template <class T, class A>
- inline batch<T, A> bitofsign(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> bitofsign(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitofsign<A>(x, A {});
@@ -256,7 +286,7 @@ namespace xsimd
* @return the result of the bitwise and.
*/
template <class T, class A>
- inline auto bitwise_and(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x & y)
+ XSIMD_INLINE auto bitwise_and(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x & y)
{
detail::static_check_supported_config<T, A>();
return x & y;
@@ -271,7 +301,7 @@ namespace xsimd
* @return the result of the bitwise and.
*/
template <class T, class A>
- inline auto bitwise_and(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x & y)
+ XSIMD_INLINE auto bitwise_and(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x & y)
{
detail::static_check_supported_config<T, A>();
return x & y;
@@ -286,7 +316,7 @@ namespace xsimd
* @return the result of the bitwise and not.
*/
template <class T, class A>
- inline batch<T, A> bitwise_andnot(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_andnot<A>(x, y, A {});
@@ -301,7 +331,7 @@ namespace xsimd
* @return the result of the bitwise and not.
*/
template <class T, class A>
- inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_andnot<A>(x, y, A {});
@@ -315,7 +345,7 @@ namespace xsimd
* @return \c x reinterpreted as \c T_out
*/
template <class T_out, class T_in, class A>
- inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept
+ XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept
{
detail::static_check_supported_config<T_in, A>();
detail::static_check_supported_config<T_out, A>();
@@ -331,13 +361,13 @@ namespace xsimd
* @return shifted \c x.
*/
template <class T, class A>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& x, int shift) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, int shift) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_lshift<A>(x, shift, A {});
}
template <class T, class A>
- inline batch<T, A> bitwise_lshift(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, batch<T, A> const& shift) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_lshift<A>(x, shift, A {});
@@ -351,7 +381,7 @@ namespace xsimd
* @return the result of the bitwise not.
*/
template <class T, class A>
- inline batch<T, A> bitwise_not(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_not<A>(x, A {});
@@ -365,7 +395,7 @@ namespace xsimd
* @return the result of the bitwise not.
*/
template <class T, class A>
- inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& x) noexcept
+ XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_not<A>(x, A {});
@@ -380,7 +410,7 @@ namespace xsimd
* @return the result of the bitwise or.
*/
template <class T, class A>
- inline auto bitwise_or(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x | y)
+ XSIMD_INLINE auto bitwise_or(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x | y)
{
detail::static_check_supported_config<T, A>();
return x | y;
@@ -395,7 +425,7 @@ namespace xsimd
* @return the result of the bitwise or.
*/
template <class T, class A>
- inline auto bitwise_or(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x | y)
+ XSIMD_INLINE auto bitwise_or(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x | y)
{
detail::static_check_supported_config<T, A>();
return x | y;
@@ -410,13 +440,13 @@ namespace xsimd
* @return shifted \c x.
*/
template <class T, class A>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& x, int shift) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, int shift) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_rshift<A>(x, shift, A {});
}
template <class T, class A>
- inline batch<T, A> bitwise_rshift(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, batch<T, A> const& shift) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_rshift<A>(x, shift, A {});
@@ -431,7 +461,7 @@ namespace xsimd
* @return the result of the bitwise xor.
*/
template <class T, class A>
- inline auto bitwise_xor(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x ^ y)
+ XSIMD_INLINE auto bitwise_xor(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x ^ y)
{
detail::static_check_supported_config<T, A>();
return x ^ y;
@@ -446,7 +476,7 @@ namespace xsimd
* @return the result of the bitwise xor.
*/
template <class T, class A>
- inline auto bitwise_xor(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x ^ y)
+ XSIMD_INLINE auto bitwise_xor(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x ^ y)
{
detail::static_check_supported_config<T, A>();
return x ^ y;
@@ -460,7 +490,7 @@ namespace xsimd
* @return a new batch instance
*/
template <class T, class A = default_arch>
- inline batch<T, A> broadcast(T v) noexcept
+ XSIMD_INLINE batch<T, A> broadcast(T v) noexcept
{
detail::static_check_supported_config<T, A>();
return batch<T, A>::broadcast(v);
@@ -475,7 +505,7 @@ namespace xsimd
* @return a new batch instance
*/
template <class To, class A = default_arch, class From>
- inline simd_return_type<From, To, A> broadcast_as(From v) noexcept
+ XSIMD_INLINE simd_return_type<From, To, A> broadcast_as(From v) noexcept
{
detail::static_check_supported_config<From, A>();
using batch_value_type = typename simd_return_type<From, To, A>::value_type;
@@ -493,7 +523,7 @@ namespace xsimd
* @return the cubic root of \c x.
*/
template <class T, class A>
- inline batch<T, A> cbrt(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> cbrt(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::cbrt<A>(x, A {});
@@ -508,7 +538,7 @@ namespace xsimd
* @return the batch of smallest integer values not less than \c x.
*/
template <class T, class A>
- inline batch<T, A> ceil(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> ceil(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::ceil<A>(x, A {});
@@ -524,7 +554,7 @@ namespace xsimd
* @return the result of the clipping.
*/
template <class T, class A>
- inline batch<T, A> clip(batch<T, A> const& x, batch<T, A> const& lo, batch<T, A> const& hi) noexcept
+ XSIMD_INLINE batch<T, A> clip(batch<T, A> const& x, batch<T, A> const& lo, batch<T, A> const& hi) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::clip(x, lo, hi, A {});
@@ -537,7 +567,7 @@ namespace xsimd
* resulting vector, zeroing the remaining slots
*/
template <class T, class A>
- inline batch<T, A> compress(batch<T, A> const& x, batch_bool<T, A> const& mask) noexcept
+ XSIMD_INLINE batch<T, A> compress(batch<T, A> const& x, batch_bool<T, A> const& mask) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::compress<A>(x, mask, A {});
@@ -551,7 +581,7 @@ namespace xsimd
* @return the argument of \c z.
*/
template <class A, class T>
- inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& z) noexcept
+ XSIMD_INLINE complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& z) noexcept
{
return kernel::conj(z, A {});
}
@@ -567,7 +597,7 @@ namespace xsimd
* matches that of \c y.
*/
template <class T, class A>
- inline batch<T, A> copysign(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> copysign(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::copysign<A>(x, y, A {});
@@ -581,7 +611,7 @@ namespace xsimd
* @return the cosine of \c x.
*/
template <class T, class A>
- inline batch<T, A> cos(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> cos(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::cos<A>(x, A {});
@@ -595,7 +625,7 @@ namespace xsimd
* @return the hyperbolic cosine of \c x.
*/
template <class T, class A>
- inline batch<T, A> cosh(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> cosh(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::cosh<A>(x, A {});
@@ -609,7 +639,7 @@ namespace xsimd
* @return the subtraction of \c x and 1.
*/
template <class T, class A>
- inline batch<T, A> decr(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> decr(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::decr<A>(x, A {});
@@ -625,7 +655,7 @@ namespace xsimd
* @return the subtraction of \c x and 1 when \c mask is true.
*/
template <class T, class A, class Mask>
- inline batch<T, A> decr_if(batch<T, A> const& x, Mask const& mask) noexcept
+ XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& x, Mask const& mask) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::decr_if<A>(x, mask, A {});
@@ -640,7 +670,7 @@ namespace xsimd
* @return the result of the division.
*/
template <class T, class A>
- inline auto div(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x / y)
+ XSIMD_INLINE auto div(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x / y)
{
detail::static_check_supported_config<T, A>();
return x / y;
@@ -655,7 +685,7 @@ namespace xsimd
* @return a boolean batch.
*/
template <class T, class A>
- inline auto eq(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x == y)
+ XSIMD_INLINE auto eq(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x == y)
{
detail::static_check_supported_config<T, A>();
return x == y;
@@ -670,7 +700,7 @@ namespace xsimd
* @return a boolean batch.
*/
template <class T, class A>
- inline auto eq(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x == y)
+ XSIMD_INLINE auto eq(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x == y)
{
detail::static_check_supported_config<T, A>();
return x == y;
@@ -684,7 +714,7 @@ namespace xsimd
* @return the natural exponential of \c x.
*/
template <class T, class A>
- inline batch<T, A> exp(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> exp(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::exp<A>(x, A {});
@@ -698,7 +728,7 @@ namespace xsimd
* @return the base 10 exponential of \c x.
*/
template <class T, class A>
- inline batch<T, A> exp10(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> exp10(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::exp10<A>(x, A {});
@@ -712,7 +742,7 @@ namespace xsimd
* @return the base 2 exponential of \c x.
*/
template <class T, class A>
- inline batch<T, A> exp2(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> exp2(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::exp2<A>(x, A {});
@@ -725,7 +755,7 @@ namespace xsimd
* mask, zeroing the other slots
*/
template <class T, class A>
- inline batch<T, A> expand(batch<T, A> const& x, batch_bool<T, A> const& mask) noexcept
+ XSIMD_INLINE batch<T, A> expand(batch<T, A> const& x, batch_bool<T, A> const& mask) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::expand<A>(x, mask, A {});
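// Illustrative usage sketch for compress/expand as documented above, assuming the
// public <xsimd/xsimd.hpp> header; the predicate is arbitrary:
#include <xsimd/xsimd.hpp>

xsimd::batch<float> pack_positives(xsimd::batch<float> const& x)
{
    auto mask = xsimd::gt(x, xsimd::batch<float>(0.f)); // lanes to keep
    // compress moves the selected lanes to the front and zeroes the rest;
    // expand performs the inverse scatter back into the masked slots.
    return xsimd::compress(x, mask);
}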
@@ -739,7 +769,7 @@ namespace xsimd
* @return the natural exponential of \c x, minus one.
*/
template <class T, class A>
- inline batch<T, A> expm1(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> expm1(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::expm1<A>(x, A {});
@@ -753,7 +783,7 @@ namespace xsimd
* @return the error function of \c x.
*/
template <class T, class A>
- inline batch<T, A> erf(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> erf(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::erf<A>(x, A {});
@@ -767,7 +797,7 @@ namespace xsimd
* @return the error function of \c x.
*/
template <class T, class A>
- inline batch<T, A> erfc(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> erfc(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::erfc<A>(x, A {});
@@ -784,7 +814,7 @@ namespace xsimd
* @return the extracted batch.
*/
template <class T, class A>
- inline batch<T, A> extract_pair(batch<T, A> const& x, batch<T, A> const& y, std::size_t i) noexcept
+ XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& x, batch<T, A> const& y, std::size_t i) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::extract_pair<A>(x, y, i, A {});
@@ -798,7 +828,7 @@ namespace xsimd
* @return the absolute values of \c x.
*/
template <class T, class A>
- inline batch<T, A> fabs(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> fabs(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::abs<A>(x, A {});
@@ -814,7 +844,7 @@ namespace xsimd
* @return the positive difference.
*/
template <class T, class A>
- inline batch<T, A> fdim(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> fdim(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::fdim<A>(x, y, A {});
@@ -829,7 +859,7 @@ namespace xsimd
* @return the batch of largest integer values not greater than \c x.
*/
template <class T, class A>
- inline batch<T, A> floor(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> floor(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::floor<A>(x, A {});
@@ -845,7 +875,7 @@ namespace xsimd
* @return the result of the fused multiply-add operation.
*/
template <class T, class A>
- inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+ XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::fma<A>(x, y, z, A {});
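// Illustrative usage sketch for the fused multiply-add family documented above,
// assuming the public <xsimd/xsimd.hpp> header:
#include <xsimd/xsimd.hpp>

xsimd::batch<double> axpy(xsimd::batch<double> const& a,
                          xsimd::batch<double> const& x,
                          xsimd::batch<double> const& y)
{
    // Computes a * x + y, fused into a single rounding step where the target
    // architecture supports it; fms/fnma/fnms follow the same pattern.
    return xsimd::fma(a, x, y);
}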
@@ -860,7 +890,7 @@ namespace xsimd
* @return a batch of the larger values.
*/
template <class T, class A>
- inline batch<T, A> fmax(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> fmax(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::max<A>(x, y, A {});
@@ -875,7 +905,7 @@ namespace xsimd
* @return a batch of the smaller values.
*/
template <class T, class A>
- inline batch<T, A> fmin(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> fmin(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::min<A>(x, y, A {});
@@ -890,7 +920,7 @@ namespace xsimd
* @return the result of the modulo.
*/
template <class T, class A>
- inline batch<T, A> fmod(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> fmod(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::fmod<A>(x, y, A {});
@@ -906,7 +936,7 @@ namespace xsimd
* @return the result of the fused multiply-sub operation.
*/
template <class T, class A>
- inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+ XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::fms<A>(x, y, z, A {});
@@ -922,7 +952,7 @@ namespace xsimd
* @return the result of the fused negated multiply-add operation.
*/
template <class T, class A>
- inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+ XSIMD_INLINE batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::fnma<A>(x, y, z, A {});
@@ -938,7 +968,7 @@ namespace xsimd
* @return the result of the fused negated multiply-sub operation.
*/
template <class T, class A>
- inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+ XSIMD_INLINE batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::fnms<A>(x, y, z, A {});
@@ -953,7 +983,7 @@ namespace xsimd
* @return the normalized fraction of \c x.
*/
template <class T, class A>
- inline batch<T, A> frexp(const batch<T, A>& x, batch<as_integer_t<T>, A>& y) noexcept
+ XSIMD_INLINE batch<T, A> frexp(const batch<T, A>& x, batch<as_integer_t<T>, A>& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::frexp<A>(x, y, A {});
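// Illustrative usage sketch for frexp/ldexp as documented above: splitting a value
// into mantissa and exponent and recombining it (assumes <xsimd/xsimd.hpp>):
#include <xsimd/xsimd.hpp>

xsimd::batch<float> split_and_rebuild(xsimd::batch<float> const& x)
{
    xsimd::batch<xsimd::as_integer_t<float>> e; // receives the exponents
    auto m = xsimd::frexp(x, e);                // x == m * 2^e
    return xsimd::ldexp(m, e);                  // reassembles x
}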
@@ -969,7 +999,7 @@ namespace xsimd
* @return a boolean batch.
*/
template <class T, class A>
- inline batch_bool<T, A> ge(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return x >= y;
@@ -985,7 +1015,7 @@ namespace xsimd
* @return a boolean batch.
*/
template <class T, class A>
- inline batch_bool<T, A> gt(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return x > y;
@@ -1001,7 +1031,7 @@ namespace xsimd
* @return the result of the reduction.
*/
template <class T, class A>
- inline batch<T, A> haddp(batch<T, A> const* row) noexcept
+ XSIMD_INLINE batch<T, A> haddp(batch<T, A> const* row) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::haddp<A>(row, A {});
@@ -1017,7 +1047,7 @@ namespace xsimd
* @return the square root of the sum of the squares of \c x and \c y.
*/
template <class T, class A>
- inline batch<T, A> hypot(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> hypot(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::hypot<A>(x, y, A {});
@@ -1031,7 +1061,7 @@ namespace xsimd
* @return the imaginary part of \c x.
*/
template <class T, class A>
- inline real_batch_type_t<batch<T, A>> imag(batch<T, A> const& x) noexcept
+ XSIMD_INLINE real_batch_type_t<batch<T, A>> imag(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::imag<A>(x, A {});
@@ -1045,7 +1075,7 @@ namespace xsimd
* @return the sum of \c x and 1.
*/
template <class T, class A>
- inline batch<T, A> incr(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> incr(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::incr<A>(x, A {});
@@ -1061,7 +1091,7 @@ namespace xsimd
* @return the sum of \c x and 1 when \c mask is true.
*/
template <class T, class A, class Mask>
- inline batch<T, A> incr_if(batch<T, A> const& x, Mask const& mask) noexcept
+ XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& x, Mask const& mask) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::incr_if<A>(x, mask, A {});
@@ -1074,7 +1104,7 @@ namespace xsimd
* @return a batch of positive infinity
*/
template <class B>
- inline B infinity()
+ XSIMD_INLINE B infinity()
{
using T = typename B::value_type;
using A = typename B::arch_type;
@@ -1092,7 +1122,7 @@ namespace xsimd
* @return copy of \c x with position \c pos set to \c val
*/
template <class T, class A, size_t I>
- inline batch<T, A> insert(batch<T, A> const& x, T val, index<I> pos) noexcept
+ XSIMD_INLINE batch<T, A> insert(batch<T, A> const& x, T val, index<I> pos) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::insert<A>(x, val, pos, A {});
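// Illustrative usage sketch for insert as documented above: overwriting a single
// lane through a compile-time index (assumes <xsimd/xsimd.hpp>):
#include <xsimd/xsimd.hpp>

xsimd::batch<float> zero_first_lane(xsimd::batch<float> const& x)
{
    // Returns a copy of x with lane 0 replaced by 0.f; the other lanes are unchanged.
    return xsimd::insert(x, 0.f, xsimd::index<0>());
}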
@@ -1106,7 +1136,7 @@ namespace xsimd
* @return a batch of booleans.
*/
template <class T, class A>
- inline batch_bool<T, A> is_even(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch_bool<T, A> is_even(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::is_even<A>(x, A {});
@@ -1120,7 +1150,7 @@ namespace xsimd
* @return a batch of booleans.
*/
template <class T, class A>
- inline batch_bool<T, A> is_flint(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch_bool<T, A> is_flint(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::is_flint<A>(x, A {});
@@ -1134,7 +1164,7 @@ namespace xsimd
* @return a batch of booleans.
*/
template <class T, class A>
- inline batch_bool<T, A> is_odd(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch_bool<T, A> is_odd(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::is_odd<A>(x, A {});
@@ -1148,7 +1178,7 @@ namespace xsimd
* @return a batch of booleans.
*/
template <class T, class A>
- inline typename batch<T, A>::batch_bool_type isinf(batch<T, A> const& x) noexcept
+ XSIMD_INLINE typename batch<T, A>::batch_bool_type isinf(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::isinf<A>(x, A {});
@@ -1162,7 +1192,7 @@ namespace xsimd
* @return a batch of booleans.
*/
template <class T, class A>
- inline typename batch<T, A>::batch_bool_type isfinite(batch<T, A> const& x) noexcept
+ XSIMD_INLINE typename batch<T, A>::batch_bool_type isfinite(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::isfinite<A>(x, A {});
@@ -1176,7 +1206,7 @@ namespace xsimd
* @return a batch of booleans.
*/
template <class T, class A>
- inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& x) noexcept
+ XSIMD_INLINE typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::isnan<A>(x, A {});
@@ -1191,7 +1221,7 @@ namespace xsimd
* @return \c x multiplied by 2 raised to the power \c y.
*/
template <class T, class A>
- inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& y) noexcept
+ XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::ldexp<A>(x, y, A {});
@@ -1206,7 +1236,7 @@ namespace xsimd
* @return a boolean batch.
*/
template <class T, class A>
- inline batch_bool<T, A> le(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return x <= y;
@@ -1220,7 +1250,7 @@ namespace xsimd
* @return the natural logarithm of the gamma function of \c x.
*/
template <class T, class A>
- inline batch<T, A> lgamma(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> lgamma(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::lgamma<A>(x, A {});
@@ -1235,7 +1265,7 @@ namespace xsimd
* @return a new batch instance
*/
template <class To, class A = default_arch, class From>
- inline simd_return_type<From, To, A> load_as(From const* ptr, aligned_mode) noexcept
+ XSIMD_INLINE simd_return_type<From, To, A> load_as(From const* ptr, aligned_mode) noexcept
{
using batch_value_type = typename simd_return_type<From, To, A>::value_type;
detail::static_check_supported_config<From, A>();
@@ -1244,14 +1274,14 @@ namespace xsimd
}
template <class To, class A = default_arch>
- inline simd_return_type<bool, To, A> load_as(bool const* ptr, aligned_mode) noexcept
+ XSIMD_INLINE simd_return_type<bool, To, A> load_as(bool const* ptr, aligned_mode) noexcept
{
detail::static_check_supported_config<To, A>();
return simd_return_type<bool, To, A>::load_aligned(ptr);
}
template <class To, class A = default_arch, class From>
- inline simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, aligned_mode) noexcept
+ XSIMD_INLINE simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, aligned_mode) noexcept
{
detail::static_check_supported_config<To, A>();
using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
@@ -1260,7 +1290,7 @@ namespace xsimd
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class To, class A = default_arch, class From, bool i3ec>
- inline simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
+ XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
{
detail::static_check_supported_config<To, A>();
detail::static_check_supported_config<From, A>();
@@ -1277,7 +1307,7 @@ namespace xsimd
* @return a new batch instance
*/
template <class To, class A = default_arch, class From>
- inline simd_return_type<From, To, A> load_as(From const* ptr, unaligned_mode) noexcept
+ XSIMD_INLINE simd_return_type<From, To, A> load_as(From const* ptr, unaligned_mode) noexcept
{
using batch_value_type = typename simd_return_type<From, To, A>::value_type;
detail::static_check_supported_config<To, A>();
@@ -1286,13 +1316,13 @@ namespace xsimd
}
template <class To, class A = default_arch>
- inline simd_return_type<bool, To, A> load_as(bool const* ptr, unaligned_mode) noexcept
+ XSIMD_INLINE simd_return_type<bool, To, A> load_as(bool const* ptr, unaligned_mode) noexcept
{
return simd_return_type<bool, To, A>::load_unaligned(ptr);
}
template <class To, class A = default_arch, class From>
- inline simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, unaligned_mode) noexcept
+ XSIMD_INLINE simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, unaligned_mode) noexcept
{
detail::static_check_supported_config<To, A>();
detail::static_check_supported_config<From, A>();
@@ -1302,7 +1332,7 @@ namespace xsimd
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class To, class A = default_arch, class From, bool i3ec>
- inline simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, unaligned_mode) noexcept
+ XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, unaligned_mode) noexcept
{
detail::static_check_supported_config<To, A>();
detail::static_check_supported_config<From, A>();
@@ -1319,7 +1349,7 @@ namespace xsimd
* @return a new batch instance
*/
template <class A = default_arch, class From>
- inline batch<From, A> load(From const* ptr, aligned_mode = {}) noexcept
+ XSIMD_INLINE batch<From, A> load(From const* ptr, aligned_mode = {}) noexcept
{
detail::static_check_supported_config<From, A>();
return load_as<From, A>(ptr, aligned_mode {});
@@ -1334,7 +1364,7 @@ namespace xsimd
* @return a new batch instance
*/
template <class A = default_arch, class From>
- inline batch<From, A> load(From const* ptr, unaligned_mode) noexcept
+ XSIMD_INLINE batch<From, A> load(From const* ptr, unaligned_mode) noexcept
{
detail::static_check_supported_config<From, A>();
return load_as<From, A>(ptr, unaligned_mode {});
@@ -1349,7 +1379,7 @@ namespace xsimd
* @return a new batch instance
*/
template <class A = default_arch, class From>
- inline batch<From, A> load_aligned(From const* ptr) noexcept
+ XSIMD_INLINE batch<From, A> load_aligned(From const* ptr) noexcept
{
detail::static_check_supported_config<From, A>();
return load_as<From, A>(ptr, aligned_mode {});
@@ -1364,7 +1394,7 @@ namespace xsimd
* @return a new batch instance
*/
template <class A = default_arch, class From>
- inline batch<From, A> load_unaligned(From const* ptr) noexcept
+ XSIMD_INLINE batch<From, A> load_unaligned(From const* ptr) noexcept
{
detail::static_check_supported_config<From, A>();
return load_as<From, A>(ptr, unaligned_mode {});
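// Illustrative usage sketch for the load family documented above, assuming the
// public <xsimd/xsimd.hpp> header; the pointer is an arbitrary example:
#include <xsimd/xsimd.hpp>

float sum_one_batch(float const* unaligned_ptr)
{
    // load_unaligned / load(ptr, unaligned_mode{}) make no alignment assumption;
    // load_aligned requires ptr to satisfy the arch's alignment, and
    // load_as<To>(ptr, ...) additionally converts each element to To.
    auto v = xsimd::load_unaligned(unaligned_ptr);
    return xsimd::reduce_add(v);
}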
@@ -1378,7 +1408,7 @@ namespace xsimd
* @return the natural logarithm of \c x.
*/
template <class T, class A>
- inline batch<T, A> log(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> log(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::log<A>(x, A {});
@@ -1391,7 +1421,7 @@ namespace xsimd
* @return the base 2 logarithm of \c x.
*/
template <class T, class A>
- inline batch<T, A> log2(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> log2(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::log2<A>(x, A {});
@@ -1404,7 +1434,7 @@ namespace xsimd
* @return the base 10 logarithm of \c x.
*/
template <class T, class A>
- inline batch<T, A> log10(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> log10(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::log10<A>(x, A {});
@@ -1417,7 +1447,7 @@ namespace xsimd
* @return the natural logarithm of one plus \c x.
*/
template <class T, class A>
- inline batch<T, A> log1p(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> log1p(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::log1p<A>(x, A {});
@@ -1432,7 +1462,7 @@ namespace xsimd
* @return a boolean batch.
*/
template <class T, class A>
- inline batch_bool<T, A> lt(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return x < y;
@@ -1447,7 +1477,7 @@ namespace xsimd
* @return a batch of the larger values.
*/
template <class T, class A>
- inline batch<T, A> max(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> max(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::max<A>(x, y, A {});
@@ -1462,7 +1492,7 @@ namespace xsimd
* @return a batch of the smaller values.
*/
template <class T, class A>
- inline batch<T, A> min(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> min(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::min<A>(x, y, A {});
@@ -1475,7 +1505,7 @@ namespace xsimd
* @return a batch of negative infinity
*/
template <class B>
- inline B minusinfinity() noexcept
+ XSIMD_INLINE B minusinfinity() noexcept
{
using T = typename B::value_type;
using A = typename B::arch_type;
@@ -1492,7 +1522,7 @@ namespace xsimd
* @return the result of the modulo.
*/
template <class T, class A>
- inline auto mod(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x % y)
+ XSIMD_INLINE auto mod(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x % y)
{
detail::static_check_supported_config<T, A>();
return x % y;
@@ -1508,7 +1538,7 @@ namespace xsimd
* @return the result of the product.
*/
template <class T, class A>
- inline auto mul(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x * y)
+ XSIMD_INLINE auto mul(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x * y)
{
detail::static_check_supported_config<T, A>();
return x * y;
@@ -1523,7 +1553,7 @@ namespace xsimd
* @return the batch of nearest integer values.
*/
template <class T, class A>
- inline batch<T, A> nearbyint(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::nearbyint<A>(x, A {});
@@ -1540,7 +1570,7 @@ namespace xsimd
* @warning For very large values the conversion to int silently overflows.
*/
template <class T, class A>
- inline batch<as_integer_t<T>, A>
+ XSIMD_INLINE batch<as_integer_t<T>, A>
nearbyint_as_int(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
@@ -1556,7 +1586,7 @@ namespace xsimd
* @return a boolean batch.
*/
template <class T, class A>
- inline auto neq(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x != y)
+ XSIMD_INLINE auto neq(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x != y)
{
detail::static_check_supported_config<T, A>();
return x != y;
@@ -1571,7 +1601,7 @@ namespace xsimd
* @return a boolean batch.
*/
template <class T, class A>
- inline auto neq(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x != y)
+ XSIMD_INLINE auto neq(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x != y)
{
detail::static_check_supported_config<T, A>();
return x != y;
@@ -1585,7 +1615,7 @@ namespace xsimd
* @return the opposite of \c x.
*/
template <class T, class A>
- inline batch<T, A> neg(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> neg(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return -x;
@@ -1601,7 +1631,7 @@ namespace xsimd
* @return the next representable value after \c x in the direction of \c y.
*/
template <class T, class A>
- inline batch<T, A> nextafter(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> nextafter(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::nextafter<A>(x, y, A {});
@@ -1615,7 +1645,7 @@ namespace xsimd
* @return the norm of \c x.
*/
template <class T, class A>
- inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& x) noexcept
+ XSIMD_INLINE real_batch_type_t<batch<T, A>> norm(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::norm(x, A {});
@@ -1630,7 +1660,7 @@ namespace xsimd
* @return \c r exp(i * \c theta).
*/
template <class T, class A>
- inline complex_batch_type_t<batch<T, A>> polar(batch<T, A> const& r, batch<T, A> const& theta = batch<T, A> {}) noexcept
+ XSIMD_INLINE complex_batch_type_t<batch<T, A>> polar(batch<T, A> const& r, batch<T, A> const& theta = batch<T, A> {}) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::polar<A>(r, theta, A {});
@@ -1644,7 +1674,7 @@ namespace xsimd
* @return \c x.
*/
template <class T, class A>
- inline batch<T, A> pos(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> pos(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return +x;
@@ -1660,7 +1690,7 @@ namespace xsimd
* @return \c x raised to the power \c y.
*/
template <class T, class A>
- inline batch<T, A> pow(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> pow(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::pow<A>(x, y, A {});
@@ -1676,7 +1706,7 @@ namespace xsimd
* @return \c x raised to the power \c y.
*/
template <class T, class ITy, class A, class = typename std::enable_if<std::is_integral<ITy>::value, void>::type>
- inline batch<T, A> pow(batch<T, A> const& x, ITy y) noexcept
+ XSIMD_INLINE batch<T, A> pow(batch<T, A> const& x, ITy y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::ipow<A>(x, y, A {});
@@ -1690,7 +1720,7 @@ namespace xsimd
* @return the projection of \c z.
*/
template <class T, class A>
- inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& z) noexcept
+ XSIMD_INLINE complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& z) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::proj(z, A {});
@@ -1704,7 +1734,7 @@ namespace xsimd
* @return the real part of \c z.
*/
template <class T, class A>
- inline real_batch_type_t<batch<T, A>> real(batch<T, A> const& z) noexcept
+ XSIMD_INLINE real_batch_type_t<batch<T, A>> real(batch<T, A> const& z) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::real<A>(z, A {});
@@ -1720,7 +1750,7 @@ namespace xsimd
* @return the reciprocal.
*/
template <class T, class A, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
- inline batch<T, A> reciprocal(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> reciprocal(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::reciprocal(x, A {});
@@ -1735,7 +1765,7 @@ namespace xsimd
* @return the result of the reduction, as a scalar.
*/
template <class T, class A, class F>
- inline T reduce(F&& f, batch<T, A> const& x) noexcept
+ XSIMD_INLINE T reduce(F&& f, batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::detail::reduce(std::forward<F>(f), x, std::integral_constant<unsigned, batch<T, A>::size>());
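// Illustrative usage sketch for the reduction helpers documented above
// (assumes <xsimd/xsimd.hpp>):
#include <xsimd/xsimd.hpp>

float horizontal_stats(xsimd::batch<float> const& x, float& lo, float& hi)
{
    lo = xsimd::reduce_min(x);
    hi = xsimd::reduce_max(x);
    // The generic overload folds the lanes with a user-supplied binary functor
    // operating on batches; this yields the same value as xsimd::reduce_add(x).
    return xsimd::reduce(
        [](xsimd::batch<float> const& a, xsimd::batch<float> const& b) { return a + b; }, x);
}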
@@ -1749,7 +1779,7 @@ namespace xsimd
* @return the result of the reduction.
*/
template <class T, class A>
- inline T reduce_add(batch<T, A> const& x) noexcept
+ XSIMD_INLINE T reduce_add(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::reduce_add<A>(x, A {});
@@ -1763,7 +1793,7 @@ namespace xsimd
* @return the result of the reduction.
*/
template <class T, class A>
- inline T reduce_max(batch<T, A> const& x) noexcept
+ XSIMD_INLINE T reduce_max(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::reduce_max<A>(x, A {});
@@ -1777,7 +1807,7 @@ namespace xsimd
* @return the result of the reduction.
*/
template <class T, class A>
- inline T reduce_min(batch<T, A> const& x) noexcept
+ XSIMD_INLINE T reduce_min(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::reduce_min<A>(x, A {});
@@ -1792,7 +1822,7 @@ namespace xsimd
* @return the remainder of the division of \c x by \c y.
*/
template <class T, class A>
- inline batch<T, A> remainder(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> remainder(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::remainder<A>(x, y, A {});
@@ -1807,7 +1837,7 @@ namespace xsimd
* @return the batch of rounded values.
*/
template <class T, class A>
- inline batch<T, A> rint(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> rint(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return nearbyint(x);
@@ -1825,7 +1855,7 @@ namespace xsimd
* @return rotated batch.
*/
template <size_t N, class T, class A>
- inline batch<T, A> rotate_left(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> rotate_left(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::rotate_left<N, A>(x, A {});
@@ -1843,7 +1873,7 @@ namespace xsimd
* @return rotated batch.
*/
template <size_t N, class T, class A>
- inline batch<T, A> rotate_right(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::rotate_right<N, A>(x, A {});
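// Illustrative usage sketch for rotate_left / rotate_right documented above; the
// rotation amount is a compile-time template parameter (assumes <xsimd/xsimd.hpp>):
#include <cstdint>
#include <xsimd/xsimd.hpp>

xsimd::batch<std::int32_t> rotate_by_one(xsimd::batch<std::int32_t> const& x)
{
    // Rotates the lanes of x by one position with wrap-around; rotate_right<1>
    // rotates in the opposite direction.
    return xsimd::rotate_left<1>(x);
}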
@@ -1859,13 +1889,13 @@ namespace xsimd
* @return rotated \c x.
*/
template <class T, class A>
- inline batch<T, A> rotl(batch<T, A> const& x, int shift) noexcept
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& x, int shift) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::rotl<A>(x, shift, A {});
}
template <class T, class A>
- inline batch<T, A> rotl(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& x, batch<T, A> const& shift) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::rotl<A>(x, shift, A {});
@@ -1881,13 +1911,13 @@ namespace xsimd
* @return rotated \c x.
*/
template <class T, class A>
- inline batch<T, A> rotr(batch<T, A> const& x, int shift) noexcept
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& x, int shift) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::rotr<A>(x, shift, A {});
}
template <class T, class A>
- inline batch<T, A> rotr(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& x, batch<T, A> const& shift) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::rotr<A>(x, shift, A {});
@@ -1903,7 +1933,7 @@ namespace xsimd
* @return the batch of nearest integer values.
*/
template <class T, class A>
- inline batch<T, A> round(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> round(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::round<A>(x, A {});
@@ -1921,7 +1951,7 @@ namespace xsimd
* @return the inverse square root of \c x.
*/
template <class T, class A>
- inline batch<T, A> rsqrt(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::rsqrt<A>(x, A {});
@@ -1938,7 +1968,7 @@ namespace xsimd
* @return the result of the saturated addition.
*/
template <class T, class A>
- inline batch<T, A> sadd(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::sadd<A>(x, y, A {});
@@ -1959,7 +1989,7 @@ namespace xsimd
* @return the result of the selection.
*/
template <class T, class A>
- inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::select<A>(cond, true_br, false_br, A {});
@@ -1980,7 +2010,7 @@ namespace xsimd
* @return the result of the selection.
*/
template <class T, class A>
- inline batch<std::complex<T>, A> select(batch_bool<T, A> const& cond, batch<std::complex<T>, A> const& true_br, batch<std::complex<T>, A> const& false_br) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> select(batch_bool<T, A> const& cond, batch<std::complex<T>, A> const& true_br, batch<std::complex<T>, A> const& false_br) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::select<A>(cond, true_br, false_br, A {});
@@ -2001,7 +2031,7 @@ namespace xsimd
* @return the result of the selection.
*/
template <class T, class A, bool... Values>
- inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
+ XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::select<A>(cond, true_br, false_br, A {});
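// Illustrative usage sketch for the batch_bool_constant overload of select added
// above; note the new template spelling (element type and arch instead of a batch
// type). Assumes <xsimd/xsimd.hpp> and an SSE2 build where a float batch has 4 lanes:
#include <xsimd/xsimd.hpp>

xsimd::batch<float, xsimd::sse2> blend_even_lanes(xsimd::batch<float, xsimd::sse2> const& a,
                                                  xsimd::batch<float, xsimd::sse2> const& b)
{
    // Lanes 0 and 2 come from a, lanes 1 and 3 from b; the mask is known at
    // compile time, so the kernel can lower it to an immediate blend.
    xsimd::batch_bool_constant<float, xsimd::sse2, true, false, true, false> mask;
    return xsimd::select(mask, a, b);
}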
@@ -2017,15 +2047,15 @@ namespace xsimd
* element of \c x and \c y. Each element of the mask indexes the vector that
* would be formed by the concatenation of \c x and \c y. For instance
* \code{.cpp}
- * batch_constant<batch<uint32_t, sse2>, 0, 4, 3, 7>
+ * batch_constant<uint32_t, sse2, 0, 4, 3, 7>
* \endcode
* Picks \c x[0], \c y[0], \c x[3], \c y[3]
*
* @return combined batch
*/
template <class T, class A, class Vt, Vt... Values>
- inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
- shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+ XSIMD_INLINE typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+ shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<Vt, A, Values...> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
detail::static_check_supported_config<T, A>();
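// Illustrative usage sketch for shuffle with the new batch_constant<Vt, A, Values...>
// spelling, mirroring the mask from the comment above (assumes <xsimd/xsimd.hpp>
// and an SSE2 build with 4 lanes):
#include <cstdint>
#include <xsimd/xsimd.hpp>

xsimd::batch<float, xsimd::sse2> interleave_03(xsimd::batch<float, xsimd::sse2> const& x,
                                               xsimd::batch<float, xsimd::sse2> const& y)
{
    // Indices 0-3 select from x, 4-7 from y: picks x[0], y[0], x[3], y[3].
    xsimd::batch_constant<std::uint32_t, xsimd::sse2, 0, 4, 3, 7> mask;
    return xsimd::shuffle(x, y, mask);
}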
@@ -2040,7 +2070,7 @@ namespace xsimd
* @return -1 for each negative element, 0 for each null element and +1 for each positive element
*/
template <class T, class A>
- inline batch<T, A> sign(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> sign(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::sign<A>(x, A {});
@@ -2054,7 +2084,7 @@ namespace xsimd
* @return -1 for each negative element, -1 or +1 for each null element and +1 for each positive element
*/
template <class T, class A>
- inline batch<T, A> signnz(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> signnz(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::signnz<A>(x, A {});
@@ -2068,7 +2098,7 @@ namespace xsimd
* @return the sine of \c x.
*/
template <class T, class A>
- inline batch<T, A> sin(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> sin(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::sin<A>(x, A {});
@@ -2083,7 +2113,7 @@ namespace xsimd
* @return a pair containing the sine then the cosine of batch \c x
*/
template <class T, class A>
- inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& x) noexcept
+ XSIMD_INLINE std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::sincos<A>(x, A {});
@@ -2097,7 +2127,7 @@ namespace xsimd
* @return the hyperbolic sine of \c x.
*/
template <class T, class A>
- inline batch<T, A> sinh(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> sinh(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::sinh<A>(x, A {});
@@ -2114,7 +2144,7 @@ namespace xsimd
* @return slid batch.
*/
template <size_t N, class T, class A>
- inline batch<T, A> slide_left(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x) noexcept
{
static_assert(std::is_integral<T>::value, "can only slide batch of integers");
detail::static_check_supported_config<T, A>();
@@ -2132,7 +2162,7 @@ namespace xsimd
* @return slid batch.
*/
template <size_t N, class T, class A>
- inline batch<T, A> slide_right(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x) noexcept
{
static_assert(std::is_integral<T>::value, "can only slide batch of integers");
detail::static_check_supported_config<T, A>();
@@ -2147,7 +2177,7 @@ namespace xsimd
* @return the square root of \c x.
*/
template <class T, class A>
- inline batch<T, A> sqrt(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::sqrt<A>(x, A {});
@@ -2163,7 +2193,7 @@ namespace xsimd
* @return the result of the saturated difference.
*/
template <class T, class A>
- inline batch<T, A> ssub(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::ssub<A>(x, y, A {});
@@ -2178,26 +2208,29 @@ namespace xsimd
* @param src the batch to copy
*/
template <class To, class A = default_arch, class From>
- inline void store_as(To* dst, batch<From, A> const& src, aligned_mode) noexcept
+ XSIMD_INLINE void store_as(To* dst, batch<From, A> const& src, aligned_mode) noexcept
{
- kernel::store_aligned(dst, src, A {});
+ detail::static_check_supported_config<From, A>();
+ kernel::store_aligned<A>(dst, src, A {});
}
template <class A = default_arch, class From>
- inline void store_as(bool* dst, batch_bool<From, A> const& src, aligned_mode) noexcept
+ XSIMD_INLINE void store_as(bool* dst, batch_bool<From, A> const& src, aligned_mode) noexcept
{
- kernel::store(src, dst, A {});
+ detail::static_check_supported_config<From, A>();
+ kernel::store<A>(src, dst, A {});
}
template <class To, class A = default_arch, class From>
- inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
+ XSIMD_INLINE void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
{
- kernel::store_complex_aligned(dst, src, A {});
+ detail::static_check_supported_config<std::complex<From>, A>();
+ kernel::store_complex_aligned<A>(dst, src, A {});
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class To, class A = default_arch, class From, bool i3ec>
- inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
+ XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
{
store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode());
}
@@ -2212,27 +2245,31 @@ namespace xsimd
* @param src the batch to copy
*/
template <class To, class A = default_arch, class From>
- inline void store_as(To* dst, batch<From, A> const& src, unaligned_mode) noexcept
+ XSIMD_INLINE void store_as(To* dst, batch<From, A> const& src, unaligned_mode) noexcept
{
- kernel::store_unaligned(dst, src, A {});
+ detail::static_check_supported_config<From, A>();
+ kernel::store_unaligned<A>(dst, src, A {});
}
template <class A = default_arch, class From>
- inline void store_as(bool* dst, batch_bool<From, A> const& src, unaligned_mode) noexcept
+ XSIMD_INLINE void store_as(bool* dst, batch_bool<From, A> const& src, unaligned_mode) noexcept
{
- kernel::store(src, dst, A {});
+ detail::static_check_supported_config<From, A>();
+ kernel::store<A>(src, dst, A {});
}
template <class To, class A = default_arch, class From>
- inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
+ XSIMD_INLINE void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
{
- kernel::store_complex_unaligned(dst, src, A {});
+ detail::static_check_supported_config<std::complex<From>, A>();
+ kernel::store_complex_unaligned<A>(dst, src, A {});
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class To, class A = default_arch, class From, bool i3ec>
- inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
+ XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
{
+ detail::static_check_supported_config<std::complex<From>, A>();
store_as(reinterpret_cast<std::complex<To>*>(dst), src, unaligned_mode());
}
#endif
@@ -2246,7 +2283,7 @@ namespace xsimd
* @param val the batch to copy from
*/
template <class A, class T>
- inline void store(T* mem, batch<T, A> const& val, aligned_mode = {}) noexcept
+ XSIMD_INLINE void store(T* mem, batch<T, A> const& val, aligned_mode = {}) noexcept
{
store_as<T, A>(mem, val, aligned_mode {});
}
@@ -2260,7 +2297,7 @@ namespace xsimd
* @param val the batch to copy from
*/
template <class A, class T>
- inline void store(T* mem, batch<T, A> const& val, unaligned_mode) noexcept
+ XSIMD_INLINE void store(T* mem, batch<T, A> const& val, unaligned_mode) noexcept
{
store_as<T, A>(mem, val, unaligned_mode {});
}
@@ -2274,7 +2311,7 @@ namespace xsimd
* @param val the batch to copy from
*/
template <class A, class T>
- inline void store_aligned(T* mem, batch<T, A> const& val) noexcept
+ XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& val) noexcept
{
store_as<T, A>(mem, val, aligned_mode {});
}
@@ -2288,7 +2325,7 @@ namespace xsimd
* @param val the batch to copy
*/
template <class A, class T>
- inline void store_unaligned(T* mem, batch<T, A> const& val) noexcept
+ XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& val) noexcept
{
store_as<T, A>(mem, val, unaligned_mode {});
}
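// Illustrative usage sketch for the store family documented above, assuming the
// public <xsimd/xsimd.hpp> header; destination pointer is an arbitrary example:
#include <xsimd/xsimd.hpp>

void write_back(float* dst_unaligned, xsimd::batch<float> const& v)
{
    // store_unaligned / store(dst, v, unaligned_mode{}) make no alignment assumption;
    // store_aligned requires an aligned destination, and store_as<To>(dst, v, ...)
    // additionally converts each element to To.
    xsimd::store_unaligned(dst_unaligned, v);
}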
@@ -2303,7 +2340,7 @@ namespace xsimd
* @return the difference between \c x and \c y
*/
template <class T, class A>
- inline auto sub(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x - y)
+ XSIMD_INLINE auto sub(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x - y)
{
detail::static_check_supported_config<T, A>();
return x - y;
@@ -2319,15 +2356,15 @@ namespace xsimd
* @return swizzled batch
*/
template <class T, class A, class Vt, Vt... Values>
- inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
- swizzle(batch<T, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+ XSIMD_INLINE typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+ swizzle(batch<T, A> const& x, batch_constant<Vt, A, Values...> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
detail::static_check_supported_config<T, A>();
return kernel::swizzle<A>(x, mask, A {});
}
template <class T, class A, class Vt, Vt... Values>
- inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<Vt, A, Values...> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
detail::static_check_supported_config<T, A>();
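// Illustrative usage sketch for swizzle with the new batch_constant<Vt, A, Values...>
// mask (assumes <xsimd/xsimd.hpp>, an SSE2 build, and 4 float lanes):
#include <cstdint>
#include <xsimd/xsimd.hpp>

xsimd::batch<float, xsimd::sse2> reverse_lanes(xsimd::batch<float, xsimd::sse2> const& x)
{
    // Each mask value indexes a lane of x; this permutation reverses the batch.
    xsimd::batch_constant<std::uint32_t, xsimd::sse2, 3, 2, 1, 0> mask;
    return xsimd::swizzle(x, mask);
}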
@@ -2344,7 +2381,7 @@ namespace xsimd
* @return swizzled batch
*/
template <class T, class A, class Vt>
- inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+ XSIMD_INLINE typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
swizzle(batch<T, A> const& x, batch<Vt, A> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
@@ -2353,7 +2390,7 @@ namespace xsimd
}
template <class T, class A, class Vt>
- inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch<Vt, A> mask) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch<Vt, A> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
detail::static_check_supported_config<T, A>();
@@ -2368,7 +2405,7 @@ namespace xsimd
* @return the tangent of \c x.
*/
template <class T, class A>
- inline batch<T, A> tan(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> tan(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::tan<A>(x, A {});
@@ -2382,7 +2419,7 @@ namespace xsimd
* @return the hyperbolic tangent of \c x.
*/
template <class T, class A>
- inline batch<T, A> tanh(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> tanh(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::tanh<A>(x, A {});
@@ -2396,7 +2433,7 @@ namespace xsimd
* @return the gamma function of \c x.
*/
template <class T, class A>
- inline batch<T, A> tgamma(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> tgamma(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::tgamma<A>(x, A {});
@@ -2411,7 +2448,7 @@ namespace xsimd
* @return \c i converted to a value of a floating point type of the same size as \c T
*/
template <class T, class A>
- inline batch<as_float_t<T>, A> to_float(batch<T, A> const& i) noexcept
+ XSIMD_INLINE batch<as_float_t<T>, A> to_float(batch<T, A> const& i) noexcept
{
detail::static_check_supported_config<T, A>();
return batch_cast<as_float_t<T>>(i);
@@ -2426,7 +2463,7 @@ namespace xsimd
* @return \c x converted to a value of an integer type of the same size as \c T
*/
template <class T, class A>
- inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<as_integer_t<T>, A> to_int(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return batch_cast<as_integer_t<T>>(x);
@@ -2441,7 +2478,7 @@ namespace xsimd
* @return the batch of nearest integer values not greater in magnitude than \c x.
*/
template <class T, class A>
- inline batch<T, A> trunc(batch<T, A> const& x) noexcept
+ XSIMD_INLINE batch<T, A> trunc(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::trunc<A>(x, A {});
@@ -2457,7 +2494,7 @@ namespace xsimd
* @return a batch of the high part of shuffled values.
*/
template <class T, class A>
- inline batch<T, A> zip_hi(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::zip_hi<A>(x, y, A {});
@@ -2473,7 +2510,7 @@ namespace xsimd
* @return a batch of the low part of shuffled values.
*/
template <class T, class A>
- inline batch<T, A> zip_lo(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& x, batch<T, A> const& y) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::zip_lo<A>(x, y, A {});
@@ -2490,7 +2527,7 @@ namespace xsimd
* @return \c self cast to a \c batch of \c T
*/
template <class T, class A, typename std::enable_if<std::is_integral<T>::value, int>::type = 3>
- inline batch<T, A> bitwise_cast(batch_bool<T, A> const& self) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch_bool<T, A> const& self) noexcept
{
T z(0);
detail::static_check_supported_config<T, A>();
@@ -2498,7 +2535,7 @@ namespace xsimd
}
template <class T, class A, typename std::enable_if<std::is_floating_point<T>::value, int>::type = 3>
- inline batch<T, A> bitwise_cast(batch_bool<T, A> const& self) noexcept
+ XSIMD_INLINE batch<T, A> bitwise_cast(batch_bool<T, A> const& self) noexcept
{
T z0(0), z1(0);
using int_type = as_unsigned_integer_t<T>;
@@ -2517,7 +2554,7 @@ namespace xsimd
* @return a boolean scalar.
*/
template <class T, class A>
- inline bool all(batch_bool<T, A> const& x) noexcept
+ XSIMD_INLINE bool all(batch_bool<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::all<A>(x, A {});
@@ -2532,7 +2569,7 @@ namespace xsimd
* @return a boolean scalar.
*/
template <class T, class A>
- inline bool any(batch_bool<T, A> const& x) noexcept
+ XSIMD_INLINE bool any(batch_bool<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::any<A>(x, A {});
@@ -2547,7 +2584,7 @@ namespace xsimd
* @return a boolean scalar.
*/
template <class T, class A>
- inline bool none(batch_bool<T, A> const& x) noexcept
+ XSIMD_INLINE bool none(batch_bool<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return !xsimd::any(x);
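// Illustrative usage sketch for the boolean reductions documented above
// (assumes <xsimd/xsimd.hpp>):
#include <xsimd/xsimd.hpp>

bool any_nan(xsimd::batch<float> const& x)
{
    auto mask = xsimd::isnan(x);
    // all(mask): every lane set; any(mask): at least one lane set; none(mask): no lane set.
    return xsimd::any(mask);
}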
@@ -2562,7 +2599,7 @@ namespace xsimd
* @return a reference to \c o
*/
template <class T, class A>
- inline std::ostream& operator<<(std::ostream& o, batch<T, A> const& x) noexcept
+ XSIMD_INLINE std::ostream& operator<<(std::ostream& o, batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
constexpr auto size = batch<T, A>::size;
@@ -2583,7 +2620,7 @@ namespace xsimd
* @return a reference to \c o
*/
template <class T, class A>
- inline std::ostream& operator<<(std::ostream& o, batch_bool<T, A> const& x) noexcept
+ XSIMD_INLINE std::ostream& operator<<(std::ostream& o, batch_bool<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
constexpr auto size = batch_bool<T, A>::size;
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx2_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx2_register.hpp
index cd10383e2bc..264b7c3eda9 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx2_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx2_register.hpp
@@ -25,7 +25,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(2, 2, 0); }
static constexpr char const* name() noexcept { return "avx2"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512bw_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512bw_register.hpp
index 15c19832ae8..9d4d33b64eb 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512bw_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512bw_register.hpp
@@ -26,7 +26,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512BW; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(3, 4, 0); }
static constexpr char const* name() noexcept { return "avx512bw"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512cd_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512cd_register.hpp
index 29efca368ce..cf060139557 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512cd_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512cd_register.hpp
@@ -26,7 +26,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512CD; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(3, 2, 0); }
static constexpr char const* name() noexcept { return "avx512cd"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512dq_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512dq_register.hpp
index 25a255ec157..f8a8dc54343 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512dq_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512dq_register.hpp
@@ -26,7 +26,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512DQ; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(3, 3, 0); }
static constexpr char const* name() noexcept { return "avx512dq"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512er_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512er_register.hpp
index a99157cf372..a52bd0064e2 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512er_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512er_register.hpp
@@ -26,7 +26,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512ER; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(3, 3, 1); }
static constexpr char const* name() noexcept { return "avx512er"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512f_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512f_register.hpp
index c1f80a122dd..1a11b6c92ab 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512f_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512f_register.hpp
@@ -26,7 +26,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(3, 1, 0); }
static constexpr std::size_t alignment() noexcept { return 64; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr char const* name() noexcept { return "avx512f"; }
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512ifma_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512ifma_register.hpp
index ba76ea147bf..a8bc8885fb4 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512ifma_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512ifma_register.hpp
@@ -26,7 +26,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512IFMA; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(3, 5, 0); }
static constexpr char const* name() noexcept { return "avx512ifma"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512pf_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512pf_register.hpp
index 38a10f02273..4838a8a461e 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512pf_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512pf_register.hpp
@@ -26,7 +26,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512PF; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(3, 4, 1); }
static constexpr char const* name() noexcept { return "avx512pf"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vbmi_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vbmi_register.hpp
index 19ff744d720..40f51e9b192 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vbmi_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vbmi_register.hpp
@@ -26,7 +26,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VBMI; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(3, 6, 0); }
static constexpr char const* name() noexcept { return "avx512vbmi"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp
index 85edbdf230c..a19b949f8bc 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp
@@ -29,7 +29,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI_AVX512BW; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(3, 4, 1); }
static constexpr char const* name() noexcept { return "avx512vnni+avx512bw"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp
index 232b19a5cb8..0a6b45f76cd 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp
@@ -29,7 +29,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI_AVX512VBMI; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(3, 6, 1); }
static constexpr char const* name() noexcept { return "avx512vnni+avx512vbmi"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx_register.hpp
index 6b1951f964b..7357304d5d6 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avx_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avx_register.hpp
@@ -26,7 +26,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(2, 1, 0); }
static constexpr std::size_t alignment() noexcept { return 32; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr char const* name() noexcept { return "avx"; }
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_avxvnni_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_avxvnni_register.hpp
index f68fe16bad2..419547b1cf4 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_avxvnni_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_avxvnni_register.hpp
@@ -25,7 +25,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVXVNNI; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(2, 3, 0); }
static constexpr char const* name() noexcept { return "avxvnni"; }
};
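
Each of the extension register headers above drops its per-extension version() member. Judging from the removed lines, version() packed a (major, minor, patch) triple into a single unsigned through generic::version() so that architecture levels could be ordered numerically; the sketch below shows that likely shape. The 10000/100/1 packing factors are an assumption and appear nowhere in this diff.

    // hypothetical reconstruction of the removed helper, for illustration only
    constexpr unsigned version_sketch(unsigned major, unsigned minor, unsigned patch) noexcept
    {
        return major * 10000u + minor * 100u + patch; // assumed packing; yields a total order
    }
    static_assert(version_sketch(3, 6, 1) > version_sketch(3, 4, 1),
                  "later extensions would have ranked higher");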
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_batch.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_batch.hpp
index b4989fc88d0..898f7b5a442 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_batch.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_batch.hpp
@@ -29,38 +29,38 @@ namespace xsimd
template <class T, class A>
struct integral_only_operators
{
- inline batch<T, A>& operator%=(batch<T, A> const& other) noexcept;
- inline batch<T, A>& operator>>=(int32_t other) noexcept;
- inline batch<T, A>& operator>>=(batch<T, A> const& other) noexcept;
- inline batch<T, A>& operator<<=(int32_t other) noexcept;
- inline batch<T, A>& operator<<=(batch<T, A> const& other) noexcept;
+ XSIMD_INLINE batch<T, A>& operator%=(batch<T, A> const& other) noexcept;
+ XSIMD_INLINE batch<T, A>& operator>>=(int32_t other) noexcept;
+ XSIMD_INLINE batch<T, A>& operator>>=(batch<T, A> const& other) noexcept;
+ XSIMD_INLINE batch<T, A>& operator<<=(int32_t other) noexcept;
+ XSIMD_INLINE batch<T, A>& operator<<=(batch<T, A> const& other) noexcept;
/** Shorthand for xsimd::mod() */
- friend inline batch<T, A> operator%(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ friend XSIMD_INLINE batch<T, A> operator%(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
return batch<T, A>(self) %= other;
}
/** Shorthand for xsimd::bitwise_rshift() */
- friend inline batch<T, A> operator>>(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ friend XSIMD_INLINE batch<T, A> operator>>(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
return batch<T, A>(self) >>= other;
}
/** Shorthand for xsimd::bitwise_lshift() */
- friend inline batch<T, A> operator<<(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ friend XSIMD_INLINE batch<T, A> operator<<(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
return batch<T, A>(self) <<= other;
}
/** Shorthand for xsimd::bitwise_rshift() */
- friend inline batch<T, A> operator>>(batch<T, A> const& self, int32_t other) noexcept
+ friend XSIMD_INLINE batch<T, A> operator>>(batch<T, A> const& self, int32_t other) noexcept
{
return batch<T, A>(self) >>= other;
}
/** Shorthand for xsimd::bitwise_lshift() */
- friend inline batch<T, A> operator<<(batch<T, A> const& self, int32_t other) noexcept
+ friend XSIMD_INLINE batch<T, A> operator<<(batch<T, A> const& self, int32_t other) noexcept
{
return batch<T, A>(self) <<= other;
}
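
From this hunk onward, the remainder of xsimd_batch.hpp replaces plain inline with the XSIMD_INLINE macro. The macro's definition is not part of this diff; a plausible sketch, assuming the usual "force inlining where the compiler supports it" idiom, would be:

    // hypothetical definition, not taken from this patch
    #if defined(__GNUC__) || defined(__clang__)
    #define XSIMD_INLINE inline __attribute__((always_inline))
    #elif defined(_MSC_VER)
    #define XSIMD_INLINE __forceinline
    #else
    #define XSIMD_INLINE inline
    #endif

Under such a definition every declaration below keeps its previous linkage; only the strength of the inlining hint changes, so the rename is mechanical and behavior-preserving.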
@@ -82,22 +82,22 @@ namespace xsimd
// with batch<T, A>. Their implementation must appear only once the
// kernel implementations have been included.
template <class T, class A>
- inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other) noexcept;
template <class T, class A>
- inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other) noexcept;
template <class T, class A>
- inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other) noexcept;
template <class T, class A>
- inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other) noexcept;
template <class T, class A>
- inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other) noexcept;
template <class T, class A>
- inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other) noexcept;
}
/**
@@ -123,152 +123,152 @@ namespace xsimd
using batch_bool_type = batch_bool<T, A>; ///< Associated batch type used to represented logical operations on this batch.
// constructors
- inline batch() = default; ///< Create a batch initialized with undefined values.
- inline batch(T val) noexcept;
+ XSIMD_INLINE batch() = default; ///< Create a batch initialized with undefined values.
+ XSIMD_INLINE batch(T val) noexcept;
template <class... Ts>
- inline batch(T val0, T val1, Ts... vals) noexcept;
- inline explicit batch(batch_bool_type const& b) noexcept;
- inline batch(register_type reg) noexcept;
+ XSIMD_INLINE batch(T val0, T val1, Ts... vals) noexcept;
+ XSIMD_INLINE explicit batch(batch_bool_type const& b) noexcept;
+ XSIMD_INLINE batch(register_type reg) noexcept;
template <class U>
- XSIMD_NO_DISCARD static inline batch broadcast(U val) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch broadcast(U val) noexcept;
// memory operators
template <class U>
- inline void store_aligned(U* mem) const noexcept;
+ XSIMD_INLINE void store_aligned(U* mem) const noexcept;
template <class U>
- inline void store_unaligned(U* mem) const noexcept;
+ XSIMD_INLINE void store_unaligned(U* mem) const noexcept;
template <class U>
- inline void store(U* mem, aligned_mode) const noexcept;
+ XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept;
template <class U>
- inline void store(U* mem, unaligned_mode) const noexcept;
+ XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept;
template <class U>
- XSIMD_NO_DISCARD static inline batch load_aligned(U const* mem) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept;
template <class U>
- XSIMD_NO_DISCARD static inline batch load_unaligned(U const* mem) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load_unaligned(U const* mem) noexcept;
template <class U>
- XSIMD_NO_DISCARD static inline batch load(U const* mem, aligned_mode) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept;
template <class U>
- XSIMD_NO_DISCARD static inline batch load(U const* mem, unaligned_mode) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept;
template <class U, class V>
- XSIMD_NO_DISCARD static inline batch gather(U const* src, batch<V, arch_type> const& index) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch gather(U const* src, batch<V, arch_type> const& index) noexcept;
template <class U, class V>
- inline void scatter(U* dst, batch<V, arch_type> const& index) const noexcept;
+ XSIMD_INLINE void scatter(U* dst, batch<V, arch_type> const& index) const noexcept;
- inline T get(std::size_t i) const noexcept;
+ XSIMD_INLINE T get(std::size_t i) const noexcept;
// comparison operators. Defined as friend to enable automatic
// conversion of parameters from scalar to batch, at the cost of using a
// proxy implementation from details::.
- friend inline batch_bool<T, A> operator==(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch_bool<T, A> operator==(batch const& self, batch const& other) noexcept
{
return details::eq<T, A>(self, other);
}
- friend inline batch_bool<T, A> operator!=(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch_bool<T, A> operator!=(batch const& self, batch const& other) noexcept
{
return details::neq<T, A>(self, other);
}
- friend inline batch_bool<T, A> operator>=(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch_bool<T, A> operator>=(batch const& self, batch const& other) noexcept
{
return details::ge<T, A>(self, other);
}
- friend inline batch_bool<T, A> operator<=(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch_bool<T, A> operator<=(batch const& self, batch const& other) noexcept
{
return details::le<T, A>(self, other);
}
- friend inline batch_bool<T, A> operator>(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch_bool<T, A> operator>(batch const& self, batch const& other) noexcept
{
return details::gt<T, A>(self, other);
}
- friend inline batch_bool<T, A> operator<(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch_bool<T, A> operator<(batch const& self, batch const& other) noexcept
{
return details::lt<T, A>(self, other);
}
// Update operators
- inline batch& operator+=(batch const& other) noexcept;
- inline batch& operator-=(batch const& other) noexcept;
- inline batch& operator*=(batch const& other) noexcept;
- inline batch& operator/=(batch const& other) noexcept;
- inline batch& operator&=(batch const& other) noexcept;
- inline batch& operator|=(batch const& other) noexcept;
- inline batch& operator^=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator+=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator-=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator*=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator/=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator&=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator|=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator^=(batch const& other) noexcept;
// incr/decr operators
- inline batch& operator++() noexcept;
- inline batch& operator--() noexcept;
- inline batch operator++(int) noexcept;
- inline batch operator--(int) noexcept;
+ XSIMD_INLINE batch& operator++() noexcept;
+ XSIMD_INLINE batch& operator--() noexcept;
+ XSIMD_INLINE batch operator++(int) noexcept;
+ XSIMD_INLINE batch operator--(int) noexcept;
// unary operators
- inline batch_bool_type operator!() const noexcept;
- inline batch operator~() const noexcept;
- inline batch operator-() const noexcept;
- inline batch operator+() const noexcept;
+ XSIMD_INLINE batch_bool_type operator!() const noexcept;
+ XSIMD_INLINE batch operator~() const noexcept;
+ XSIMD_INLINE batch operator-() const noexcept;
+ XSIMD_INLINE batch operator+() const noexcept;
// arithmetic operators. They are defined as friend to enable automatic
// conversion of parameters from scalar to batch. Inline implementation
// is required to avoid warnings.
/** Shorthand for xsimd::add() */
- friend inline batch operator+(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator+(batch const& self, batch const& other) noexcept
{
return batch(self) += other;
}
/** Shorthand for xsimd::sub() */
- friend inline batch operator-(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator-(batch const& self, batch const& other) noexcept
{
return batch(self) -= other;
}
/** Shorthand for xsimd::mul() */
- friend inline batch operator*(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator*(batch const& self, batch const& other) noexcept
{
return batch(self) *= other;
}
/** Shorthand for xsimd::div() */
- friend inline batch operator/(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator/(batch const& self, batch const& other) noexcept
{
return batch(self) /= other;
}
/** Shorthand for xsimd::bitwise_and() */
- friend inline batch operator&(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator&(batch const& self, batch const& other) noexcept
{
return batch(self) &= other;
}
/** Shorthand for xsimd::bitwise_or() */
- friend inline batch operator|(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator|(batch const& self, batch const& other) noexcept
{
return batch(self) |= other;
}
/** Shorthand for xsimd::bitwise_xor() */
- friend inline batch operator^(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator^(batch const& self, batch const& other) noexcept
{
return batch(self) ^= other;
}
/** Shorthand for xsimd::logical_and() */
- friend inline batch operator&&(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator&&(batch const& self, batch const& other) noexcept
{
return batch(self).logical_and(other);
}
/** Shorthand for xsimd::logical_or() */
- friend inline batch operator||(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator||(batch const& self, batch const& other) noexcept
{
return batch(self).logical_or(other);
}
private:
- inline batch logical_and(batch const& other) const noexcept;
- inline batch logical_or(batch const& other) const noexcept;
+ XSIMD_INLINE batch logical_and(batch const& other) const noexcept;
+ XSIMD_INLINE batch logical_or(batch const& other) const noexcept;
};
template <class T, class A>
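
The declarations above are the public surface of batch<T, A>: broadcasting and element-wise constructors, aligned/unaligned load and store, gather/scatter, per-lane get(), and friend comparison/arithmetic operators that accept a scalar on either side. A minimal usage sketch, assuming the standard xsimd umbrella header and whatever lane count the build target provides:

    #include <xsimd/xsimd.hpp>
    #include <vector>

    // y[i] = a * x[i] + y[i], vectorized body plus scalar tail
    void axpy_sketch(float a, const std::vector<float>& x, std::vector<float>& y)
    {
        using b = xsimd::batch<float>;
        std::size_t i = 0;
        for (; i + b::size <= x.size(); i += b::size)
        {
            b xv = b::load_unaligned(&x[i]);
            b yv = b::load_unaligned(&y[i]);
            // the friend operators convert the scalar a to a batch automatically
            (a * xv + yv).store_unaligned(&y[i]);
        }
        for (; i < x.size(); ++i)
            y[i] = a * x[i] + y[i];
    }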
@@ -297,51 +297,51 @@ namespace xsimd
using batch_type = batch<T, A>; ///< Associated batch type this batch represents logical operations for.
// constructors
- inline batch_bool() = default; ///< Create a batch initialized with undefined values.
- inline batch_bool(bool val) noexcept;
- inline batch_bool(register_type reg) noexcept;
+ XSIMD_INLINE batch_bool() = default; ///< Create a batch initialized with undefined values.
+ XSIMD_INLINE batch_bool(bool val) noexcept;
+ XSIMD_INLINE batch_bool(register_type reg) noexcept;
template <class... Ts>
- inline batch_bool(bool val0, bool val1, Ts... vals) noexcept;
+ XSIMD_INLINE batch_bool(bool val0, bool val1, Ts... vals) noexcept;
template <class Tp>
- inline batch_bool(Tp const*) = delete;
+ XSIMD_INLINE batch_bool(Tp const*) = delete;
// memory operators
- inline void store_aligned(bool* mem) const noexcept;
- inline void store_unaligned(bool* mem) const noexcept;
- XSIMD_NO_DISCARD static inline batch_bool load_aligned(bool const* mem) noexcept;
- XSIMD_NO_DISCARD static inline batch_bool load_unaligned(bool const* mem) noexcept;
+ XSIMD_INLINE void store_aligned(bool* mem) const noexcept;
+ XSIMD_INLINE void store_unaligned(bool* mem) const noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_aligned(bool const* mem) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_unaligned(bool const* mem) noexcept;
- inline bool get(std::size_t i) const noexcept;
+ XSIMD_INLINE bool get(std::size_t i) const noexcept;
// mask operations
- inline uint64_t mask() const noexcept;
- inline static batch_bool from_mask(uint64_t mask) noexcept;
+ XSIMD_INLINE uint64_t mask() const noexcept;
+ XSIMD_INLINE static batch_bool from_mask(uint64_t mask) noexcept;
// comparison operators
- inline batch_bool operator==(batch_bool const& other) const noexcept;
- inline batch_bool operator!=(batch_bool const& other) const noexcept;
+ XSIMD_INLINE batch_bool operator==(batch_bool const& other) const noexcept;
+ XSIMD_INLINE batch_bool operator!=(batch_bool const& other) const noexcept;
// logical operators
- inline batch_bool operator~() const noexcept;
- inline batch_bool operator!() const noexcept;
- inline batch_bool operator&(batch_bool const& other) const noexcept;
- inline batch_bool operator|(batch_bool const& other) const noexcept;
- inline batch_bool operator^(batch_bool const& other) const noexcept;
- inline batch_bool operator&&(batch_bool const& other) const noexcept;
- inline batch_bool operator||(batch_bool const& other) const noexcept;
+ XSIMD_INLINE batch_bool operator~() const noexcept;
+ XSIMD_INLINE batch_bool operator!() const noexcept;
+ XSIMD_INLINE batch_bool operator&(batch_bool const& other) const noexcept;
+ XSIMD_INLINE batch_bool operator|(batch_bool const& other) const noexcept;
+ XSIMD_INLINE batch_bool operator^(batch_bool const& other) const noexcept;
+ XSIMD_INLINE batch_bool operator&&(batch_bool const& other) const noexcept;
+ XSIMD_INLINE batch_bool operator||(batch_bool const& other) const noexcept;
// update operators
- inline batch_bool& operator&=(batch_bool const& other) noexcept { return (*this) = (*this) & other; }
- inline batch_bool& operator|=(batch_bool const& other) noexcept { return (*this) = (*this) | other; }
- inline batch_bool& operator^=(batch_bool const& other) noexcept { return (*this) = (*this) ^ other; }
+ XSIMD_INLINE batch_bool& operator&=(batch_bool const& other) noexcept { return (*this) = (*this) & other; }
+ XSIMD_INLINE batch_bool& operator|=(batch_bool const& other) noexcept { return (*this) = (*this) | other; }
+ XSIMD_INLINE batch_bool& operator^=(batch_bool const& other) noexcept { return (*this) = (*this) ^ other; }
private:
template <class U, class... V, size_t I, size_t... Is>
- static inline register_type make_register(detail::index_sequence<I, Is...>, U u, V... v) noexcept;
+ static XSIMD_INLINE register_type make_register(detail::index_sequence<I, Is...>, U u, V... v) noexcept;
template <class... V>
- static inline register_type make_register(detail::index_sequence<>, V... v) noexcept;
+ static XSIMD_INLINE register_type make_register(detail::index_sequence<>, V... v) noexcept;
};
template <class T, class A>
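
batch_bool<T, A> is the per-lane predicate companion of batch<T, A>; besides the logical operators it exposes a mask()/from_mask() round-trip through a plain uint64_t, one bit per lane. A small sketch of that interface, assuming the same includes as the sketch above:

    bool first_lane_less_sketch()
    {
        xsimd::batch<float> v(1.f), w(2.f);
        xsimd::batch_bool<float> lt = v < w;            // per-lane comparison
        uint64_t bits = lt.mask();                      // one bit per lane
        auto back = xsimd::batch_bool<float>::from_mask(bits);
        return back.get(0);                             // true: every lane holds 1.f < 2.f
    }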
@@ -367,106 +367,106 @@ namespace xsimd
static constexpr std::size_t size = real_batch::size; ///< Number of complex elements in this batch.
// constructors
- inline batch() = default; ///< Create a batch initialized with undefined values.
- inline batch(value_type const& val) noexcept;
- inline batch(real_batch const& real, real_batch const& imag) noexcept;
+ XSIMD_INLINE batch() = default; ///< Create a batch initialized with undefined values.
+ XSIMD_INLINE batch(value_type const& val) noexcept;
+ XSIMD_INLINE batch(real_batch const& real, real_batch const& imag) noexcept;
- inline batch(real_batch const& real) noexcept;
- inline batch(T val) noexcept;
+ XSIMD_INLINE batch(real_batch const& real) noexcept;
+ XSIMD_INLINE batch(T val) noexcept;
template <class... Ts>
- inline batch(value_type val0, value_type val1, Ts... vals) noexcept;
- inline explicit batch(batch_bool_type const& b) noexcept;
+ XSIMD_INLINE batch(value_type val0, value_type val1, Ts... vals) noexcept;
+ XSIMD_INLINE explicit batch(batch_bool_type const& b) noexcept;
template <class U>
- XSIMD_NO_DISCARD static inline batch broadcast(U val) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch broadcast(U val) noexcept;
// memory operators
- XSIMD_NO_DISCARD static inline batch load_aligned(const T* real_src, const T* imag_src = nullptr) noexcept;
- XSIMD_NO_DISCARD static inline batch load_unaligned(const T* real_src, const T* imag_src = nullptr) noexcept;
- inline void store_aligned(T* real_dst, T* imag_dst) const noexcept;
- inline void store_unaligned(T* real_dst, T* imag_dst) const noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(const T* real_src, const T* imag_src = nullptr) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load_unaligned(const T* real_src, const T* imag_src = nullptr) noexcept;
+ XSIMD_INLINE void store_aligned(T* real_dst, T* imag_dst) const noexcept;
+ XSIMD_INLINE void store_unaligned(T* real_dst, T* imag_dst) const noexcept;
- XSIMD_NO_DISCARD static inline batch load_aligned(const value_type* src) noexcept;
- XSIMD_NO_DISCARD static inline batch load_unaligned(const value_type* src) noexcept;
- inline void store_aligned(value_type* dst) const noexcept;
- inline void store_unaligned(value_type* dst) const noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(const value_type* src) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load_unaligned(const value_type* src) noexcept;
+ XSIMD_INLINE void store_aligned(value_type* dst) const noexcept;
+ XSIMD_INLINE void store_unaligned(value_type* dst) const noexcept;
template <class U>
- XSIMD_NO_DISCARD static inline batch load(U const* mem, aligned_mode) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept;
template <class U>
- XSIMD_NO_DISCARD static inline batch load(U const* mem, unaligned_mode) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept;
template <class U>
- inline void store(U* mem, aligned_mode) const noexcept;
+ XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept;
template <class U>
- inline void store(U* mem, unaligned_mode) const noexcept;
+ XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept;
- inline real_batch real() const noexcept;
- inline real_batch imag() const noexcept;
+ XSIMD_INLINE real_batch real() const noexcept;
+ XSIMD_INLINE real_batch imag() const noexcept;
- inline value_type get(std::size_t i) const noexcept;
+ XSIMD_INLINE value_type get(std::size_t i) const noexcept;
#ifdef XSIMD_ENABLE_XTL_COMPLEX
// xtl-related methods
template <bool i3ec>
- inline batch(xtl::xcomplex<T, T, i3ec> const& val) noexcept;
+ XSIMD_INLINE batch(xtl::xcomplex<T, T, i3ec> const& val) noexcept;
template <bool i3ec, class... Ts>
- inline batch(xtl::xcomplex<T, T, i3ec> val0, xtl::xcomplex<T, T, i3ec> val1, Ts... vals) noexcept;
+ XSIMD_INLINE batch(xtl::xcomplex<T, T, i3ec> val0, xtl::xcomplex<T, T, i3ec> val1, Ts... vals) noexcept;
template <bool i3ec>
- XSIMD_NO_DISCARD static inline batch load_aligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept;
template <bool i3ec>
- XSIMD_NO_DISCARD static inline batch load_unaligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept;
+ XSIMD_NO_DISCARD static XSIMD_INLINE batch load_unaligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept;
template <bool i3ec>
- inline void store_aligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept;
+ XSIMD_INLINE void store_aligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept;
template <bool i3ec>
- inline void store_unaligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept;
+ XSIMD_INLINE void store_unaligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept;
#endif
// comparison operators
- inline batch_bool<T, A> operator==(batch const& other) const noexcept;
- inline batch_bool<T, A> operator!=(batch const& other) const noexcept;
+ XSIMD_INLINE batch_bool<T, A> operator==(batch const& other) const noexcept;
+ XSIMD_INLINE batch_bool<T, A> operator!=(batch const& other) const noexcept;
// Update operators
- inline batch& operator+=(batch const& other) noexcept;
- inline batch& operator-=(batch const& other) noexcept;
- inline batch& operator*=(batch const& other) noexcept;
- inline batch& operator/=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator+=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator-=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator*=(batch const& other) noexcept;
+ XSIMD_INLINE batch& operator/=(batch const& other) noexcept;
// incr/decr operators
- inline batch& operator++() noexcept;
- inline batch& operator--() noexcept;
- inline batch operator++(int) noexcept;
- inline batch operator--(int) noexcept;
+ XSIMD_INLINE batch& operator++() noexcept;
+ XSIMD_INLINE batch& operator--() noexcept;
+ XSIMD_INLINE batch operator++(int) noexcept;
+ XSIMD_INLINE batch operator--(int) noexcept;
// unary operators
- inline batch_bool_type operator!() const noexcept;
- inline batch operator~() const noexcept;
- inline batch operator-() const noexcept;
- inline batch operator+() const noexcept;
+ XSIMD_INLINE batch_bool_type operator!() const noexcept;
+ XSIMD_INLINE batch operator~() const noexcept;
+ XSIMD_INLINE batch operator-() const noexcept;
+ XSIMD_INLINE batch operator+() const noexcept;
// arithmetic operators. They are defined as friend to enable automatic
// conversion of parameters from scalar to batch
/** Shorthand for xsimd::add() */
- friend inline batch operator+(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator+(batch const& self, batch const& other) noexcept
{
return batch(self) += other;
}
/** Shorthand for xsimd::sub() */
- friend inline batch operator-(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator-(batch const& self, batch const& other) noexcept
{
return batch(self) -= other;
}
/** Shorthand for xsimd::mul() */
- friend inline batch operator*(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator*(batch const& self, batch const& other) noexcept
{
return batch(self) *= other;
}
/** Shorthand for xsimd::div() */
- friend inline batch operator/(batch const& self, batch const& other) noexcept
+ friend XSIMD_INLINE batch operator/(batch const& self, batch const& other) noexcept
{
return batch(self) /= other;
}
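
The complex specialization above mirrors the real-valued interface but can also move data between an interleaved std::complex array and separate real/imaginary arrays: load_aligned/load_unaligned take an optional second pointer, and store_aligned/store_unaligned have two-pointer overloads. A short sketch, assuming the input spans cover at least one full batch:

    #include <xsimd/xsimd.hpp>
    #include <complex>

    // split real/imag sources -> one interleaved destination batch
    void split_to_interleaved_sketch(const float* re, const float* im, std::complex<float>* out)
    {
        using cb = xsimd::batch<std::complex<float>>;
        cb z = cb::load_unaligned(re, im); // second pointer feeds the imaginary lanes
        z.store_unaligned(out);            // value_type* overload stores interleaved
    }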
@@ -500,7 +500,7 @@ namespace xsimd
* Create a batch with all element initialized to \c val.
*/
template <class T, class A>
- inline batch<T, A>::batch(T val) noexcept
+ XSIMD_INLINE batch<T, A>::batch(T val) noexcept
: types::simd_register<T, A>(kernel::broadcast<A>(val, A {}))
{
detail::static_check_supported_config<T, A>();
@@ -512,7 +512,7 @@ namespace xsimd
*/
template <class T, class A>
template <class... Ts>
- inline batch<T, A>::batch(T val0, T val1, Ts... vals) noexcept
+ XSIMD_INLINE batch<T, A>::batch(T val0, T val1, Ts... vals) noexcept
: batch(kernel::set<A>(batch {}, A {}, val0, val1, static_cast<T>(vals)...))
{
detail::static_check_supported_config<T, A>();
@@ -525,7 +525,7 @@ namespace xsimd
* (resp. `false`).
*/
template <class T, class A>
- inline batch<T, A>::batch(batch_bool<T, A> const& b) noexcept
+ XSIMD_INLINE batch<T, A>::batch(batch_bool<T, A> const& b) noexcept
: batch(kernel::from_bool(b, A {}))
{
}
@@ -535,7 +535,7 @@ namespace xsimd
* becomes handy when doing architecture-specific operations.
*/
template <class T, class A>
- inline batch<T, A>::batch(register_type reg) noexcept
+ XSIMD_INLINE batch<T, A>::batch(register_type reg) noexcept
: types::simd_register<T, A>({ reg })
{
detail::static_check_supported_config<T, A>();
@@ -546,7 +546,7 @@ namespace xsimd
*/
template <class T, class A>
template <class U>
- XSIMD_NO_DISCARD inline batch<T, A> batch<T, A>::broadcast(U val) noexcept
+ XSIMD_NO_DISCARD XSIMD_INLINE batch<T, A> batch<T, A>::broadcast(U val) noexcept
{
detail::static_check_supported_config<T, A>();
return batch(static_cast<T>(val));
@@ -562,7 +562,7 @@ namespace xsimd
*/
template <class T, class A>
template <class U>
- inline void batch<T, A>::store_aligned(U* mem) const noexcept
+ XSIMD_INLINE void batch<T, A>::store_aligned(U* mem) const noexcept
{
detail::static_check_supported_config<T, A>();
assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
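
The assertion above is the contract of every *_aligned entry point: the pointer must be a multiple of A::alignment(). One way to satisfy it for a small scratch buffer, mirroring the alignas pattern this same file uses later in batch_bool::load_aligned:

    void store_to_scratch_sketch(const xsimd::batch<float>& v)
    {
        using arch = xsimd::batch<float>::arch_type;
        alignas(arch::alignment()) float buf[xsimd::batch<float>::size]; // satisfies the assert
        v.store_aligned(buf);
    }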
@@ -576,7 +576,7 @@ namespace xsimd
*/
template <class T, class A>
template <class U>
- inline void batch<T, A>::store_unaligned(U* mem) const noexcept
+ XSIMD_INLINE void batch<T, A>::store_unaligned(U* mem) const noexcept
{
detail::static_check_supported_config<T, A>();
kernel::store_unaligned<A>(mem, *this, A {});
@@ -587,7 +587,7 @@ namespace xsimd
*/
template <class T, class A>
template <class U>
- inline void batch<T, A>::store(U* mem, aligned_mode) const noexcept
+ XSIMD_INLINE void batch<T, A>::store(U* mem, aligned_mode) const noexcept
{
detail::static_check_supported_config<T, A>();
return store_aligned(mem);
@@ -598,7 +598,7 @@ namespace xsimd
*/
template <class T, class A>
template <class U>
- inline void batch<T, A>::store(U* mem, unaligned_mode) const noexcept
+ XSIMD_INLINE void batch<T, A>::store(U* mem, unaligned_mode) const noexcept
{
detail::static_check_supported_config<T, A>();
return store_unaligned(mem);
@@ -610,7 +610,7 @@ namespace xsimd
*/
template <class T, class A>
template <class U>
- inline batch<T, A> batch<T, A>::load_aligned(U const* mem) noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::load_aligned(U const* mem) noexcept
{
assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
&& "loaded pointer is not properly aligned");
@@ -624,7 +624,7 @@ namespace xsimd
*/
template <class T, class A>
template <class U>
- inline batch<T, A> batch<T, A>::load_unaligned(U const* mem) noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::load_unaligned(U const* mem) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::load_unaligned<A>(mem, kernel::convert<T> {}, A {});
@@ -635,7 +635,7 @@ namespace xsimd
*/
template <class T, class A>
template <class U>
- inline batch<T, A> batch<T, A>::load(U const* mem, aligned_mode) noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::load(U const* mem, aligned_mode) noexcept
{
detail::static_check_supported_config<T, A>();
return load_aligned(mem);
@@ -646,7 +646,7 @@ namespace xsimd
*/
template <class T, class A>
template <class U>
- inline batch<T, A> batch<T, A>::load(U const* mem, unaligned_mode) noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::load(U const* mem, unaligned_mode) noexcept
{
detail::static_check_supported_config<T, A>();
return load_unaligned(mem);
@@ -660,7 +660,7 @@ namespace xsimd
*/
template <class T, class A>
template <typename U, typename V>
- inline batch<T, A> batch<T, A>::gather(U const* src, batch<V, A> const& index) noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::gather(U const* src, batch<V, A> const& index) noexcept
{
detail::static_check_supported_config<T, A>();
static_assert(std::is_convertible<T, U>::value, "Can't convert from src to this batch's type!");
@@ -675,7 +675,7 @@ namespace xsimd
*/
template <class T, class A>
template <class U, class V>
- inline void batch<T, A>::scatter(U* dst, batch<V, A> const& index) const noexcept
+ XSIMD_INLINE void batch<T, A>::scatter(U* dst, batch<V, A> const& index) const noexcept
{
detail::static_check_supported_config<T, A>();
static_assert(std::is_convertible<T, U>::value, "Can't convert from this batch's type to dst!");
@@ -688,7 +688,7 @@ namespace xsimd
* \c warning This is very inefficient and should only be used for debugging purpose.
*/
template <class T, class A>
- inline T batch<T, A>::get(std::size_t i) const noexcept
+ XSIMD_INLINE T batch<T, A>::get(std::size_t i) const noexcept
{
return kernel::get(*this, i, A {});
}
@@ -702,7 +702,7 @@ namespace xsimd
* Shorthand for xsimd::eq()
*/
template <class T, class A>
- inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::eq<A>(self, other, A {});
@@ -712,7 +712,7 @@ namespace xsimd
* Shorthand for xsimd::neq()
*/
template <class T, class A>
- inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::neq<A>(self, other, A {});
@@ -722,7 +722,7 @@ namespace xsimd
* Shorthand for xsimd::ge()
*/
template <class T, class A>
- inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::ge<A>(self, other, A {});
@@ -732,7 +732,7 @@ namespace xsimd
* Shorthand for xsimd::le()
*/
template <class T, class A>
- inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::le<A>(self, other, A {});
@@ -742,7 +742,7 @@ namespace xsimd
* Shorthand for xsimd::gt()
*/
template <class T, class A>
- inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::gt<A>(self, other, A {});
@@ -752,7 +752,7 @@ namespace xsimd
* Shorthand for xsimd::lt()
*/
template <class T, class A>
- inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::lt<A>(self, other, A {});
@@ -764,84 +764,84 @@ namespace xsimd
**************************/
template <class T, class A>
- inline batch<T, A>& batch<T, A>::operator+=(batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A>& batch<T, A>::operator+=(batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return *this = kernel::add<A>(*this, other, A {});
}
template <class T, class A>
- inline batch<T, A>& batch<T, A>::operator-=(batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A>& batch<T, A>::operator-=(batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return *this = kernel::sub<A>(*this, other, A {});
}
template <class T, class A>
- inline batch<T, A>& batch<T, A>::operator*=(batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A>& batch<T, A>::operator*=(batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return *this = kernel::mul<A>(*this, other, A {});
}
template <class T, class A>
- inline batch<T, A>& batch<T, A>::operator/=(batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A>& batch<T, A>::operator/=(batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return *this = kernel::div<A>(*this, other, A {});
}
template <class T, class A>
- inline batch<T, A>& types::integral_only_operators<T, A>::operator%=(batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A>& types::integral_only_operators<T, A>::operator%=(batch<T, A> const& other) noexcept
{
::xsimd::detail::static_check_supported_config<T, A>();
return *static_cast<batch<T, A>*>(this) = kernel::mod<A>(*static_cast<batch<T, A>*>(this), other, A {});
}
template <class T, class A>
- inline batch<T, A>& batch<T, A>::operator&=(batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A>& batch<T, A>::operator&=(batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return *this = kernel::bitwise_and<A>(*this, other, A {});
}
template <class T, class A>
- inline batch<T, A>& batch<T, A>::operator|=(batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A>& batch<T, A>::operator|=(batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return *this = kernel::bitwise_or<A>(*this, other, A {});
}
template <class T, class A>
- inline batch<T, A>& batch<T, A>::operator^=(batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A>& batch<T, A>::operator^=(batch<T, A> const& other) noexcept
{
detail::static_check_supported_config<T, A>();
return *this = kernel::bitwise_xor<A>(*this, other, A {});
}
template <class T, class A>
- inline batch<T, A>& kernel::integral_only_operators<T, A>::operator>>=(batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A>& kernel::integral_only_operators<T, A>::operator>>=(batch<T, A> const& other) noexcept
{
::xsimd::detail::static_check_supported_config<T, A>();
return *static_cast<batch<T, A>*>(this) = kernel::bitwise_rshift<A>(*static_cast<batch<T, A>*>(this), other, A {});
}
template <class T, class A>
- inline batch<T, A>& kernel::integral_only_operators<T, A>::operator<<=(batch<T, A> const& other) noexcept
+ XSIMD_INLINE batch<T, A>& kernel::integral_only_operators<T, A>::operator<<=(batch<T, A> const& other) noexcept
{
::xsimd::detail::static_check_supported_config<T, A>();
return *static_cast<batch<T, A>*>(this) = kernel::bitwise_lshift<A>(*static_cast<batch<T, A>*>(this), other, A {});
}
template <class T, class A>
- inline batch<T, A>& kernel::integral_only_operators<T, A>::operator>>=(int32_t other) noexcept
+ XSIMD_INLINE batch<T, A>& kernel::integral_only_operators<T, A>::operator>>=(int32_t other) noexcept
{
::xsimd::detail::static_check_supported_config<T, A>();
return *static_cast<batch<T, A>*>(this) = kernel::bitwise_rshift<A>(*static_cast<batch<T, A>*>(this), other, A {});
}
template <class T, class A>
- inline batch<T, A>& kernel::integral_only_operators<T, A>::operator<<=(int32_t other) noexcept
+ XSIMD_INLINE batch<T, A>& kernel::integral_only_operators<T, A>::operator<<=(int32_t other) noexcept
{
::xsimd::detail::static_check_supported_config<T, A>();
return *static_cast<batch<T, A>*>(this) = kernel::bitwise_lshift<A>(*static_cast<batch<T, A>*>(this), other, A {});
@@ -852,21 +852,21 @@ namespace xsimd
*****************************/
template <class T, class A>
- inline batch<T, A>& batch<T, A>::operator++() noexcept
+ XSIMD_INLINE batch<T, A>& batch<T, A>::operator++() noexcept
{
detail::static_check_supported_config<T, A>();
return operator+=(1);
}
template <class T, class A>
- inline batch<T, A>& batch<T, A>::operator--() noexcept
+ XSIMD_INLINE batch<T, A>& batch<T, A>::operator--() noexcept
{
detail::static_check_supported_config<T, A>();
return operator-=(1);
}
template <class T, class A>
- inline batch<T, A> batch<T, A>::operator++(int) noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::operator++(int) noexcept
{
detail::static_check_supported_config<T, A>();
batch<T, A> copy(*this);
@@ -875,7 +875,7 @@ namespace xsimd
}
template <class T, class A>
- inline batch<T, A> batch<T, A>::operator--(int) noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::operator--(int) noexcept
{
detail::static_check_supported_config<T, A>();
batch copy(*this);
@@ -888,28 +888,28 @@ namespace xsimd
*************************/
template <class T, class A>
- inline batch_bool<T, A> batch<T, A>::operator!() const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch<T, A>::operator!() const noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::eq<A>(*this, batch(0), A {});
}
template <class T, class A>
- inline batch<T, A> batch<T, A>::operator~() const noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::operator~() const noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_not<A>(*this, A {});
}
template <class T, class A>
- inline batch<T, A> batch<T, A>::operator-() const noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::operator-() const noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::neg<A>(*this, A {});
}
template <class T, class A>
- inline batch<T, A> batch<T, A>::operator+() const noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::operator+() const noexcept
{
detail::static_check_supported_config<T, A>();
return *this;
@@ -920,13 +920,13 @@ namespace xsimd
************************/
template <class T, class A>
- inline batch<T, A> batch<T, A>::logical_and(batch<T, A> const& other) const noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::logical_and(batch<T, A> const& other) const noexcept
{
return kernel::logical_and<A>(*this, other, A());
}
template <class T, class A>
- inline batch<T, A> batch<T, A>::logical_or(batch<T, A> const& other) const noexcept
+ XSIMD_INLINE batch<T, A> batch<T, A>::logical_or(batch<T, A> const& other) const noexcept
{
return kernel::logical_or<A>(*this, other, A());
}
@@ -936,14 +936,14 @@ namespace xsimd
***************************/
template <class T, class A>
- inline batch_bool<T, A>::batch_bool(register_type reg) noexcept
+ XSIMD_INLINE batch_bool<T, A>::batch_bool(register_type reg) noexcept
: types::get_bool_simd_register_t<T, A>({ reg })
{
}
template <class T, class A>
template <class... Ts>
- inline batch_bool<T, A>::batch_bool(bool val0, bool val1, Ts... vals) noexcept
+ XSIMD_INLINE batch_bool<T, A>::batch_bool(bool val0, bool val1, Ts... vals) noexcept
: batch_bool(kernel::set<A>(batch_bool {}, A {}, val0, val1, static_cast<bool>(vals)...))
{
static_assert(sizeof...(Ts) + 2 == size, "The constructor requires as many arguments as batch elements.");
@@ -954,19 +954,19 @@ namespace xsimd
*******************************/
template <class T, class A>
- inline void batch_bool<T, A>::store_aligned(bool* mem) const noexcept
+ XSIMD_INLINE void batch_bool<T, A>::store_aligned(bool* mem) const noexcept
{
kernel::store(*this, mem, A {});
}
template <class T, class A>
- inline void batch_bool<T, A>::store_unaligned(bool* mem) const noexcept
+ XSIMD_INLINE void batch_bool<T, A>::store_unaligned(bool* mem) const noexcept
{
store_aligned(mem);
}
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::load_aligned(bool const* mem) noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_aligned(bool const* mem) noexcept
{
batch_type ref(0);
alignas(A::alignment()) T buffer[size];
@@ -976,7 +976,7 @@ namespace xsimd
}
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::load_unaligned(bool const* mem) noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_unaligned(bool const* mem) noexcept
{
return load_aligned(mem);
}
@@ -987,7 +987,7 @@ namespace xsimd
* @return bit mask
*/
template <class T, class A>
- inline uint64_t batch_bool<T, A>::mask() const noexcept
+ XSIMD_INLINE uint64_t batch_bool<T, A>::mask() const noexcept
{
return kernel::mask(*this, A {});
}
@@ -998,13 +998,13 @@ namespace xsimd
* @return bit mask
*/
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::from_mask(uint64_t mask) noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::from_mask(uint64_t mask) noexcept
{
return kernel::from_mask(batch_bool<T, A>(), mask, A {});
}
template <class T, class A>
- inline bool batch_bool<T, A>::get(std::size_t i) const noexcept
+ XSIMD_INLINE bool batch_bool<T, A>::get(std::size_t i) const noexcept
{
return kernel::get(*this, i, A {});
}
@@ -1014,13 +1014,13 @@ namespace xsimd
***********************************/
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::operator==(batch_bool<T, A> const& other) const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::operator==(batch_bool<T, A> const& other) const noexcept
{
return kernel::eq<A>(*this, other, A {}).data;
}
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::operator!=(batch_bool<T, A> const& other) const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::operator!=(batch_bool<T, A> const& other) const noexcept
{
return kernel::neq<A>(*this, other, A {}).data;
}
@@ -1030,43 +1030,43 @@ namespace xsimd
********************************/
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::operator~() const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::operator~() const noexcept
{
return kernel::bitwise_not<A>(*this, A {}).data;
}
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::operator!() const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::operator!() const noexcept
{
return operator==(batch_bool(false));
}
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::operator&(batch_bool<T, A> const& other) const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::operator&(batch_bool<T, A> const& other) const noexcept
{
return kernel::bitwise_and<A>(*this, other, A {}).data;
}
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::operator|(batch_bool<T, A> const& other) const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::operator|(batch_bool<T, A> const& other) const noexcept
{
return kernel::bitwise_or<A>(*this, other, A {}).data;
}
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::operator^(batch_bool<T, A> const& other) const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::operator^(batch_bool<T, A> const& other) const noexcept
{
return kernel::bitwise_xor<A>(*this, other, A {}).data;
}
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::operator&&(batch_bool const& other) const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::operator&&(batch_bool const& other) const noexcept
{
return operator&(other);
}
template <class T, class A>
- inline batch_bool<T, A> batch_bool<T, A>::operator||(batch_bool const& other) const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::operator||(batch_bool const& other) const noexcept
{
return operator|(other);
}
@@ -1076,21 +1076,21 @@ namespace xsimd
******************************/
template <class T, class A>
- inline batch_bool<T, A>::batch_bool(bool val) noexcept
+ XSIMD_INLINE batch_bool<T, A>::batch_bool(bool val) noexcept
: base_type { make_register(detail::make_index_sequence<size - 1>(), val) }
{
}
template <class T, class A>
template <class U, class... V, size_t I, size_t... Is>
- inline auto batch_bool<T, A>::make_register(detail::index_sequence<I, Is...>, U u, V... v) noexcept -> register_type
+ XSIMD_INLINE auto batch_bool<T, A>::make_register(detail::index_sequence<I, Is...>, U u, V... v) noexcept -> register_type
{
return make_register(detail::index_sequence<Is...>(), u, u, v...);
}
template <class T, class A>
template <class... V>
- inline auto batch_bool<T, A>::make_register(detail::index_sequence<>, V... v) noexcept -> register_type
+ XSIMD_INLINE auto batch_bool<T, A>::make_register(detail::index_sequence<>, V... v) noexcept -> register_type
{
return kernel::set<A>(batch_bool<T, A>(), A {}, v...).data;
}
@@ -1100,28 +1100,28 @@ namespace xsimd
*******************************/
template <class T, class A>
- inline batch<std::complex<T>, A>::batch(value_type const& val) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>::batch(value_type const& val) noexcept
: m_real(val.real())
, m_imag(val.imag())
{
}
template <class T, class A>
- inline batch<std::complex<T>, A>::batch(real_batch const& real, real_batch const& imag) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>::batch(real_batch const& real, real_batch const& imag) noexcept
: m_real(real)
, m_imag(imag)
{
}
template <class T, class A>
- inline batch<std::complex<T>, A>::batch(real_batch const& real) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>::batch(real_batch const& real) noexcept
: m_real(real)
, m_imag(0)
{
}
template <class T, class A>
- inline batch<std::complex<T>, A>::batch(T val) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>::batch(T val) noexcept
: m_real(val)
, m_imag(0)
{
@@ -1129,14 +1129,14 @@ namespace xsimd
template <class T, class A>
template <class... Ts>
- inline batch<std::complex<T>, A>::batch(value_type val0, value_type val1, Ts... vals) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>::batch(value_type val0, value_type val1, Ts... vals) noexcept
: batch(kernel::set<A>(batch {}, A {}, val0, val1, static_cast<value_type>(vals)...))
{
static_assert(sizeof...(Ts) + 2 == size, "as many arguments as batch elements");
}
template <class T, class A>
- inline batch<std::complex<T>, A>::batch(batch_bool_type const& b) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>::batch(batch_bool_type const& b) noexcept
: m_real(b)
, m_imag(0)
{
@@ -1144,7 +1144,7 @@ namespace xsimd
template <class T, class A>
template <class U>
- XSIMD_NO_DISCARD inline batch<std::complex<T>, A> batch<std::complex<T>, A>::broadcast(U val) noexcept
+ XSIMD_NO_DISCARD XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::broadcast(U val) noexcept
{
return batch(static_cast<std::complex<T>>(val));
}
@@ -1154,18 +1154,18 @@ namespace xsimd
***********************************/
template <class T, class A>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const T* real_src, const T* imag_src) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const T* real_src, const T* imag_src) noexcept
{
return { batch<T, A>::load_aligned(real_src), imag_src ? batch<T, A>::load_aligned(imag_src) : batch<T, A>(0) };
}
template <class T, class A>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const T* real_src, const T* imag_src) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const T* real_src, const T* imag_src) noexcept
{
return { batch<T, A>::load_unaligned(real_src), imag_src ? batch<T, A>::load_unaligned(imag_src) : batch<T, A>(0) };
}
template <class T, class A>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const value_type* src) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const value_type* src) noexcept
{
assert(((reinterpret_cast<uintptr_t>(src) % A::alignment()) == 0)
&& "loaded pointer is not properly aligned");
@@ -1173,13 +1173,13 @@ namespace xsimd
}
template <class T, class A>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const value_type* src) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const value_type* src) noexcept
{
return kernel::load_complex_unaligned<A>(src, kernel::convert<value_type> {}, A {});
}
template <class T, class A>
- inline void batch<std::complex<T>, A>::store_aligned(value_type* dst) const noexcept
+ XSIMD_INLINE void batch<std::complex<T>, A>::store_aligned(value_type* dst) const noexcept
{
assert(((reinterpret_cast<uintptr_t>(dst) % A::alignment()) == 0)
&& "store location is not properly aligned");
@@ -1187,20 +1187,20 @@ namespace xsimd
}
template <class T, class A>
- inline void batch<std::complex<T>, A>::store_unaligned(value_type* dst) const noexcept
+ XSIMD_INLINE void batch<std::complex<T>, A>::store_unaligned(value_type* dst) const noexcept
{
return kernel::store_complex_unaligned(dst, *this, A {});
}
template <class T, class A>
- inline void batch<std::complex<T>, A>::store_aligned(T* real_dst, T* imag_dst) const noexcept
+ XSIMD_INLINE void batch<std::complex<T>, A>::store_aligned(T* real_dst, T* imag_dst) const noexcept
{
m_real.store_aligned(real_dst);
m_imag.store_aligned(imag_dst);
}
template <class T, class A>
- inline void batch<std::complex<T>, A>::store_unaligned(T* real_dst, T* imag_dst) const noexcept
+ XSIMD_INLINE void batch<std::complex<T>, A>::store_unaligned(T* real_dst, T* imag_dst) const noexcept
{
m_real.store_unaligned(real_dst);
m_imag.store_unaligned(imag_dst);
@@ -1208,46 +1208,46 @@ namespace xsimd
template <class T, class A>
template <class U>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, aligned_mode) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, aligned_mode) noexcept
{
return load_aligned(mem);
}
template <class T, class A>
template <class U>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, unaligned_mode) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, unaligned_mode) noexcept
{
return load_unaligned(mem);
}
template <class T, class A>
template <class U>
- inline void batch<std::complex<T>, A>::store(U* mem, aligned_mode) const noexcept
+ XSIMD_INLINE void batch<std::complex<T>, A>::store(U* mem, aligned_mode) const noexcept
{
return store_aligned(mem);
}
template <class T, class A>
template <class U>
- inline void batch<std::complex<T>, A>::store(U* mem, unaligned_mode) const noexcept
+ XSIMD_INLINE void batch<std::complex<T>, A>::store(U* mem, unaligned_mode) const noexcept
{
return store_unaligned(mem);
}
template <class T, class A>
- inline auto batch<std::complex<T>, A>::real() const noexcept -> real_batch
+ XSIMD_INLINE auto batch<std::complex<T>, A>::real() const noexcept -> real_batch
{
return m_real;
}
template <class T, class A>
- inline auto batch<std::complex<T>, A>::imag() const noexcept -> real_batch
+ XSIMD_INLINE auto batch<std::complex<T>, A>::imag() const noexcept -> real_batch
{
return m_imag;
}
template <class T, class A>
- inline auto batch<std::complex<T>, A>::get(std::size_t i) const noexcept -> value_type
+ XSIMD_INLINE auto batch<std::complex<T>, A>::get(std::size_t i) const noexcept -> value_type
{
return kernel::get(*this, i, A {});
}
@@ -1260,7 +1260,7 @@ namespace xsimd
template <class T, class A>
template <bool i3ec>
- inline batch<std::complex<T>, A>::batch(xtl::xcomplex<T, T, i3ec> const& val) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>::batch(xtl::xcomplex<T, T, i3ec> const& val) noexcept
: m_real(val.real())
, m_imag(val.imag())
{
@@ -1268,7 +1268,7 @@ namespace xsimd
template <class T, class A>
template <bool i3ec, class... Ts>
- inline batch<std::complex<T>, A>::batch(xtl::xcomplex<T, T, i3ec> val0, xtl::xcomplex<T, T, i3ec> val1, Ts... vals) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>::batch(xtl::xcomplex<T, T, i3ec> val0, xtl::xcomplex<T, T, i3ec> val1, Ts... vals) noexcept
: batch(kernel::set<A>(batch {}, A {}, val0, val1, static_cast<xtl::xcomplex<T, T, i3ec>>(vals)...))
{
static_assert(sizeof...(Ts) + 2 == size, "as many arguments as batch elements");
@@ -1280,28 +1280,28 @@ namespace xsimd
template <class T, class A>
template <bool i3ec>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept
{
return load_aligned(reinterpret_cast<std::complex<T> const*>(src));
}
template <class T, class A>
template <bool i3ec>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept
{
return load_unaligned(reinterpret_cast<std::complex<T> const*>(src));
}
template <class T, class A>
template <bool i3ec>
- inline void batch<std::complex<T>, A>::store_aligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept
+ XSIMD_INLINE void batch<std::complex<T>, A>::store_aligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept
{
store_aligned(reinterpret_cast<std::complex<T>*>(dst));
}
template <class T, class A>
template <bool i3ec>
- inline void batch<std::complex<T>, A>::store_unaligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept
+ XSIMD_INLINE void batch<std::complex<T>, A>::store_unaligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept
{
store_unaligned(reinterpret_cast<std::complex<T>*>(dst));
}
@@ -1313,13 +1313,13 @@ namespace xsimd
***************************************/
template <class T, class A>
- inline batch_bool<T, A> batch<std::complex<T>, A>::operator==(batch const& other) const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch<std::complex<T>, A>::operator==(batch const& other) const noexcept
{
return m_real == other.m_real && m_imag == other.m_imag;
}
template <class T, class A>
- inline batch_bool<T, A> batch<std::complex<T>, A>::operator!=(batch const& other) const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch<std::complex<T>, A>::operator!=(batch const& other) const noexcept
{
return m_real != other.m_real || m_imag != other.m_imag;
}
@@ -1329,7 +1329,7 @@ namespace xsimd
***********************************/
template <class T, class A>
- inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator+=(batch const& other) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator+=(batch const& other) noexcept
{
m_real += other.m_real;
m_imag += other.m_imag;
@@ -1337,7 +1337,7 @@ namespace xsimd
}
template <class T, class A>
- inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator-=(batch const& other) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator-=(batch const& other) noexcept
{
m_real -= other.m_real;
m_imag -= other.m_imag;
@@ -1345,17 +1345,17 @@ namespace xsimd
}
template <class T, class A>
- inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator*=(batch const& other) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator*=(batch const& other) noexcept
{
- real_batch new_real = real() * other.real() - imag() * other.imag();
- real_batch new_imag = real() * other.imag() + imag() * other.real();
+ real_batch new_real = fms(real(), other.real(), imag() * other.imag());
+ real_batch new_imag = fma(real(), other.imag(), imag() * other.real());
m_real = new_real;
m_imag = new_imag;
return *this;
}
template <class T, class A>
- inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator/=(batch const& other) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator/=(batch const& other) noexcept
{
real_batch a = real();
real_batch b = imag();
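
The operator*= body above keeps the textbook complex product but routes each component through a fused kernel: for (a + bi) * (c + di), the real part a*c - b*d becomes fms(a, c, b*d) and the imaginary part a*d + b*c becomes fma(a, d, b*c), where fms(x, y, z) = x*y - z and fma(x, y, z) = x*y + z. The algebra is unchanged; on FMA-capable targets each component saves one instruction and drops one intermediate rounding. A scalar stand-in for the real-part identity:

    #include <cmath>

    // plays the role of fms(a, c, b*d); may differ from (a*c - b*d) in the last bit,
    // precisely because the a*c product is not rounded before the subtraction
    inline double mul_real_fused_sketch(double a, double b, double c, double d)
    {
        return std::fma(a, c, -(b * d));
    }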
@@ -1372,19 +1372,19 @@ namespace xsimd
**************************************/
template <class T, class A>
- inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator++() noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator++() noexcept
{
return operator+=(1);
}
template <class T, class A>
- inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator--() noexcept
+ XSIMD_INLINE batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator--() noexcept
{
return operator-=(1);
}
template <class T, class A>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator++(int) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::operator++(int) noexcept
{
batch copy(*this);
operator+=(1);
@@ -1392,7 +1392,7 @@ namespace xsimd
}
template <class T, class A>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator--(int) noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::operator--(int) noexcept
{
batch copy(*this);
operator-=(1);
@@ -1404,25 +1404,25 @@ namespace xsimd
**********************************/
template <class T, class A>
- inline batch_bool<T, A> batch<std::complex<T>, A>::operator!() const noexcept
+ XSIMD_INLINE batch_bool<T, A> batch<std::complex<T>, A>::operator!() const noexcept
{
return operator==(batch(0));
}
template <class T, class A>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator~() const noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::operator~() const noexcept
{
return { ~m_real, ~m_imag };
}
template <class T, class A>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator-() const noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::operator-() const noexcept
{
return { -m_real, -m_imag };
}
template <class T, class A>
- inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator+() const noexcept
+ XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::operator+() const noexcept
{
return { +m_real, +m_imag };
}
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_batch_constant.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_batch_constant.hpp
index 0de9c8ad42c..3d960327747 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_batch_constant.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_batch_constant.hpp
@@ -25,17 +25,24 @@ namespace xsimd
* @tparam batch_type the type of the associated batch values.
* @tparam Values boolean constant represented by this batch
**/
- template <class batch_type, bool... Values>
+ template <typename T, class A, bool... Values>
struct batch_bool_constant
{
-
- public:
+ using batch_type = batch_bool<T, A>;
static constexpr std::size_t size = sizeof...(Values);
- using arch_type = typename batch_type::arch_type;
using value_type = bool;
static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");
- constexpr operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; }
+ public:
+ /**
+ * @brief Generate a batch of @p batch_type from this @p batch_bool_constant
+ */
+ constexpr batch_type as_batch_bool() const noexcept { return { Values... }; }
+
+ /**
+ * @brief Generate a batch of @p batch_type from this @p batch_bool_constant
+ */
+ constexpr operator batch_type() const noexcept { return as_batch_bool(); }
constexpr bool get(size_t i) const noexcept
{
@@ -70,14 +77,14 @@ namespace xsimd
};
template <class F, class SelfPack, class OtherPack, size_t... Indices>
- static constexpr batch_bool_constant<batch_type, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
+ static constexpr batch_bool_constant<T, A, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
apply(detail::index_sequence<Indices...>)
{
return {};
}
template <class F, bool... OtherValues>
- static constexpr auto apply(batch_bool_constant<batch_type, Values...>, batch_bool_constant<batch_type, OtherValues...>)
+ static constexpr auto apply(batch_bool_constant<T, A, Values...>, batch_bool_constant<T, A, OtherValues...>)
-> decltype(apply<F, std::tuple<std::integral_constant<bool, Values>...>, std::tuple<std::integral_constant<bool, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()))
{
static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches");
@@ -85,12 +92,12 @@ namespace xsimd
}
public:
-#define MAKE_BINARY_OP(OP, NAME) \
- template <bool... OtherValues> \
- constexpr auto operator OP(batch_bool_constant<batch_type, OtherValues...> other) const \
- -> decltype(apply<NAME>(*this, other)) \
- { \
- return apply<NAME>(*this, other); \
+#define MAKE_BINARY_OP(OP, NAME) \
+ template <bool... OtherValues> \
+ constexpr auto operator OP(batch_bool_constant<T, A, OtherValues...> other) const \
+ -> decltype(apply<NAME>(*this, other)) \
+ { \
+ return apply<NAME>(*this, other); \
}
MAKE_BINARY_OP(|, logical_or)
@@ -101,12 +108,12 @@ namespace xsimd
#undef MAKE_BINARY_OP
- constexpr batch_bool_constant<batch_type, !Values...> operator!() const
+ constexpr batch_bool_constant<T, A, !Values...> operator!() const
{
return {};
}
- constexpr batch_bool_constant<batch_type, !Values...> operator~() const
+ constexpr batch_bool_constant<T, A, !Values...> operator~() const
{
return {};
}
@@ -120,88 +127,93 @@ namespace xsimd
* @tparam batch_type the type of the associated batch values.
* @tparam Values constants represented by this batch
**/
- template <class batch_type, typename batch_type::value_type... Values>
+ template <typename T, class A, T... Values>
struct batch_constant
{
static constexpr std::size_t size = sizeof...(Values);
- using arch_type = typename batch_type::arch_type;
+ using batch_type = batch<T, A>;
using value_type = typename batch_type::value_type;
static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");
/**
* @brief Generate a batch of @p batch_type from this @p batch_constant
*/
- inline operator batch_type() const noexcept { return { Values... }; }
+ XSIMD_INLINE batch_type as_batch() const noexcept { return { Values... }; }
+
+ /**
+ * @brief Generate a batch of @p batch_type from this @p batch_constant
+ */
+ XSIMD_INLINE operator batch_type() const noexcept { return as_batch(); }
/**
* @brief Get the @p i th element of this @p batch_constant
*/
- constexpr value_type get(size_t i) const noexcept
+ constexpr T get(size_t i) const noexcept
{
- return get(i, std::array<value_type, size> { Values... });
+ return get(i, std::array<T, size> { Values... });
}
private:
- constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept
+ constexpr T get(size_t i, std::array<T, size> const& values) const noexcept
{
return values[i];
}
struct arithmetic_add
{
- constexpr value_type operator()(value_type x, value_type y) const { return x + y; }
+ constexpr T operator()(T x, T y) const { return x + y; }
};
struct arithmetic_sub
{
- constexpr value_type operator()(value_type x, value_type y) const { return x - y; }
+ constexpr T operator()(T x, T y) const { return x - y; }
};
struct arithmetic_mul
{
- constexpr value_type operator()(value_type x, value_type y) const { return x * y; }
+ constexpr T operator()(T x, T y) const { return x * y; }
};
struct arithmetic_div
{
- constexpr value_type operator()(value_type x, value_type y) const { return x / y; }
+ constexpr T operator()(T x, T y) const { return x / y; }
};
struct arithmetic_mod
{
- constexpr value_type operator()(value_type x, value_type y) const { return x % y; }
+ constexpr T operator()(T x, T y) const { return x % y; }
};
struct binary_and
{
- constexpr value_type operator()(value_type x, value_type y) const { return x & y; }
+ constexpr T operator()(T x, T y) const { return x & y; }
};
struct binary_or
{
- constexpr value_type operator()(value_type x, value_type y) const { return x | y; }
+ constexpr T operator()(T x, T y) const { return x | y; }
};
struct binary_xor
{
- constexpr value_type operator()(value_type x, value_type y) const { return x ^ y; }
+ constexpr T operator()(T x, T y) const { return x ^ y; }
};
template <class F, class SelfPack, class OtherPack, size_t... Indices>
- static constexpr batch_constant<batch_type, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
+ static constexpr batch_constant<T, A, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
apply(detail::index_sequence<Indices...>)
{
return {};
}
- template <class F, value_type... OtherValues>
- static constexpr auto apply(batch_constant<batch_type, Values...>, batch_constant<batch_type, OtherValues...>)
- -> decltype(apply<F, std::tuple<std::integral_constant<value_type, Values>...>, std::tuple<std::integral_constant<value_type, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()))
+ template <class F, T... OtherValues>
+ static constexpr auto apply(batch_constant<T, A, Values...>, batch_constant<T, A, OtherValues...>)
+ -> decltype(apply<F, std::tuple<std::integral_constant<T, Values>...>, std::tuple<std::integral_constant<T, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()))
{
static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches");
- return apply<F, std::tuple<std::integral_constant<value_type, Values>...>, std::tuple<std::integral_constant<value_type, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>());
+ return apply<F, std::tuple<std::integral_constant<T, Values>...>, std::tuple<std::integral_constant<T, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>());
}
public:
-#define MAKE_BINARY_OP(OP, NAME) \
- template <value_type... OtherValues> \
- constexpr auto operator OP(batch_constant<batch_type, OtherValues...> other) const \
- -> decltype(apply<NAME>(*this, other)) \
- { \
- return apply<NAME>(*this, other); \
+#define MAKE_BINARY_OP(OP, NAME) \
+ template <T... OtherValues> \
+ constexpr auto operator OP(batch_constant<T, A, OtherValues...> other) const \
+ -> decltype(apply<NAME>(*this, other)) \
+ { \
+ return apply<NAME>(*this, other); \
}
MAKE_BINARY_OP(+, arithmetic_add)
@@ -215,17 +227,17 @@ namespace xsimd
#undef MAKE_BINARY_OP
- constexpr batch_constant<batch_type, (value_type)-Values...> operator-() const
+ constexpr batch_constant<T, A, (T)-Values...> operator-() const
{
return {};
}
- constexpr batch_constant<batch_type, (value_type) + Values...> operator+() const
+ constexpr batch_constant<T, A, (T) + Values...> operator+() const
{
return {};
}
- constexpr batch_constant<batch_type, (value_type)~Values...> operator~() const
+ constexpr batch_constant<T, A, (T)~Values...> operator~() const
{
return {};
}
@@ -233,15 +245,15 @@ namespace xsimd
namespace detail
{
- template <class batch_type, class G, std::size_t... Is>
- inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept
- -> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...>
+ template <typename T, class A, class G, std::size_t... Is>
+ XSIMD_INLINE constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept
+ -> batch_constant<T, A, (T)G::get(Is, sizeof...(Is))...>
{
return {};
}
- template <class batch_type, class G, std::size_t... Is>
- inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept
- -> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...>
+ template <typename T, class A, class G, std::size_t... Is>
+ XSIMD_INLINE constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept
+ -> batch_bool_constant<T, A, G::get(Is, sizeof...(Is))...>
{
return {};
}
@@ -268,19 +280,19 @@ namespace xsimd
* };
* @endcode
*/
- template <class batch_type, class G>
- inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()))
+ template <typename T, class A, class G>
+ XSIMD_INLINE constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<T, A, G>(detail::make_index_sequence<batch<T, A>::size>()))
{
- return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>());
+ return detail::make_batch_constant<T, A, G>(detail::make_index_sequence<batch<T, A>::size>());
}
- template <class batch_type, class G>
- inline constexpr auto make_batch_bool_constant() noexcept
- -> decltype(detail::make_batch_bool_constant<batch_type, G>(
- detail::make_index_sequence<batch_type::size>()))
+ template <typename T, class A, class G>
+ XSIMD_INLINE constexpr auto make_batch_bool_constant() noexcept
+ -> decltype(detail::make_batch_bool_constant<T, A, G>(
+ detail::make_index_sequence<batch<T, A>::size>()))
{
- return detail::make_batch_bool_constant<batch_type, G>(
- detail::make_index_sequence<batch_type::size>());
+ return detail::make_batch_bool_constant<T, A, G>(
+ detail::make_index_sequence<batch<T, A>::size>());
}
} // namespace xsimd
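
Note on the batch_constant hunks above: both batch_bool_constant and batch_constant are now parametrized by the scalar type and the architecture (<T, A, Values...>) instead of a batch_type, they gain explicit as_batch_bool() / as_batch() accessors, and make_batch_constant / make_batch_bool_constant take <T, A, G>. A small usage sketch against the patched header; the generator signature get(index, size) mirrors the G::get(Is, sizeof...(Is)) call above, and xsimd::default_arch is assumed here as a convenient architecture argument.

// Sketch of the new batch_constant spelling; assumes the patched xsimd headers.
#include <cstdint>
#include "xsimd/xsimd.hpp"

struct iota
{
    // Generator contract used by make_batch_constant: value for lane `index` out of `size`.
    static constexpr uint32_t get(uint32_t index, uint32_t /*size*/) { return index; }
};

struct even_lanes
{
    static constexpr bool get(uint32_t index, uint32_t /*size*/) { return index % 2 == 0; }
};

int main()
{
    using arch = xsimd::default_arch;

    // Element type and architecture are now passed explicitly.
    constexpr auto idx  = xsimd::make_batch_constant<uint32_t, arch, iota>();
    constexpr auto mask = xsimd::make_batch_bool_constant<uint32_t, arch, even_lanes>();

    auto runtime_idx  = idx.as_batch();        // xsimd::batch<uint32_t, arch>
    auto runtime_mask = mask.as_batch_bool();  // xsimd::batch_bool<uint32_t, arch>
    (void)runtime_idx;
    (void)runtime_mask;
    return 0;
}
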
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_emulated_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_emulated_register.hpp
new file mode 100644
index 00000000000..6e0d659bd9d
--- /dev/null
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_emulated_register.hpp
@@ -0,0 +1,80 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_EMULATED_REGISTER_HPP
+#define XSIMD_EMULATED_REGISTER_HPP
+
+#include "./xsimd_generic_arch.hpp"
+#include "./xsimd_register.hpp"
+
+namespace xsimd
+{
+ /**
+ * @ingroup architectures
+ *
+ * emulated instructions
+ */
+ template <size_t N>
+ struct emulated : generic
+ {
+ static constexpr bool supported() noexcept { return true; }
+ static constexpr bool available() noexcept { return true; }
+ static constexpr bool requires_alignment() noexcept { return false; }
+ static constexpr std::size_t alignment() noexcept { return 8; }
+ static constexpr char const* name() noexcept { return "emulated"; }
+ };
+
+ namespace types
+ {
+ template <size_t N>
+ struct simd_emulated_bool_register
+ {
+ using register_type = std::array<bool, N>;
+ register_type data;
+ simd_emulated_bool_register() = default;
+ simd_emulated_bool_register(register_type r) { data = r; }
+ operator register_type() const noexcept { return data; }
+ };
+ template <typename T, size_t N>
+ struct get_bool_simd_register<T, emulated<N>>
+ {
+ using type = simd_emulated_bool_register<N / (8 * sizeof(T))>;
+ };
+
+ template <typename T, size_t N>
+ struct simd_register<T, emulated<N>>
+ {
+ static_assert(N % (8 * sizeof(T)) == 0, "bit width must be a multiple of scalar width");
+ using register_type = std::array<T, N / (8 * sizeof(T))>;
+ register_type data;
+ XSIMD_INLINE operator register_type() const noexcept
+ {
+ return data;
+ }
+ };
+ template <typename T, size_t N>
+ struct has_simd_register<T, emulated<N>> : std::is_scalar<T>
+ {
+ };
+ template <typename T, size_t N>
+ struct has_simd_register<std::complex<T>, emulated<N>> : std::true_type
+ {
+ };
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ template <typename T, bool i3ec, size_t N>
+ struct has_simd_register<xtl::complex<T, T, i3ec>, emulated<N>> : std::true_type
+ {
+ };
+#endif
+ }
+}
+
+#endif
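
Note on the new emulated register header above: emulated<N> describes an N-bit register backed by a plain std::array rather than a hardware vector type, giving N / (8 * sizeof(T)) lanes per scalar type. The sketch below only touches the metadata defined in that header; it assumes the umbrella xsimd.hpp makes xsimd::emulated visible in this revision, and actually instantiating a batch on this architecture additionally requires the emulated kernels, which live outside this hunk.

// Sketch: querying the metadata of the new emulated architecture.
#include <iostream>
#include "xsimd/xsimd.hpp"

int main()
{
    using arch = xsimd::emulated<128>; // 128-bit emulated register: 4 float lanes, 2 double lanes
    std::cout << arch::name()
              << " alignment=" << arch::alignment()
              << " requires_alignment=" << std::boolalpha << arch::requires_alignment()
              << '\n';
    // With the matching emulated kernels compiled in, one would instantiate e.g.
    // xsimd::batch<float, arch>, which falls back to per-lane scalar loops.
    return 0;
}
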
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_avx2_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_avx2_register.hpp
index b9a59954142..cf3e26d08da 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_avx2_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_avx2_register.hpp
@@ -29,7 +29,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX2; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(2, 2, 1); }
static constexpr char const* name() noexcept { return "fma3+avx2"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_avx_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_avx_register.hpp
index ae10598f2c7..5012d25a067 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_avx_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_avx_register.hpp
@@ -29,7 +29,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(2, 1, 1); }
static constexpr char const* name() noexcept { return "fma3+avx"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_sse_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_sse_register.hpp
index a267490d66d..87ebc27b554 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_sse_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_fma3_sse_register.hpp
@@ -29,7 +29,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_SSE; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(1, 4, 3); }
static constexpr char const* name() noexcept { return "fma3+sse4.2"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_fma4_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_fma4_register.hpp
index 3684bbb4019..1a066cd206d 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_fma4_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_fma4_register.hpp
@@ -25,7 +25,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_FMA4; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(1, 4, 4); }
static constexpr char const* name() noexcept { return "fma4"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_generic_arch.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_generic_arch.hpp
index 2aa25419c6f..d16a37fea7e 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_generic_arch.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_generic_arch.hpp
@@ -35,13 +35,12 @@ namespace xsimd
static constexpr std::size_t alignment() noexcept { return 0; }
/// Whether this architecture requires aligned memory access.
static constexpr bool requires_alignment() noexcept { return false; }
- /// Unique identifier for this architecture.
- static constexpr unsigned version() noexcept { return generic::version(0, 0, 0); }
/// Name of the architecture.
static constexpr char const* name() noexcept { return "generic"; }
+ };
- protected:
- static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch, unsigned multiplier = 100u) noexcept { return major * multiplier * multiplier + minor * multiplier + patch; }
+ struct unsupported
+ {
};
}
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_i8mm_neon64_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_i8mm_neon64_register.hpp
new file mode 100644
index 00000000000..0e2b42d8eac
--- /dev/null
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_i8mm_neon64_register.hpp
@@ -0,0 +1,50 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_I8MM_NEON64_REGISTER_HPP
+#define XSIMD_I8MM_NEON64_REGISTER_HPP
+
+#include "./xsimd_neon64_register.hpp"
+
+namespace xsimd
+{
+ template <typename arch>
+ struct i8mm;
+
+ /**
+ * @ingroup architectures
+ *
+ * Neon64 + i8mm instructions
+ */
+ template <>
+ struct i8mm<neon64> : neon64
+ {
+ static constexpr bool supported() noexcept { return XSIMD_WITH_I8MM_NEON64; }
+ static constexpr bool available() noexcept { return true; }
+ static constexpr char const* name() noexcept { return "i8mm+neon64"; }
+ };
+
+#if XSIMD_WITH_I8MM_NEON64
+ namespace types
+ {
+
+ XSIMD_DECLARE_SIMD_REGISTER_ALIAS(i8mm<neon64>, neon64);
+
+ template <class T>
+ struct get_bool_simd_register<T, i8mm<neon64>>
+ : detail::neon_bool_simd_register<T, i8mm<neon64>>
+ {
+ };
+ }
+#endif
+
+}
+#endif
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_neon64_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_neon64_register.hpp
index 3aa8973b636..709f601a3f0 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_neon64_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_neon64_register.hpp
@@ -27,7 +27,6 @@ namespace xsimd
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr std::size_t alignment() noexcept { return 16; }
- static constexpr unsigned version() noexcept { return generic::version(8, 1, 0); }
static constexpr char const* name() noexcept { return "arm64+neon"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_neon_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_neon_register.hpp
index 0ef4b381d36..a9f4a46c8bf 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_neon_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_neon_register.hpp
@@ -32,7 +32,6 @@ namespace xsimd
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr std::size_t alignment() noexcept { return 16; }
- static constexpr unsigned version() noexcept { return generic::version(7, 0, 0); }
static constexpr char const* name() noexcept { return "arm32+neon"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_register.hpp
index 4fe4f3f13fd..a838f8786de 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_register.hpp
@@ -37,7 +37,7 @@ namespace xsimd
{ \
using register_type = VECTOR_TYPE; \
register_type data; \
- inline operator register_type() const noexcept \
+ XSIMD_INLINE operator register_type() const noexcept \
{ \
return data; \
} \
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_rvv_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_rvv_register.hpp
index 7bf5230bed8..cb5626d10aa 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_rvv_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_rvv_register.hpp
@@ -37,7 +37,6 @@ namespace xsimd
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr std::size_t alignment() noexcept { return 16; }
- static constexpr unsigned version() noexcept { return generic::version(1, 0, 0, /*multiplier=*/1000); }
static constexpr char const* name() noexcept { return "riscv+rvv"; }
};
}
@@ -89,14 +88,14 @@ namespace xsimd
using byte_type = XSIMD_RVV_TYPE(u, 8, vmul); \
using fixed_type = type __attribute__((riscv_rvv_vector_bits(width))); \
template <class U> \
- static inline type bitcast(U x) noexcept \
+ static XSIMD_INLINE type bitcast(U x) noexcept \
{ \
const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \
return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, t, s, m, vmul)(words); \
} \
template <> \
- inline type bitcast<type>(type x) noexcept { return x; } \
- static inline byte_type as_bytes(type x) noexcept \
+ XSIMD_INLINE type bitcast<type>(type x) noexcept { return x; } \
+ static XSIMD_INLINE byte_type as_bytes(type x) noexcept \
{ \
const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \
return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, 8, m, vmul)(words); \
@@ -268,17 +267,17 @@ namespace xsimd
//
template <size_t>
struct rvv_bool_info;
-#define XSIMD_RVV_MAKE_BOOL_TYPE(i) \
- template <> \
- struct rvv_bool_info<i> \
- { \
- using type = XSIMD_RVV_JOINT(vbool, i, _t); \
- template <class T> \
- static inline type bitcast(T value) noexcept \
- { \
- return XSIMD_RVV_JOINT(__riscv_vreinterpret_b, i, )(value); \
- } \
- /*template <> static inline type bitcast(type value) noexcept { return value; }*/ \
+#define XSIMD_RVV_MAKE_BOOL_TYPE(i) \
+ template <> \
+ struct rvv_bool_info<i> \
+ { \
+ using type = XSIMD_RVV_JOINT(vbool, i, _t); \
+ template <class T> \
+ static XSIMD_INLINE type bitcast(T value) noexcept \
+ { \
+ return XSIMD_RVV_JOINT(__riscv_vreinterpret_b, i, )(value); \
+ } \
+ /*template <> static XSIMD_INLINE type bitcast(type value) noexcept { return value; }*/ \
};
XSIMD_RVV_MAKE_BOOL_TYPE(1);
XSIMD_RVV_MAKE_BOOL_TYPE(2);
@@ -411,6 +410,8 @@ namespace xsimd
using type = detail::rvv_bool_simd_register<T>;
};
} // namespace types
+#else
+ using rvv = detail::rvv<0xFFFFFFFF>;
#endif
} // namespace xsimd
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_sse2_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_sse2_register.hpp
index a9dc8960b66..e6eabec7ade 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_sse2_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_sse2_register.hpp
@@ -32,7 +32,6 @@ namespace xsimd
static constexpr bool supported() noexcept { return XSIMD_WITH_SSE2; }
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(1, 2, 0); }
static constexpr std::size_t alignment() noexcept { return 16; }
static constexpr char const* name() noexcept { return "sse2"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_sse3_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_sse3_register.hpp
index 1a7708a896b..6f216bb8129 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_sse3_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_sse3_register.hpp
@@ -29,7 +29,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_SSE3; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(1, 3, 0); }
static constexpr char const* name() noexcept { return "sse3"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_sse4_1_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_sse4_1_register.hpp
index d906712d566..f7f6c06575b 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_sse4_1_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_sse4_1_register.hpp
@@ -29,7 +29,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_1; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(1, 4, 1); }
static constexpr char const* name() noexcept { return "sse4.1"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_sse4_2_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_sse4_2_register.hpp
index b3446c90913..e92e4987243 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_sse4_2_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_sse4_2_register.hpp
@@ -29,7 +29,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_2; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(1, 4, 2); }
static constexpr char const* name() noexcept { return "sse4.2"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_ssse3_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_ssse3_register.hpp
index 50ffac1e06f..fc1c0f82dec 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_ssse3_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_ssse3_register.hpp
@@ -29,7 +29,6 @@ namespace xsimd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_SSSE3; }
static constexpr bool available() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(1, 3, 1); }
static constexpr char const* name() noexcept { return "ssse3"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_sve_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_sve_register.hpp
index 561e0d4a664..3342f2fa0b2 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_sve_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_sve_register.hpp
@@ -36,7 +36,6 @@ namespace xsimd
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr std::size_t alignment() noexcept { return 16; }
- static constexpr unsigned version() noexcept { return generic::version(9, Width / 32, 0); }
static constexpr char const* name() noexcept { return "arm64+sve"; }
};
}
@@ -149,6 +148,8 @@ namespace xsimd
using type = detail::sve_bool_simd_register;
};
} // namespace types
+#else
+ using sve = detail::sve<0xFFFFFFFF>;
#endif
} // namespace xsimd
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_traits.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_traits.hpp
index f848aab1f7c..471e979a49d 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_traits.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_traits.hpp
@@ -86,7 +86,7 @@ namespace xsimd
// consistency checker
template <class T, class A>
- inline void static_check_supported_config()
+ XSIMD_INLINE void static_check_supported_config()
{
(void)static_check_supported_config_emitter<T, A>();
}
diff --git a/contrib/python/pythran/pythran/xsimd/types/xsimd_wasm_register.hpp b/contrib/python/pythran/pythran/xsimd/types/xsimd_wasm_register.hpp
index aff05135db5..a9bd9e8531d 100644
--- a/contrib/python/pythran/pythran/xsimd/types/xsimd_wasm_register.hpp
+++ b/contrib/python/pythran/pythran/xsimd/types/xsimd_wasm_register.hpp
@@ -32,7 +32,6 @@ namespace xsimd
static constexpr bool supported() noexcept { return XSIMD_WITH_WASM; }
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
- static constexpr unsigned version() noexcept { return generic::version(10, 0, 0); }
static constexpr std::size_t alignment() noexcept { return 16; }
static constexpr char const* name() noexcept { return "wasm"; }
};
diff --git a/contrib/python/pythran/pythran/xsimd/xsimd.hpp b/contrib/python/pythran/pythran/xsimd/xsimd.hpp
index 8d76a5f91dd..b5548e7ac9c 100644
--- a/contrib/python/pythran/pythran/xsimd/xsimd.hpp
+++ b/contrib/python/pythran/pythran/xsimd/xsimd.hpp
@@ -51,6 +51,7 @@
#endif
#include "config/xsimd_config.hpp"
+#include "config/xsimd_inline.hpp"
#include "arch/xsimd_scalar.hpp"
#include "memory/xsimd_aligned_allocator.hpp"
diff --git a/contrib/python/pythran/ya.make b/contrib/python/pythran/ya.make
index d8781636425..59e472063c8 100644
--- a/contrib/python/pythran/ya.make
+++ b/contrib/python/pythran/ya.make
@@ -2,7 +2,7 @@
PY3_LIBRARY()
-VERSION(0.15.0)
+VERSION(0.16.1)
LICENSE(BSD-3-Clause)