path: root/contrib/libs/isa-l/erasure_code/aarch64
author     Maxim Yurchuk <maxim-yurchuk@ydb.tech>  2024-10-18 20:31:38 +0300
committer  GitHub <noreply@github.com>             2024-10-18 20:31:38 +0300
commit     2a74bac2d2d3bccb4e10120f1ead805640ec9dd0 (patch)
tree       047e4818ced5aaf73f58517629e5260b5291f9f0 /contrib/libs/isa-l/erasure_code/aarch64
parent     2d9656823e9521d8c29ea4c9a1d0eab78391abfc (diff)
parent     3d834a1923bbf9403cd4a448e7f32b670aa4124f (diff)
download   ydb-2a74bac2d2d3bccb4e10120f1ead805640ec9dd0.tar.gz
Merge pull request #10502 from ydb-platform/mergelibs-241016-1210
Library import 241016-1210
Diffstat (limited to 'contrib/libs/isa-l/erasure_code/aarch64')
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt | 164
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/Makefile.am                    |  60
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c        | 124
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c    | 264
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S           |  37
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S       | 402
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S        | 168
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S            | 411
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S             | 152
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S       | 361
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S        | 189
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S            | 391
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S             | 175
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S       | 425
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S        | 208
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S            | 464
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S             | 194
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S       | 484
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S        | 237
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S            | 544
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S             | 218
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S        | 258
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S            | 618
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S             | 237
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S        | 281
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S        | 307
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S        | 303
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S         | 132
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S             | 324
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S              | 126
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S             | 240
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S              | 123
-rw-r--r--  contrib/libs/isa-l/erasure_code/aarch64/ya.make                        |  51
33 files changed, 8672 insertions, 0 deletions
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt
new file mode 100644
index 0000000000..8f218b47cb
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt
@@ -0,0 +1,164 @@
+====================BSD-3-Clause====================
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+====================BSD-3-Clause====================
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Arm Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+====================BSD-3-Clause====================
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+====================BSD-3-Clause====================
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+====================BSD-3-Clause====================
+ISA-L is licensed using a BSD 3-clause [license]. All code submitted to
+
+
+====================BSD-3-Clause AND BSD-3-Clause AND BSD-3-Clause====================
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+====================COPYRIGHT====================
+ Copyright(c) 2011-2013 Intel Corporation All rights reserved.
+
+
+====================COPYRIGHT====================
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+
+====================COPYRIGHT====================
+# Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+
+====================COPYRIGHT====================
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+
+====================COPYRIGHT====================
+# Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+
+
+====================COPYRIGHT====================
+# Copyright(c) 2019 Arm Corporation All rights reserved.
+
+
+====================COPYRIGHT====================
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am b/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am
new file mode 100644
index 0000000000..47bbf12d2b
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am
@@ -0,0 +1,60 @@
+##################################################################
+# Copyright (c) 2019 Huawei Technologies Co., Ltd.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Huawei Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_aarch64 += \
+ erasure_code/aarch64/ec_aarch64_highlevel_func.c \
+ erasure_code/aarch64/ec_aarch64_dispatcher.c \
+ erasure_code/aarch64/gf_vect_dot_prod_neon.S \
+ erasure_code/aarch64/gf_2vect_dot_prod_neon.S \
+ erasure_code/aarch64/gf_3vect_dot_prod_neon.S \
+ erasure_code/aarch64/gf_4vect_dot_prod_neon.S \
+ erasure_code/aarch64/gf_5vect_dot_prod_neon.S \
+ erasure_code/aarch64/gf_vect_mad_neon.S \
+ erasure_code/aarch64/gf_2vect_mad_neon.S \
+ erasure_code/aarch64/gf_3vect_mad_neon.S \
+ erasure_code/aarch64/gf_4vect_mad_neon.S \
+ erasure_code/aarch64/gf_5vect_mad_neon.S \
+ erasure_code/aarch64/gf_6vect_mad_neon.S \
+ erasure_code/aarch64/gf_vect_mul_neon.S \
+ erasure_code/aarch64/gf_vect_mad_sve.S \
+ erasure_code/aarch64/gf_2vect_mad_sve.S \
+ erasure_code/aarch64/gf_3vect_mad_sve.S \
+ erasure_code/aarch64/gf_4vect_mad_sve.S \
+ erasure_code/aarch64/gf_5vect_mad_sve.S \
+ erasure_code/aarch64/gf_6vect_mad_sve.S \
+ erasure_code/aarch64/gf_vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_2vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_3vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_4vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_5vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_6vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_7vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_8vect_dot_prod_sve.S \
+ erasure_code/aarch64/gf_vect_mul_sve.S \
+ erasure_code/aarch64/ec_multibinary_arm.S
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c
new file mode 100644
index 0000000000..0a11604076
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c
@@ -0,0 +1,124 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod)
+{
+#if defined(__linux__)
+ unsigned long auxval = getauxval(AT_HWCAP);
+
+ if (auxval & HWCAP_SVE)
+ return PROVIDER_INFO(gf_vect_dot_prod_sve);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(gf_vect_dot_prod_neon);
+#elif defined(__APPLE__)
+ if (sysctlEnabled(SYSCTL_SVE_KEY))
+ return PROVIDER_INFO(gf_vect_dot_prod_sve);
+ return PROVIDER_INFO(gf_vect_dot_prod_neon);
+#endif
+ return PROVIDER_BASIC(gf_vect_dot_prod);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(gf_vect_mad)
+{
+#if defined(__linux__)
+ unsigned long auxval = getauxval(AT_HWCAP);
+
+ if (auxval & HWCAP_SVE)
+ return PROVIDER_INFO(gf_vect_mad_sve);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(gf_vect_mad_neon);
+#elif defined(__APPLE__)
+ if (sysctlEnabled(SYSCTL_SVE_KEY))
+ return PROVIDER_INFO(gf_vect_mad_sve);
+ return PROVIDER_INFO(gf_vect_mad_neon);
+#endif
+ return PROVIDER_BASIC(gf_vect_mad);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(ec_encode_data)
+{
+#if defined(__linux__)
+ unsigned long auxval = getauxval(AT_HWCAP);
+
+ if (auxval & HWCAP_SVE)
+ return PROVIDER_INFO(ec_encode_data_sve);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(ec_encode_data_neon);
+#elif defined(__APPLE__)
+ if (sysctlEnabled(SYSCTL_SVE_KEY))
+ return PROVIDER_INFO(ec_encode_data_sve);
+ return PROVIDER_INFO(ec_encode_data_neon);
+#endif
+ return PROVIDER_BASIC(ec_encode_data);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(ec_encode_data_update)
+{
+#if defined(__linux__)
+ unsigned long auxval = getauxval(AT_HWCAP);
+
+ if (auxval & HWCAP_SVE)
+ return PROVIDER_INFO(ec_encode_data_update_sve);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(ec_encode_data_update_neon);
+#elif defined(__APPLE__)
+ if (sysctlEnabled(SYSCTL_SVE_KEY))
+ return PROVIDER_INFO(ec_encode_data_update_sve);
+ return PROVIDER_INFO(ec_encode_data_update_neon);
+#endif
+ return PROVIDER_BASIC(ec_encode_data_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(gf_vect_mul)
+{
+#if defined(__linux__)
+ unsigned long auxval = getauxval(AT_HWCAP);
+
+ if (auxval & HWCAP_SVE)
+ return PROVIDER_INFO(gf_vect_mul_sve);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(gf_vect_mul_neon);
+#elif defined(__APPLE__)
+ if (sysctlEnabled(SYSCTL_SVE_KEY))
+ return PROVIDER_INFO(gf_vect_mul_sve);
+ return PROVIDER_INFO(gf_vect_mul_neon);
+#endif
+ return PROVIDER_BASIC(gf_vect_mul);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(ec_init_tables)
+{
+ return PROVIDER_BASIC(ec_init_tables);
+}
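
The dispatcher above resolves each interface once at runtime: on Linux it reads the HWCAP bits from getauxval(AT_HWCAP) and prefers SVE over NEON, on macOS it probes a sysctl, and anything else falls back to the portable base code. A minimal standalone C sketch of the same selection pattern follows; select_ec_encode is an illustrative helper, not part of ISA-L, and the PROVIDER_* plumbing from aarch64_multibinary.h is omitted.

/* Hedged sketch of runtime dispatch on AArch64 Linux; not part of the diff.
 * ec_encode_data_{sve,neon,base} stand in for the real providers. */
#include <sys/auxv.h>

#ifndef HWCAP_ASIMD
#define HWCAP_ASIMD (1UL << 1)    /* bit positions per the arm64 uapi hwcap header */
#endif
#ifndef HWCAP_SVE
#define HWCAP_SVE   (1UL << 22)
#endif

typedef void (*ec_encode_fn)(int len, int k, int rows, unsigned char *g_tbls,
                             unsigned char **data, unsigned char **coding);

extern void ec_encode_data_sve(int, int, int, unsigned char *, unsigned char **, unsigned char **);
extern void ec_encode_data_neon(int, int, int, unsigned char *, unsigned char **, unsigned char **);
extern void ec_encode_data_base(int, int, int, unsigned char *, unsigned char **, unsigned char **);

static ec_encode_fn select_ec_encode(void)
{
        unsigned long hwcap = getauxval(AT_HWCAP);

        if (hwcap & HWCAP_SVE)
                return ec_encode_data_sve;      /* widest vectors available */
        if (hwcap & HWCAP_ASIMD)
                return ec_encode_data_neon;     /* baseline 128-bit NEON */
        return ec_encode_data_base;             /* portable scalar fallback */
}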
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c
new file mode 100644
index 0000000000..e001fd72a0
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c
@@ -0,0 +1,264 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "erasure_code.h"
+
+/*external function*/
+extern void gf_vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+extern void gf_2vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_3vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_4vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_5vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char *dest);
+extern void gf_2vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_3vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_4vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_5vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_6vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+
+void ec_encode_data_neon(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
+ unsigned char **coding)
+{
+ if (len < 16) {
+ ec_encode_data_base(len, k, rows, g_tbls, data, coding);
+ return;
+ }
+
+ while (rows > 5) {
+ gf_5vect_dot_prod_neon(len, k, g_tbls, data, coding);
+ g_tbls += 5 * k * 32;
+ coding += 5;
+ rows -= 5;
+ }
+ switch (rows) {
+ case 5:
+ gf_5vect_dot_prod_neon(len, k, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_dot_prod_neon(len, k, g_tbls, data, coding);
+ break;
+ case 3:
+ gf_3vect_dot_prod_neon(len, k, g_tbls, data, coding);
+ break;
+ case 2:
+ gf_2vect_dot_prod_neon(len, k, g_tbls, data, coding);
+ break;
+ case 1:
+ gf_vect_dot_prod_neon(len, k, g_tbls, data, *coding);
+ break;
+ case 0:
+ break;
+ default:
+ break;
+ }
+}
+
+void ec_encode_data_update_neon(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding)
+{
+ if (len < 16) {
+ ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+ return;
+ }
+ while (rows > 6) {
+ gf_6vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
+ }
+ switch (rows) {
+ case 6:
+ gf_6vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 5:
+ gf_5vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 3:
+ gf_3vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 2:
+ gf_2vect_mad_neon(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 1:
+ gf_vect_mad_neon(len, k, vec_i, g_tbls, data, *coding);
+ break;
+ case 0:
+ break;
+ }
+}
+
+/* SVE */
+extern void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+extern void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+extern void gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char *dest);
+extern void gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+extern void gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+
+void ec_encode_data_sve(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
+ unsigned char **coding)
+{
+ if (len < 16) {
+ ec_encode_data_base(len, k, rows, g_tbls, data, coding);
+ return;
+ }
+
+ while (rows > 11) {
+ gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
+ }
+
+ switch (rows) {
+ case 11:
+ /* 7 + 4 */
+ gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ g_tbls += 7 * k * 32;
+ coding += 7;
+ gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ break;
+ case 10:
+ /* 6 + 4 */
+ gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ break;
+ case 9:
+ /* 5 + 4 */
+ gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ g_tbls += 5 * k * 32;
+ coding += 5;
+ gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ break;
+ case 8:
+ /* 4 + 4 */
+ gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ g_tbls += 4 * k * 32;
+ coding += 4;
+ gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ break;
+ case 7:
+ gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ break;
+ case 6:
+ gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ break;
+ case 5:
+ gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ break;
+ case 3:
+ gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ break;
+ case 2:
+ gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding);
+ break;
+ case 1:
+ gf_vect_dot_prod_sve(len, k, g_tbls, data, *coding);
+ break;
+ default:
+ break;
+ }
+}
+
+void ec_encode_data_update_sve(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding)
+{
+ if (len < 16) {
+ ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+ return;
+ }
+ while (rows > 6) {
+ gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
+ }
+ switch (rows) {
+ case 6:
+ gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 5:
+ gf_5vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 3:
+ gf_3vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 2:
+ gf_2vect_mad_sve(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 1:
+ gf_vect_mad_sve(len, k, vec_i, g_tbls, data, *coding);
+ break;
+ default:
+ break;
+ }
+}
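
These wrappers split the parity rows into chunks the assembly kernels can handle (up to 5 or 6 rows at a time for NEON, up to 11 for SVE via the 7+4/6+4/5+4 pairings) and advance g_tbls by rows*k*32 bytes per chunk, since every (row, source) pair owns 32 bytes of lookup tables. Callers normally reach them through the public API instead; a hedged usage sketch follows, assuming ISA-L's erasure_code.h declarations for gf_gen_rs_matrix, ec_init_tables and ec_encode_data (encode_example itself is illustrative).

/* Hedged usage sketch: encode p parity blocks from k data blocks of `len`
 * bytes each. The dispatcher routes ec_encode_data to the NEON or SVE
 * kernels in this directory at runtime; lengths < 16 fall back to the
 * base routines inside the wrappers above. */
#include <stdlib.h>
#include "erasure_code.h"

static void encode_example(int k, int p, int len,
                           unsigned char **data, unsigned char **coding)
{
        unsigned char *a      = malloc((k + p) * k);     /* encode matrix            */
        unsigned char *g_tbls = malloc(k * p * 32);      /* 32 B per (row, src) pair */

        gf_gen_rs_matrix(a, k + p, k);                   /* Reed-Solomon generator   */
        ec_init_tables(k, p, &a[k * k], g_tbls);         /* expand parity rows only  */
        ec_encode_data(len, k, p, g_tbls, data, coding); /* fills the p coding bufs  */

        free(a);
        free(g_tbls);
}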
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S b/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S
new file mode 100644
index 0000000000..c276e63780
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S
@@ -0,0 +1,37 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aarch64_multibinary.h"
+
+mbin_interface ec_encode_data
+mbin_interface gf_vect_mul
+mbin_interface gf_vect_dot_prod
+mbin_interface gf_vect_mad
+mbin_interface ec_encode_data_update
+mbin_interface ec_init_tables
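
Each mbin_interface line pairs one public symbol with the matching DEFINE_INTERFACE_DISPATCHER in ec_aarch64_dispatcher.c. Conceptually it behaves like a lazily initialised function pointer; the C model below is hand-waved (the real macro is assembly in aarch64_multibinary.h, and the _dispatcher name is an assumption about its expansion).

/* Conceptual model only: first call resolves the provider, later calls
 * jump straight through the cached pointer. */
typedef void (*ec_encode_fn)(int, int, int, unsigned char *,
                             unsigned char **, unsigned char **);

extern ec_encode_fn ec_encode_data_dispatcher(void);  /* dispatcher shown earlier */

static ec_encode_fn ec_encode_data_ptr;               /* cached provider          */

void ec_encode_data(int len, int k, int rows, unsigned char *g_tbls,
                    unsigned char **data, unsigned char **coding)
{
        if (!ec_encode_data_ptr)                       /* first call: resolve once */
                ec_encode_data_ptr = ec_encode_data_dispatcher();
        ec_encode_data_ptr(len, k, rows, g_tbls, data, coding);
}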
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S
new file mode 100644
index 0000000000..4ff7e7ce16
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S
@@ -0,0 +1,402 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_2vect_dot_prod_neon)
+#ifndef __APPLE__
+.type gf_2vect_dot_prod_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+x_tmp .req x8
+x_tbl1 .req x9
+x_tbl2 .req x10
+x_dest1 .req x11
+x_dest2 .req x12
+
+/* vectors */
+v_gft1_lo .req v0
+v_gft1_hi .req v1
+v_gft2_lo .req v2
+v_gft2_hi .req v3
+q_gft1_lo .req q0
+q_gft1_hi .req q1
+q_gft2_lo .req q2
+q_gft2_hi .req q3
+
+v_mask0f .req v4
+q_mask0f .req q4
+
+v_tmp1_lo .req v5
+v_tmp1_hi .req v6
+v_tmp1 .req v7
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+v_data_4 .req v12
+v_data_5 .req v13
+v_data_6 .req v14
+v_data_7 .req v15
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+q_data_4 .req q12
+q_data_5 .req q13
+q_data_6 .req q14
+q_data_7 .req q15
+
+v_p1_0 .req v16
+v_p1_1 .req v17
+v_p1_2 .req v18
+v_p1_3 .req v19
+v_p1_4 .req v20
+v_p1_5 .req v21
+v_p1_6 .req v22
+v_p1_7 .req v23
+v_p2_0 .req v24
+v_p2_1 .req v25
+v_p2_2 .req v26
+v_p2_3 .req v27
+v_p2_4 .req v28
+v_p2_5 .req v29
+v_p2_6 .req v30
+v_p2_7 .req v31
+
+q_p1_0 .req q16
+q_p1_1 .req q17
+q_p1_2 .req q18
+q_p1_3 .req q19
+q_p1_4 .req q20
+q_p1_5 .req q21
+q_p1_6 .req q22
+q_p1_7 .req q23
+q_p2_0 .req q24
+q_p2_1 .req q25
+q_p2_2 .req q26
+q_p2_3 .req q27
+q_p2_4 .req q28
+q_p2_5 .req q29
+q_p2_6 .req q30
+q_p2_7 .req q31
+
+v_p1 .req v_p1_0
+q_p1 .req q_p1_0
+v_p2 .req v_p2_0
+q_p2 .req q_p2_0
+v_data .req v_p1_1
+q_data .req q_p1_1
+v_data_lo .req v_p1_2
+v_data_hi .req v_p1_3
+
+cdecl(gf_2vect_dot_prod_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+
+.Lloop128_init:
+ /* less than 128 bytes, goto Lloop16_init */
+ cmp x_len, #128
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_len, x_len, #128
+
+.Lloop128:
+ movi v_p1_0.16b, #0
+ movi v_p1_1.16b, #0
+ movi v_p1_2.16b, #0
+ movi v_p1_3.16b, #0
+ movi v_p1_4.16b, #0
+ movi v_p1_5.16b, #0
+ movi v_p1_6.16b, #0
+ movi v_p1_7.16b, #0
+
+ movi v_p2_0.16b, #0
+ movi v_p2_1.16b, #0
+ movi v_p2_2.16b, #0
+ movi v_p2_3.16b, #0
+ movi v_p2_4.16b, #0
+ movi v_p2_5.16b, #0
+ movi v_p2_6.16b, #0
+ movi v_p2_7.16b, #0
+
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl, x_vec, lsl #2
+ mov x_vec_i, #0
+
+.Lloop128_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ add x_vec_i, x_vec_i, #8
+ add x_ptr, x_ptr, x_pos
+
+ ldp q_data_0, q_data_1, [x_ptr], #32
+ ldp q_data_2, q_data_3, [x_ptr], #32
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+ ldp q_data_4, q_data_5, [x_ptr], #32
+ ldp q_data_6, q_data_7, [x_ptr], #32
+ prfm pldl1strm, [x_ptr]
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1keep, [x_tbl2]
+
+ /* data_0 */
+ and v_tmp1.16b, v_data_0.16b, v_mask0f.16b
+ ushr v_data_0.16b, v_data_0.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
+ eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
+ eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
+
+ /* data_1 */
+ and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
+ ushr v_data_1.16b, v_data_1.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
+ eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
+ eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
+ eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
+ eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
+
+ /* data_2 */
+ and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
+ ushr v_data_2.16b, v_data_2.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
+ eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
+ eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
+ eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
+ eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
+
+ /* data_3 */
+ and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
+ ushr v_data_3.16b, v_data_3.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
+ eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
+ eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
+ eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
+ eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
+
+ /* data_4 */
+ and v_tmp1.16b, v_data_4.16b, v_mask0f.16b
+ ushr v_data_4.16b, v_data_4.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4.16b
+ eor v_p1_4.16b, v_tmp1_lo.16b, v_p1_4.16b
+ eor v_p1_4.16b, v_p1_4.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_4.16b
+ eor v_p2_4.16b, v_tmp1_lo.16b, v_p2_4.16b
+ eor v_p2_4.16b, v_p2_4.16b, v_tmp1_hi.16b
+
+ /* data_5 */
+ and v_tmp1.16b, v_data_5.16b, v_mask0f.16b
+ ushr v_data_5.16b, v_data_5.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_5.16b
+ eor v_p1_5.16b, v_tmp1_lo.16b, v_p1_5.16b
+ eor v_p1_5.16b, v_p1_5.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_5.16b
+ eor v_p2_5.16b, v_tmp1_lo.16b, v_p2_5.16b
+ eor v_p2_5.16b, v_p2_5.16b, v_tmp1_hi.16b
+
+ /* data_6 */
+ and v_tmp1.16b, v_data_6.16b, v_mask0f.16b
+ ushr v_data_6.16b, v_data_6.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6.16b
+ eor v_p1_6.16b, v_tmp1_lo.16b, v_p1_6.16b
+ eor v_p1_6.16b, v_p1_6.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_6.16b
+ eor v_p2_6.16b, v_tmp1_lo.16b, v_p2_6.16b
+ eor v_p2_6.16b, v_p2_6.16b, v_tmp1_hi.16b
+
+ /* data_7 */
+ and v_tmp1.16b, v_data_7.16b, v_mask0f.16b
+ ushr v_data_7.16b, v_data_7.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_7.16b
+ eor v_p1_7.16b, v_tmp1_lo.16b, v_p1_7.16b
+ eor v_p1_7.16b, v_p1_7.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_7.16b
+ eor v_p2_7.16b, v_tmp1_lo.16b, v_p2_7.16b
+ eor v_p2_7.16b, v_p2_7.16b, v_tmp1_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop128_vects
+
+.Lloop128_vects_end:
+ add x_ptr, x_dest1, x_pos
+ stp q_p1_0, q_p1_1, [x_ptr], #32
+ stp q_p1_2, q_p1_3, [x_ptr], #32
+ stp q_p1_4, q_p1_5, [x_ptr], #32
+ stp q_p1_6, q_p1_7, [x_ptr]
+
+ add x_ptr, x_dest2, x_pos
+ stp q_p2_0, q_p2_1, [x_ptr], #32
+ stp q_p2_2, q_p2_3, [x_ptr], #32
+ stp q_p2_4, q_p2_5, [x_ptr], #32
+ stp q_p2_6, q_p2_7, [x_ptr]
+
+ add x_pos, x_pos, #128
+ cmp x_pos, x_len
+ ble .Lloop128
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_len, x_len, #128
+ cmp x_pos, x_len
+ beq .return_pass
+
+.Lloop16_init:
+ sub x_len, x_len, #16
+ cmp x_pos, x_len
+ bgt .lessthan16_init
+
+.Lloop16:
+ movi v_p1.16b, #0
+ movi v_p2.16b, #0
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl, x_vec, lsl #2
+ mov x_vec_i, #0
+
+.Lloop16_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ ldr q_data, [x_ptr, x_pos]
+ add x_vec_i, x_vec_i, #8
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ eor v_p1.16b, v_tmp1_lo.16b, v_p1.16b
+ eor v_p1.16b, v_p1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ eor v_p2.16b, v_tmp1_lo.16b, v_p2.16b
+ eor v_p2.16b, v_p2.16b, v_tmp1_hi.16b
+
+ cmp x_vec_i, x_vec
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ str q_p1, [x_dest1, x_pos]
+ str q_p2, [x_dest2, x_pos]
+ add x_pos, x_pos, #16
+ cmp x_pos, x_len
+ ble .Lloop16
+
+.Lloop16_end:
+ sub x_tmp, x_pos, x_len
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16_init:
+ mov x_pos, x_len
+ b .Lloop16
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
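
The kernel's inner blocks all follow one pattern: split each data byte into its low and high nibble, use each nibble to index a 16-byte table with tbl, and XOR both lookups into the running parity. A scalar C model of one tbl/eor group is sketched below; gft_lo/gft_hi stand for one 32-byte slice of g_tbls and the function name is illustrative.

/* Scalar model of the and/ushr/tbl/eor sequence above: one source byte,
 * one accumulator byte, split-nibble GF(2^8) multiply tables. */
static inline unsigned char gf_mul_acc(unsigned char acc, unsigned char src,
                                       const unsigned char gft_lo[16],
                                       const unsigned char gft_hi[16])
{
        unsigned char lo = gft_lo[src & 0x0f];   /* and v_tmp1, v_data, v_mask0f ; tbl */
        unsigned char hi = gft_hi[src >> 4];     /* ushr v_data, #4 ; tbl              */
        return acc ^ lo ^ hi;                    /* the two eor instructions           */
}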
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S
new file mode 100644
index 0000000000..99b5f15cfb
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S
@@ -0,0 +1,168 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_2vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_2vect_dot_prod_sve, %function
+#endif
+/* void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+
+x_tbl1 .req x8
+x_tbl2 .req x9
+x_dest1 .req x10
+x_dest2 .req x_dest /* reused */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_dest2 .req z27
+
+cdecl(gf_2vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+ /* load gf_table's */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* calc for next */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
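
Unlike the NEON version, this SVE kernel is vector-length agnostic: whilelo builds a predicate covering min(VL, len - pos) bytes, every load and store is governed by it, and incb advances the position by one hardware vector, so no separate tail loop is needed. A rough C rendering of that loop shape for a single source block follows (illustrative only; vl would come from svcntb() on real hardware, and dest is assumed cleared per position as the kernel does with mov z_dest.b, #0).

/* Loop skeleton mirroring .Lloopsve_vl: the predicate trims the final
 * partial vector, accumulating one source block's contribution into dest. */
static void dot_prod_sve_shape(int len, int vl,
                               const unsigned char *src, unsigned char *dest,
                               const unsigned char gft_lo[16],
                               const unsigned char gft_hi[16])
{
        for (int pos = 0; pos < len; pos += vl) {            /* whilelo ... incb */
                int active = (len - pos < vl) ? (len - pos) : vl;
                for (int i = 0; i < active; i++) {           /* governed lanes   */
                        unsigned char b = src[pos + i];      /* ld1b z_src, p0/z */
                        dest[pos + i] ^= gft_lo[b & 0x0f]    /* tbl + eor        */
                                       ^ gft_hi[b >> 4];
                }
        }
}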
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S
new file mode 100644
index 0000000000..453524a221
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S
@@ -0,0 +1,411 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_2vect_mad_neon)
+#ifndef __APPLE__
+.type gf_2vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_src_end .req x6
+x_dest1 .req x7
+x_dest2 .req x8
+x_tmp .req x9
+x_tbl1 .req x10
+x_tbl2 .req x11
+x_const .req x12
+
+/* vectors */
+v_mask0f .req v0
+v_tmp_lo .req v1
+v_tmp_hi .req v2
+v_tmp .req v3
+q_tmp .req q3
+
+v_gft1_lo .req v4
+v_gft1_hi .req v5
+v_gft2_lo .req v6
+v_gft2_hi .req v7
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+q_gft2_lo .req q6
+q_gft2_hi .req q7
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+v_data_4 .req v12
+v_data_5 .req v13
+v_data_6 .req v14
+v_data_7 .req v15
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+q_data_4 .req q12
+q_data_5 .req q13
+q_data_6 .req q14
+q_data_7 .req q15
+
+v_data_0_lo .req v16
+v_data_1_lo .req v17
+v_data_2_lo .req v18
+v_data_3_lo .req v19
+v_data_4_lo .req v20
+v_data_5_lo .req v21
+v_data_6_lo .req v22
+v_data_7_lo .req v23
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+v_data_4_hi .req v_data_4
+v_data_5_hi .req v_data_5
+v_data_6_hi .req v_data_6
+v_data_7_hi .req v_data_7
+
+v_d0 .req v24
+v_d1 .req v25
+v_d2 .req v26
+v_d3 .req v27
+v_d4 .req v28
+v_d5 .req v29
+v_d6 .req v30
+v_d7 .req v31
+q_d0 .req q24
+q_d1 .req q25
+q_d2 .req q26
+q_d3 .req q27
+q_d4 .req q28
+q_d5 .req q29
+q_d6 .req q30
+q_d7 .req q31
+
+v_data .req v16
+q_data .req q16
+v_data_lo .req v17
+v_data_hi .req v18
+
+
+cdecl(gf_2vect_mad_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ lsl x_vec_i, x_vec_i, #5
+ lsl x_vec, x_vec, #5
+ add x_tbl1, x_tbl, x_vec_i
+ add x_tbl2, x_tbl1, x_vec
+ add x_src_end, x_src, x_len
+
+ ldr x_dest1, [x_dest]
+ ldr x_dest2, [x_dest, #8]
+ ldr q_gft1_lo, [x_tbl1]
+ ldr q_gft1_hi, [x_tbl1, #16]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+
+.Lloop128_init:
+ /* less than 128 bytes, goto Lloop16_init */
+ cmp x_len, #128
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #128
+
+.Lloop128:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ ldr q_data_4, [x_src, #16*4]
+ ldr q_data_5, [x_src, #16*5]
+ ldr q_data_6, [x_src, #16*6]
+ ldr q_data_7, [x_src, #16*7]
+
+ ldr q_d0, [x_dest1, #16*0]
+ ldr q_d1, [x_dest1, #16*1]
+ ldr q_d2, [x_dest1, #16*2]
+ ldr q_d3, [x_dest1, #16*3]
+ ldr q_d4, [x_dest1, #16*4]
+ ldr q_d5, [x_dest1, #16*5]
+ ldr q_d6, [x_dest1, #16*6]
+ ldr q_d7, [x_dest1, #16*7]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+ and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
+ and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
+ and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
+ and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+ ushr v_data_4_hi.16b, v_data_4.16b, #4
+ ushr v_data_5_hi.16b, v_data_5.16b, #4
+ ushr v_data_6_hi.16b, v_data_6.16b, #4
+ ushr v_data_7_hi.16b, v_data_7.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
+ eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
+ eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ eor v_d2.16b, v_tmp_lo.16b, v_d2.16b
+ eor v_d2.16b, v_d2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ eor v_d3.16b, v_tmp_lo.16b, v_d3.16b
+ eor v_d3.16b, v_d3.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
+ eor v_d4.16b, v_tmp_lo.16b, v_d4.16b
+ eor v_d4.16b, v_d4.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
+ eor v_d5.16b, v_tmp_lo.16b, v_d5.16b
+ eor v_d5.16b, v_d5.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
+ eor v_d6.16b, v_tmp_lo.16b, v_d6.16b
+ eor v_d6.16b, v_d6.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
+ eor v_d7.16b, v_tmp_lo.16b, v_d7.16b
+ eor v_d7.16b, v_d7.16b, v_tmp_hi.16b
+
+ str q_d0, [x_dest1, #16*0]
+ str q_d1, [x_dest1, #16*1]
+ str q_d2, [x_dest1, #16*2]
+ str q_d3, [x_dest1, #16*3]
+ str q_d4, [x_dest1, #16*4]
+ str q_d5, [x_dest1, #16*5]
+ str q_d6, [x_dest1, #16*6]
+ str q_d7, [x_dest1, #16*7]
+
+ ldr q_d0, [x_dest2, #16*0]
+ ldr q_d1, [x_dest2, #16*1]
+ ldr q_d2, [x_dest2, #16*2]
+ ldr q_d3, [x_dest2, #16*3]
+ ldr q_d4, [x_dest2, #16*4]
+ ldr q_d5, [x_dest2, #16*5]
+ ldr q_d6, [x_dest2, #16*6]
+ ldr q_d7, [x_dest2, #16*7]
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
+ eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
+ eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
+ eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
+ eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
+ eor v_d2.16b, v_tmp_lo.16b, v_d2.16b
+ eor v_d2.16b, v_d2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
+ eor v_d3.16b, v_tmp_lo.16b, v_d3.16b
+ eor v_d3.16b, v_d3.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_4_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_4_hi.16b
+ eor v_d4.16b, v_tmp_lo.16b, v_d4.16b
+ eor v_d4.16b, v_d4.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_5_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_5_hi.16b
+ eor v_d5.16b, v_tmp_lo.16b, v_d5.16b
+ eor v_d5.16b, v_d5.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_6_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_6_hi.16b
+ eor v_d6.16b, v_tmp_lo.16b, v_d6.16b
+ eor v_d6.16b, v_d6.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_7_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_7_hi.16b
+ eor v_d7.16b, v_tmp_lo.16b, v_d7.16b
+ eor v_d7.16b, v_d7.16b, v_tmp_hi.16b
+
+ str q_d0, [x_dest2, #16*0]
+ str q_d1, [x_dest2, #16*1]
+ str q_d2, [x_dest2, #16*2]
+ str q_d3, [x_dest2, #16*3]
+ str q_d4, [x_dest2, #16*4]
+ str q_d5, [x_dest2, #16*5]
+ str q_d6, [x_dest2, #16*6]
+ str q_d7, [x_dest2, #16*7]
+
+ add x_src, x_src, #128
+ add x_dest1, x_dest1, #128
+ add x_dest2, x_dest2, #128
+ cmp x_src, x_src_end
+ bls .Lloop128
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #128
+
+.Lloop16_init:
+ sub x_src_end, x_src_end, #16
+ cmp x_src, x_src_end
+ bhi .lessthan16_init
+
+.Lloop16:
+ ldr q_data, [x_src]
+
+ ldr q_d0, [x_dest1]
+ ldr q_d1, [x_dest2]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d0.16b, v_tmp_lo.16b, v_d0.16b
+ eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_d1.16b, v_tmp_lo.16b, v_d1.16b
+ eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
+
+ str q_d0, [x_dest1]
+ str q_d1, [x_dest2]
+
+ add x_dest1, x_dest1, #16
+ add x_dest2, x_dest2, #16
+ add x_src, x_src, #16
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16:
+ mov x_src, x_src_end
+ sub x_dest1, x_dest1, x_tmp
+ sub x_dest2, x_dest2, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16]
+
+ ldr q_data, [x_src]
+ ldr q_d0, [x_dest1]
+ ldr q_d1, [x_dest2]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d0.16b, v_d0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d1.16b, v_d1.16b, v_tmp_hi.16b
+
+ str q_d0, [x_dest1]
+ str q_d1, [x_dest2]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+ASM_DEF_RODATA
+.balign 8
+const_tbl:
+ .dword 0x0000000000000000, 0x0000000000000000
+ .dword 0xffffffffffffffff, 0xffffffffffffffff
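
A note on the .lessthan16 path above: when the buffer length is not a multiple of 16, the kernel steps the src and dest pointers back by the overshoot, reloads a full 16-byte block that ends exactly at the buffer end, and masks the freshly computed product with bytes taken from const_tbl (16 zero bytes followed by 16 0xff bytes). Lanes already handled by the previous iteration therefore receive a zero contribution and are left unchanged. A minimal scalar C sketch of the same masking idea follows; the names are illustrative and not part of the library API.

#include <stdint.h>

/* mirrors the .dword table above: 16 zero bytes, then 16 0xff bytes */
static const uint8_t const_tbl[32] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/* overshoot = bytes at the start of the reloaded block that were already
 * processed (kept in x_tmp by the asm, 1..15). product[] stands for the
 * tbl-based GF(2^8) multiply of the reloaded 16 source bytes. */
static void xor_masked_tail(uint8_t dest[16], const uint8_t product[16], int overshoot)
{
    /* same bytes the asm loads via "sub x_const, x_const, x_tmp" followed by
     * "ldr q_tmp, [x_const, #16]": overshoot zeros, then 0xff for the rest */
    const uint8_t *mask = const_tbl + 16 - overshoot;

    for (int i = 0; i < 16; i++)
        dest[i] ^= product[i] & mask[i];
}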
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S
new file mode 100644
index 0000000000..f0ddf01187
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S
@@ -0,0 +1,152 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_2vect_mad_sve)
+#ifndef __APPLE__
+.type gf_2vect_mad_sve, %function
+#endif
+
+/* gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x6
+x_dest2 .req x7
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_tmp_lo .req z4
+z_tmp_hi .req z5
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_dest2 .req z27
+
+cdecl(gf_2vect_mad_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ /* load table 1 */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+	/* load table 1 with the NEON ldp instruction */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+ /* load table 2 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ /* prefetch dest data */
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+
+ /* dest1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
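
All of the *_mad_* kernels in this patch compute the same thing: for every destination row, dest[i] ^= gf_mul(coefficient, src[i]) over GF(2^8), with the multiply done as two 16-entry table lookups (tbl) selected by the low and high nibble of each source byte. Assuming the usual isa-l expansion of 32 table bytes per (destination, source) pair, low-nibble products in the first 16 bytes and high-nibble products in the second 16, a scalar reference for one destination row looks roughly like this (illustrative sketch, not the library implementation):

#include <stddef.h>
#include <stdint.h>

/* tbl[0..15]  = coeff * x        for x = 0..15 (low nibble)
 * tbl[16..31] = coeff * (x << 4) for x = 0..15 (high nibble), both in GF(2^8) */
static void gf_vect_mad_ref(size_t len, const uint8_t tbl[32],
                            const uint8_t *src, uint8_t *dest)
{
    for (size_t i = 0; i < len; i++) {
        uint8_t lo = src[i] & 0x0f;
        uint8_t hi = src[i] >> 4;
        /* coeff * src[i] = (coeff * lo) ^ (coeff * (hi << 4)); xor-accumulate */
        dest[i] ^= (uint8_t)(tbl[lo] ^ tbl[16 + hi]);
    }
}

The 2-, 3- and 4-destination variants simply repeat this with a different 32-byte table per destination row; the NEON code additionally unrolls four 16-byte blocks at a time while at least 64 bytes remain.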
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S
new file mode 100644
index 0000000000..cff34fc3dd
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S
@@ -0,0 +1,361 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_3vect_dot_prod_neon)
+#ifndef __APPLE__
+.type gf_3vect_dot_prod_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+x_tmp .req x8
+x_dest1 .req x9
+x_tbl1 .req x10
+x_dest2 .req x11
+x_tbl2 .req x12
+x_dest3 .req x13
+x_tbl3 .req x14
+
+/* vectors */
+v_gft1_lo .req v0
+v_gft1_hi .req v1
+v_gft2_lo .req v2
+v_gft2_hi .req v3
+v_gft3_lo .req v4
+v_gft3_hi .req v5
+q_gft1_lo .req q0
+q_gft1_hi .req q1
+q_gft2_lo .req q2
+q_gft2_hi .req q3
+q_gft3_lo .req q4
+q_gft3_hi .req q5
+
+v_mask0f .req v6
+q_mask0f .req q6
+v_tmp1 .req v7
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_tmp1_lo .req v12
+v_tmp1_hi .req v13
+
+v_p1_0 .req v20
+v_p1_1 .req v21
+v_p1_2 .req v22
+v_p1_3 .req v23
+v_p2_0 .req v24
+v_p2_1 .req v25
+v_p2_2 .req v26
+v_p2_3 .req v27
+v_p3_0 .req v28
+v_p3_1 .req v29
+v_p3_2 .req v30
+v_p3_3 .req v31
+
+q_p1_0 .req q20
+q_p1_1 .req q21
+q_p1_2 .req q22
+q_p1_3 .req q23
+q_p2_0 .req q24
+q_p2_1 .req q25
+q_p2_2 .req q26
+q_p2_3 .req q27
+q_p3_0 .req q28
+q_p3_1 .req q29
+q_p3_2 .req q30
+q_p3_3 .req q31
+
+v_data .req v_p1_1
+q_data .req q_p1_1
+v_data_lo .req v_p1_2
+v_data_hi .req v_p1_3
+
+
+cdecl(gf_3vect_dot_prod_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_len, x_len, #64
+
+.Lloop64:
+ movi v_p1_0.16b, #0
+ movi v_p1_1.16b, #0
+ movi v_p1_2.16b, #0
+ movi v_p1_3.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p2_1.16b, #0
+ movi v_p2_2.16b, #0
+ movi v_p2_3.16b, #0
+ movi v_p3_0.16b, #0
+ movi v_p3_1.16b, #0
+ movi v_p3_2.16b, #0
+ movi v_p3_3.16b, #0
+
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl1, x_vec, lsl #2
+ add x_tbl3, x_tbl2, x_vec, lsl #2
+ mov x_vec_i, #0
+
+.Lloop64_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ add x_vec_i, x_vec_i, #8
+ add x_ptr, x_ptr, x_pos
+
+ ldr q_data_0, [x_ptr], #16
+ ldr q_data_1, [x_ptr], #16
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+
+ ldr q_data_2, [x_ptr], #16
+ ldr q_data_3, [x_ptr], #16
+ prfm pldl1strm, [x_ptr]
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1keep, [x_tbl2]
+ prfm pldl1keep, [x_tbl3]
+
+ /* data_0 */
+ and v_tmp1.16b, v_data_0.16b, v_mask0f.16b
+ ushr v_data_0.16b, v_data_0.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
+ eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
+ eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
+ eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b
+
+ /* data_1 */
+ and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
+ ushr v_data_1.16b, v_data_1.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
+ eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
+ eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
+ eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
+ eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
+ eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
+ eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b
+
+ /* data_2 */
+ and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
+ ushr v_data_2.16b, v_data_2.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
+ eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
+ eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
+ eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
+ eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
+ eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
+ eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b
+
+ /* data_3 */
+ and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
+ ushr v_data_3.16b, v_data_3.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
+ eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
+ eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
+ eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
+ eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
+ eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
+ eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop64_vects
+
+.Lloop64_vects_end:
+ add x_ptr, x_dest1, x_pos
+ stp q_p1_0, q_p1_1, [x_ptr], #32
+ stp q_p1_2, q_p1_3, [x_ptr]
+
+ add x_ptr, x_dest2, x_pos
+ stp q_p2_0, q_p2_1, [x_ptr], #32
+ stp q_p2_2, q_p2_3, [x_ptr]
+
+ add x_ptr, x_dest3, x_pos
+ stp q_p3_0, q_p3_1, [x_ptr], #32
+ stp q_p3_2, q_p3_3, [x_ptr]
+
+ add x_pos, x_pos, #64
+ cmp x_pos, x_len
+ ble .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_len, x_len, #64
+ cmp x_pos, x_len
+ beq .return_pass
+
+.Lloop16_init:
+ sub x_len, x_len, #16
+ cmp x_pos, x_len
+ bgt .lessthan16_init
+
+.Lloop16:
+ movi v_p1_0.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p3_0.16b, #0
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl1, x_vec, lsl #2
+ add x_tbl3, x_tbl2, x_vec, lsl #2
+ mov x_vec_i, #0
+
+.Lloop16_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ add x_vec_i, x_vec_i, #8
+ ldr q_data, [x_ptr, x_pos]
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+
+ eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
+ eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
+ eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
+
+ cmp x_vec_i, x_vec
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ str q_p1_0, [x_dest1, x_pos]
+ str q_p2_0, [x_dest2, x_pos]
+ str q_p3_0, [x_dest3, x_pos]
+ add x_pos, x_pos, #16
+ cmp x_pos, x_len
+ ble .Lloop16
+
+.Lloop16_end:
+ sub x_tmp, x_pos, x_len
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16_init:
+ mov x_pos, x_len
+ b .Lloop16
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
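
Unlike the mad kernels, the dot_prod kernels recompute each destination byte from scratch: the accumulators are cleared with movi, the contributions of all x_vec source buffers are XORed in, and only then is the result stored. That is also why the NEON tail handling above can simply rewind x_pos to len - 16 and rerun the 16-byte loop over an overlapping window; rewriting already-correct output bytes with the same value is harmless, so no byte mask is needed (the mad kernels cannot do this, hence their const_tbl trick). A scalar sketch for a single destination row, under the same 32-bytes-per-(dest,src) table layout assumed above:

#include <stddef.h>
#include <stdint.h>

/* dest[i] = XOR over k of gf_mul(coeff[k], src[k][i]); each coefficient is
 * pre-expanded into a 32-byte nibble table at gftbls + k * 32. */
static void gf_vect_dot_prod_ref(size_t len, int vec, const uint8_t *gftbls,
                                 uint8_t **src, uint8_t *dest)
{
    for (size_t i = 0; i < len; i++) {
        uint8_t p = 0;
        for (int k = 0; k < vec; k++) {
            const uint8_t *tbl = gftbls + (size_t)k * 32;
            uint8_t s = src[k][i];
            p ^= (uint8_t)(tbl[s & 0x0f] ^ tbl[16 + (s >> 4)]);
        }
        dest[i] = p;
    }
}

The 3vect version runs this for three destination rows, taking the tables for row d from gftbls + d * vec * 32.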
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S
new file mode 100644
index 0000000000..8f6414ee52
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S
@@ -0,0 +1,189 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_3vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_3vect_dot_prod_sve, %function
+#endif
+/* void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+
+x_tbl1 .req x8
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_dest1 .req x11
+x_dest2 .req x12
+x_dest3 .req x_dest /* reused */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_dest2 .req z27
+z_dest3 .req z28
+
+cdecl(gf_3vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldr x_dest3, [x_dest, #8*2]
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+ /* load gf_table's */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* calc for next */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ prfb pldl2keep, p0, [x_tbl3]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
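
The SVE variants need none of the 64-byte / 16-byte / tail split used by the NEON code. whilelo p0.b, x_pos, x_len builds a per-byte predicate that is all-true for full vectors and only partially true for the final chunk, every ld1b/st1b is governed by p0, and incb advances x_pos by whatever the hardware vector length happens to be. The control flow is equivalent to the following scalar shape, where vl stands for the vector length (unknown at build time) and process() for the predicated ld1b/tbl/eor/st1b body; both names are purely illustrative:

#include <stddef.h>

static void vla_loop_shape(size_t len, size_t vl,
                           void (*process)(size_t pos, size_t active))
{
    for (size_t pos = 0; pos < len; pos += vl) {           /* incb x_pos      */
        size_t active = (len - pos < vl) ? len - pos : vl; /* whilelo p0.b    */
        process(pos, active);                              /* predicated body */
    }
}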
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S
new file mode 100644
index 0000000000..fcfeec1e23
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S
@@ -0,0 +1,391 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_3vect_mad_neon)
+#ifndef __APPLE__
+.type gf_3vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_src_end .req x6
+x_dest1 .req x7
+x_dest2 .req x8
+x_dest3 .req x_dest
+x_tmp .req x10
+x_tbl1 .req x11
+x_tbl2 .req x12
+x_tbl3 .req x13
+x_const .req x14
+
+/* vectors */
+v_mask0f .req v0
+v_tmp_lo .req v1
+v_tmp_hi .req v2
+v_tmp .req v3
+q_tmp .req q3
+
+v_gft1_lo .req v4
+v_gft1_hi .req v5
+v_gft2_lo .req v6
+v_gft2_hi .req v7
+v_gft3_lo .req v16
+v_gft3_hi .req v17
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+q_gft2_lo .req q6
+q_gft2_hi .req q7
+q_gft3_lo .req q16
+q_gft3_hi .req q17
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_data_0_lo .req v12
+v_data_1_lo .req v13
+v_data_2_lo .req v14
+v_data_3_lo .req v15
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+
+v_d1_0 .req v20
+v_d1_1 .req v21
+v_d1_2 .req v22
+v_d1_3 .req v23
+v_d2_0 .req v24
+v_d2_1 .req v25
+v_d2_2 .req v26
+v_d2_3 .req v27
+v_d3_0 .req v28
+v_d3_1 .req v29
+v_d3_2 .req v30
+v_d3_3 .req v31
+q_d1_0 .req q20
+q_d1_1 .req q21
+q_d1_2 .req q22
+q_d1_3 .req q23
+q_d2_0 .req q24
+q_d2_1 .req q25
+q_d2_2 .req q26
+q_d2_3 .req q27
+q_d3_0 .req q28
+q_d3_1 .req q29
+q_d3_2 .req q30
+q_d3_3 .req q31
+
+v_data .req v21
+q_data .req q21
+v_data_lo .req v22
+v_data_hi .req v23
+
+cdecl(gf_3vect_mad_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ lsl x_vec_i, x_vec_i, #5
+ lsl x_vec, x_vec, #5
+ add x_tbl1, x_tbl, x_vec_i
+ add x_tbl2, x_tbl1, x_vec
+ add x_tbl3, x_tbl2, x_vec
+ add x_src_end, x_src, x_len
+ ldr x_dest1, [x_dest]
+ ldr x_dest2, [x_dest, #8]
+ ldr x_dest3, [x_dest, #16]
+ ldr q_gft1_lo, [x_tbl1]
+ ldr q_gft1_hi, [x_tbl1, #16]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #64
+
+.Lloop64:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ add x_src, x_src, #64
+
+ ldr q_d1_0, [x_dest1, #16*0]
+ ldr q_d1_1, [x_dest1, #16*1]
+ ldr q_d1_2, [x_dest1, #16*2]
+ ldr q_d1_3, [x_dest1, #16*3]
+
+ ldr q_d2_0, [x_dest2, #16*0]
+ ldr q_d2_1, [x_dest2, #16*1]
+ ldr q_d2_2, [x_dest2, #16*2]
+ ldr q_d2_3, [x_dest2, #16*3]
+
+ ldr q_d3_0, [x_dest3, #16*0]
+ ldr q_d3_1, [x_dest3, #16*1]
+ ldr q_d3_2, [x_dest3, #16*2]
+ ldr q_d3_3, [x_dest3, #16*3]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
+ eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
+ eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
+ eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
+ eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
+ eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
+ eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
+ eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
+ eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
+ eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
+ eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
+ eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
+ eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
+ eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
+ eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
+ eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1, #16*0]
+ str q_d1_1, [x_dest1, #16*1]
+ str q_d1_2, [x_dest1, #16*2]
+ str q_d1_3, [x_dest1, #16*3]
+ add x_dest1, x_dest1, #64
+
+ str q_d2_0, [x_dest2, #16*0]
+ str q_d2_1, [x_dest2, #16*1]
+ str q_d2_2, [x_dest2, #16*2]
+ str q_d2_3, [x_dest2, #16*3]
+ add x_dest2, x_dest2, #64
+
+ str q_d3_0, [x_dest3, #16*0]
+ str q_d3_1, [x_dest3, #16*1]
+ str q_d3_2, [x_dest3, #16*2]
+ str q_d3_3, [x_dest3, #16*3]
+ add x_dest3, x_dest3, #64
+
+ cmp x_src, x_src_end
+ bls .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #64
+
+.Lloop16_init:
+ sub x_src_end, x_src_end, #16
+ cmp x_src, x_src_end
+ bhi .lessthan16_init
+
+.Lloop16:
+ ldr q_data, [x_src]
+
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+ add x_src, x_src, #16
+ add x_dest1, x_dest1, #16
+ add x_dest2, x_dest2, #16
+ add x_dest3, x_dest3, #16
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16:
+ mov x_src, x_src_end
+ sub x_dest1, x_dest1, x_tmp
+ sub x_dest2, x_dest2, x_tmp
+ sub x_dest3, x_dest3, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16]
+
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+ASM_DEF_RODATA
+.balign 8
+const_tbl:
+ .dword 0x0000000000000000, 0x0000000000000000
+ .dword 0xffffffffffffffff, 0xffffffffffffffff
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S
new file mode 100644
index 0000000000..9e0ca5c4b3
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S
@@ -0,0 +1,175 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_3vect_mad_sve)
+#ifndef __APPLE__
+.type gf_3vect_mad_sve, %function
+#endif
+
+/* gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x6
+x_dest2 .req x7
+x_dest3 .req x8
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_tmp_lo .req z4
+z_tmp_hi .req z5
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_dest2 .req z27
+z_dest3 .req z28
+
+cdecl(gf_3vect_mad_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ /* load table 1 */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+	/* load table 1 with the NEON ldp instruction */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+ /* load table 2 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+ /* load table 3 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl]
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+ ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ /* dest data prefetch */
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+ prfb pldl2strm, p0, [x_dest3, x_pos]
+
+ /* dest1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+
+ ld1b z_dest3.b, p0/z, [x_dest3, x_pos]
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ /* dest3 */
+ tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_tmp_lo.d, z_dest3.d
+ eor z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S
new file mode 100644
index 0000000000..6204102f68
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S
@@ -0,0 +1,425 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_4vect_dot_prod_neon)
+#ifndef __APPLE__
+.type gf_4vect_dot_prod_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+x_tmp .req x8
+x_dest1 .req x9
+x_tbl1 .req x10
+x_dest2 .req x11
+x_tbl2 .req x12
+x_dest3 .req x13
+x_tbl3 .req x14
+x_dest4 .req x_dest
+x_tbl4 .req x15
+
+/* vectors */
+v_mask0f .req v0
+q_mask0f .req q0
+v_tmp1_lo .req v1
+v_tmp1_hi .req v2
+v_tmp1 .req v3
+q_tmp1 .req q3
+
+v_p1_0 .req v4
+v_p2_0 .req v5
+v_p3_0 .req v6
+v_p4_0 .req v7
+
+q_p1_0 .req q4
+q_p2_0 .req q5
+q_p3_0 .req q6
+q_p4_0 .req q7
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_p1_3 .req v12
+v_p2_3 .req v13
+v_p3_3 .req v14
+v_p4_3 .req v15
+q_p1_3 .req q12
+q_p2_3 .req q13
+q_p3_3 .req q14
+q_p4_3 .req q15
+
+v_gft1_lo .req v16
+v_gft1_hi .req v17
+v_gft2_lo .req v18
+v_gft2_hi .req v19
+v_gft3_lo .req v20
+v_gft3_hi .req v21
+v_gft4_lo .req v22
+v_gft4_hi .req v23
+q_gft1_lo .req q16
+q_gft1_hi .req q17
+q_gft2_lo .req q18
+q_gft2_hi .req q19
+q_gft3_lo .req q20
+q_gft3_hi .req q21
+q_gft4_lo .req q22
+q_gft4_hi .req q23
+
+v_p1_1 .req v24
+v_p1_2 .req v25
+v_p2_1 .req v26
+v_p2_2 .req v27
+v_p3_1 .req v28
+v_p3_2 .req v29
+v_p4_1 .req v30
+v_p4_2 .req v31
+
+q_p1_1 .req q24
+q_p1_2 .req q25
+q_p2_1 .req q26
+q_p2_2 .req q27
+q_p3_1 .req q28
+q_p3_2 .req q29
+q_p4_1 .req q30
+q_p4_2 .req q31
+
+v_data .req v_tmp1
+q_data .req q_tmp1
+v_data_lo .req v_tmp1_lo
+v_data_hi .req v_tmp1_hi
+
+cdecl(gf_4vect_dot_prod_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+ ldr x_dest4, [x_dest, #8*3]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_len, x_len, #64
+
+.Lloop64:
+ movi v_p1_0.16b, #0
+ movi v_p1_1.16b, #0
+ movi v_p1_2.16b, #0
+ movi v_p1_3.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p2_1.16b, #0
+ movi v_p2_2.16b, #0
+ movi v_p2_3.16b, #0
+ movi v_p3_0.16b, #0
+ movi v_p3_1.16b, #0
+ movi v_p3_2.16b, #0
+ movi v_p3_3.16b, #0
+ movi v_p4_0.16b, #0
+ movi v_p4_1.16b, #0
+ movi v_p4_2.16b, #0
+ movi v_p4_3.16b, #0
+
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl1, x_vec, lsl #2
+ add x_tbl3, x_tbl2, x_vec, lsl #2
+ add x_tbl4, x_tbl3, x_vec, lsl #2
+ mov x_vec_i, #0
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1keep, [x_tbl2]
+ prfm pldl1keep, [x_tbl3]
+ prfm pldl1keep, [x_tbl4]
+
+.Lloop64_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ add x_vec_i, x_vec_i, #8
+ add x_ptr, x_ptr, x_pos
+
+ ldr q_data_0, [x_ptr], #16
+ ldr q_data_1, [x_ptr], #16
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ ldr q_data_2, [x_ptr], #16
+ ldr q_data_3, [x_ptr], #16
+
+ prfm pldl1strm, [x_ptr]
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1keep, [x_tbl2]
+ prfm pldl1keep, [x_tbl3]
+ prfm pldl1keep, [x_tbl4]
+
+ /* data_0 */
+ and v_tmp1.16b, v_data_0.16b, v_mask0f.16b
+ ushr v_data_0.16b, v_data_0.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
+ eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
+ eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
+ eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b
+ eor v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b
+ eor v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b
+
+ /* data_1 */
+ and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
+ ushr v_data_1.16b, v_data_1.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
+ eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
+ eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
+ eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
+ eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
+ eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
+ eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b
+ eor v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b
+ eor v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b
+
+ /* data_2 */
+ and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
+ ushr v_data_2.16b, v_data_2.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
+ eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
+ eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
+ eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
+ eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
+ eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
+ eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b
+ eor v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b
+ eor v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b
+
+ /* data_3 */
+ and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
+ ushr v_data_3.16b, v_data_3.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
+ eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
+ eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
+ eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
+ eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
+ eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
+ eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
+ tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b
+ eor v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b
+ eor v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop64_vects
+
+.Lloop64_vects_end:
+ add x_ptr, x_dest1, x_pos
+ stp q_p1_0, q_p1_1, [x_ptr], #32
+ stp q_p1_2, q_p1_3, [x_ptr]
+
+ add x_ptr, x_dest2, x_pos
+ stp q_p2_0, q_p2_1, [x_ptr], #32
+ stp q_p2_2, q_p2_3, [x_ptr]
+
+ add x_ptr, x_dest3, x_pos
+ stp q_p3_0, q_p3_1, [x_ptr], #32
+ stp q_p3_2, q_p3_3, [x_ptr]
+
+ add x_ptr, x_dest4, x_pos
+ stp q_p4_0, q_p4_1, [x_ptr], #32
+ stp q_p4_2, q_p4_3, [x_ptr]
+
+ add x_pos, x_pos, #64
+ cmp x_pos, x_len
+ ble .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_len, x_len, #64
+ cmp x_pos, x_len
+ beq .return_pass
+
+.Lloop16_init:
+ sub x_len, x_len, #16
+ cmp x_pos, x_len
+ bgt .lessthan16_init
+
+.Lloop16:
+ movi v_p1_0.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p3_0.16b, #0
+ movi v_p4_0.16b, #0
+ mov x_tbl1, x_tbl
+ add x_tbl2, x_tbl1, x_vec, lsl #2
+ add x_tbl3, x_tbl2, x_vec, lsl #2
+ add x_tbl4, x_tbl3, x_vec, lsl #2
+ mov x_vec_i, #0
+
+.Lloop16_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ add x_vec_i, x_vec_i, #8
+ ldr q_data, [x_ptr, x_pos]
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1keep, [x_tbl2]
+ prfm pldl1keep, [x_tbl3]
+ prfm pldl1keep, [x_tbl4]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+
+ eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
+ eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
+ eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
+ eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
+ eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
+
+ cmp x_vec_i, x_vec
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ str q_p1_0, [x_dest1, x_pos]
+ str q_p2_0, [x_dest2, x_pos]
+ str q_p3_0, [x_dest3, x_pos]
+ str q_p4_0, [x_dest4, x_pos]
+ add x_pos, x_pos, #16
+ cmp x_pos, x_len
+ ble .Lloop16
+
+.Lloop16_end:
+ sub x_tmp, x_pos, x_len
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16_init:
+ mov x_pos, x_len
+ b .Lloop16
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S
new file mode 100644
index 0000000000..eb354279f8
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S
@@ -0,0 +1,208 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_4vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_4vect_dot_prod_sve, %function
+#endif
+/* void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+
+x_tbl1 .req x8
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_tbl4 .req x11
+x_dest1 .req x12
+x_dest2 .req x13
+x_dest3 .req x14
+x_dest4 .req x_dest /* reused */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+
+cdecl(gf_4vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldp x_dest3, x_dest4, [x_dest, #8*2]
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+ mov z_dest4.b, #0 /* clear z_dest4 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+ /* load gf_table's */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* calc for next */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ prfb pldl2keep, p0, [x_tbl3]
+ prfb pldl2keep, p0, [x_tbl4]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+ /* dest 4 */
+ tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_gft4_lo.d, z_dest4.d
+ eor z_dest4.d, z_dest4.d, z_gft4_hi.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
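The SVE dot-product kernel above follows the usual byte-sliced GF(2^8) scheme: each source byte is split into its low and high nibble, each nibble indexes a 16-byte lookup table, and the two lookups are XORed into the running parity. As a minimal C reference of what gf_4vect_dot_prod_sve computes, derived only from the table-layout comments in the assembly (table base = x_tbl + dest_idx * x_vec * 32, advancing 32 bytes per source vector); the helper names are illustrative and this sketch is not part of the imported sources:

    /* one 32-byte table entry: bytes 0..15 are products for the low nibble,
     * bytes 16..31 are products for the high nibble */
    static unsigned char gf_mul_tbl32(const unsigned char *tbl, unsigned char b)
    {
            return (unsigned char)(tbl[b & 0x0f] ^ tbl[16 + (b >> 4)]);
    }

    /* dest[d][pos] = XOR over all source vectors s of
     * gf_mul_tbl32(gftbls + (d * vec + s) * 32, src[s][pos]) */
    static void gf_nvect_dot_prod_ref(int len, int vec, int ndest,
                                      const unsigned char *gftbls,
                                      unsigned char **src, unsigned char **dest)
    {
            for (int d = 0; d < ndest; d++)
                    for (int pos = 0; pos < len; pos++) {
                            unsigned char acc = 0;
                            for (int s = 0; s < vec; s++)
                                    acc ^= gf_mul_tbl32(gftbls + (d * vec + s) * 32,
                                                        src[s][pos]);
                            dest[d][pos] = acc;
                    }
    }

The assembly vectorizes the pos loop with whilelo/ld1b/st1b, so the predicate handles the tail and no scalar remainder loop is needed.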
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S
new file mode 100644
index 0000000000..ebf82e7ffe
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S
@@ -0,0 +1,464 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_4vect_mad_neon)
+#ifndef __APPLE__
+.type gf_4vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_src_end .req x6
+x_dest1 .req x7
+x_dest2 .req x8
+x_dest3 .req x9
+x_dest4 .req x_dest
+x_tmp .req x10
+x_tbl1 .req x11
+x_tbl2 .req x12
+x_tbl3 .req x13
+x_tbl4 .req x14
+x_const .req x15
+
+/* vectors */
+v_mask0f .req v0
+v_tmp_lo .req v1
+v_tmp_hi .req v2
+v_tmp .req v3
+q_tmp .req q3
+
+v_gft1_lo .req v4
+v_gft1_hi .req v5
+v_gft2_lo .req v6
+v_gft2_hi .req v7
+v_gft3_lo .req v16
+v_gft3_hi .req v17
+v_gft4_lo .req v18
+v_gft4_hi .req v19
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+q_gft2_lo .req q6
+q_gft2_hi .req q7
+q_gft3_lo .req q16
+q_gft3_hi .req q17
+q_gft4_lo .req q18
+q_gft4_hi .req q19
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_data_0_lo .req v12
+v_data_1_lo .req v13
+v_data_2_lo .req v14
+v_data_3_lo .req v15
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+
+v_d1_0 .req v20
+v_d1_1 .req v21
+v_d1_2 .req v22
+v_d1_3 .req v23
+v_d2_0 .req v24
+v_d2_1 .req v25
+v_d2_2 .req v26
+v_d2_3 .req v27
+v_d3_0 .req v28
+v_d3_1 .req v29
+v_d3_2 .req v30
+v_d3_3 .req v31
+q_d1_0 .req q20
+q_d1_1 .req q21
+q_d1_2 .req q22
+q_d1_3 .req q23
+q_d2_0 .req q24
+q_d2_1 .req q25
+q_d2_2 .req q26
+q_d2_3 .req q27
+q_d3_0 .req q28
+q_d3_1 .req q29
+q_d3_2 .req q30
+q_d3_3 .req q31
+
+v_d4_0 .req v_d1_0
+v_d4_1 .req v_d1_1
+v_d4_2 .req v_d1_2
+v_d4_3 .req v_d1_3
+q_d4_0 .req q_d1_0
+q_d4_1 .req q_d1_1
+q_d4_2 .req q_d1_2
+q_d4_3 .req q_d1_3
+
+v_data .req v21
+q_data .req q21
+v_data_lo .req v22
+v_data_hi .req v23
+
+cdecl(gf_4vect_mad_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ lsl x_vec_i, x_vec_i, #5
+ lsl x_vec, x_vec, #5
+ add x_tbl1, x_tbl, x_vec_i
+ add x_tbl2, x_tbl1, x_vec
+ add x_tbl3, x_tbl2, x_vec
+ add x_tbl4, x_tbl3, x_vec
+ add x_src_end, x_src, x_len
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+ ldr x_dest4, [x_dest, #8*3]
+ ldr q_gft1_lo, [x_tbl1]
+ ldr q_gft1_hi, [x_tbl1, #16]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+ ldr q_gft4_lo, [x_tbl4]
+ ldr q_gft4_hi, [x_tbl4, #16]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #64
+
+.Lloop64:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ add x_src, x_src, #64
+
+ ldr q_d1_0, [x_dest1, #16*0]
+ ldr q_d1_1, [x_dest1, #16*1]
+ ldr q_d1_2, [x_dest1, #16*2]
+ ldr q_d1_3, [x_dest1, #16*3]
+
+ ldr q_d2_0, [x_dest2, #16*0]
+ ldr q_d2_1, [x_dest2, #16*1]
+ ldr q_d2_2, [x_dest2, #16*2]
+ ldr q_d2_3, [x_dest2, #16*3]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+
+ /* dest1 */
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
+ eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
+ eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
+ eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
+
+ /* dest2 */
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
+ eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
+ eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
+ eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
+ eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
+ eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
+ eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1, #16*0]
+ str q_d1_1, [x_dest1, #16*1]
+ str q_d1_2, [x_dest1, #16*2]
+ str q_d1_3, [x_dest1, #16*3]
+ add x_dest1, x_dest1, #64
+
+ str q_d2_0, [x_dest2, #16*0]
+ str q_d2_1, [x_dest2, #16*1]
+ str q_d2_2, [x_dest2, #16*2]
+ str q_d2_3, [x_dest2, #16*3]
+ add x_dest2, x_dest2, #64
+
+ ldr q_d3_0, [x_dest3, #16*0]
+ ldr q_d3_1, [x_dest3, #16*1]
+ ldr q_d3_2, [x_dest3, #16*2]
+ ldr q_d3_3, [x_dest3, #16*3]
+
+ ldr q_d4_0, [x_dest4, #16*0]
+ ldr q_d4_1, [x_dest4, #16*1]
+ ldr q_d4_2, [x_dest4, #16*2]
+ ldr q_d4_3, [x_dest4, #16*3]
+
+ /* dest3 */
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
+ eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
+ eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
+ eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
+ eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
+ eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
+ eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
+
+ /* dest4 */
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
+ eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
+ eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
+ eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
+ eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
+ eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
+ eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
+
+ str q_d3_0, [x_dest3, #16*0]
+ str q_d3_1, [x_dest3, #16*1]
+ str q_d3_2, [x_dest3, #16*2]
+ str q_d3_3, [x_dest3, #16*3]
+ add x_dest3, x_dest3, #64
+
+ str q_d4_0, [x_dest4, #16*0]
+ str q_d4_1, [x_dest4, #16*1]
+ str q_d4_2, [x_dest4, #16*2]
+ str q_d4_3, [x_dest4, #16*3]
+ add x_dest4, x_dest4, #64
+
+ cmp x_src, x_src_end
+ bls .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #64
+
+.Lloop16_init:
+ sub x_src_end, x_src_end, #16
+ cmp x_src, x_src_end
+ bhi .lessthan16_init
+
+.Lloop16:
+ ldr q_data, [x_src]
+
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_d4_0, [x_dest4]
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ str q_d3_0, [x_dest3]
+ str q_d4_0, [x_dest4]
+
+ add x_src, x_src, #16
+ add x_dest1, x_dest1, #16
+ add x_dest2, x_dest2, #16
+ add x_dest3, x_dest3, #16
+ add x_dest4, x_dest4, #16
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16:
+ mov x_src, x_src_end
+ sub x_dest1, x_dest1, x_tmp
+ sub x_dest2, x_dest2, x_tmp
+ sub x_dest3, x_dest3, x_tmp
+ sub x_dest4, x_dest4, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16]
+
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_d4_0, [x_dest4]
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ str q_d3_0, [x_dest3]
+ str q_d4_0, [x_dest4]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+ASM_DEF_RODATA
+.balign 8
+const_tbl:
+ .dword 0x0000000000000000, 0x0000000000000000
+ .dword 0xffffffffffffffff, 0xffffffffffffffff
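The .lessthan16 path above handles a trailing partial block by rewinding to the last full 16 bytes of each buffer and masking out the bytes that earlier iterations already updated; the mask is a 16-byte window slid across const_tbl (16 zero bytes followed by 16 0xff bytes). A small C sketch of that mask selection, assuming the same 32-byte layout (illustrative only, not part of the imported file):

    #include <string.h>

    /* 16 zero bytes followed by 16 0xff bytes, mirroring const_tbl above */
    static const unsigned char tail_mask_tbl[32] = {
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    };

    /* overlap plays the role of x_tmp above: bytes of the re-read 16-byte
     * block that were already processed.  Equivalent to:
     *   sub x_const, x_const, x_tmp ; ldr q_tmp, [x_const, #16] */
    static void load_tail_mask(unsigned char mask[16], int overlap)
    {
            memcpy(mask, tail_mask_tbl + 16 - overlap, 16);
    }

Each nibble-table product is ANDed with this mask before the XOR into dest, so the first overlap bytes stay unchanged and only the genuine tail bytes are updated.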
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S
new file mode 100644
index 0000000000..89ec89f5c6
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S
@@ -0,0 +1,194 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_4vect_mad_sve)
+#ifndef __APPLE__
+.type gf_4vect_mad_sve, %function
+#endif
+
+/* gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x6
+x_dest2 .req x7
+x_dest3 .req x8
+x_dest4 .req x9
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_tmp_lo .req z4
+z_tmp_hi .req z5
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+
+cdecl(gf_4vect_mad_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ /* load table 1 */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+ /* Load table 1 with NEON instruction ldp */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+ /* load table 2 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+ /* load table 3 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl]
+ /* load table 4 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl]
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+ ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */
+ ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+
+ prfb pldl2strm, p0, [x_dest3, x_pos]
+ prfb pldl2strm, p0, [x_dest4, x_pos]
+
+ /* dest1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ ld1b z_dest3.b, p0/z, [x_dest3, x_pos]
+ ld1b z_dest4.b, p0/z, [x_dest4, x_pos]
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ /* dest3 */
+ tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_tmp_lo.d, z_dest3.d
+ eor z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+
+ /* dest4 */
+ tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_tmp_lo.d, z_dest4.d
+ eor z_dest4.d, z_tmp_hi.d, z_dest4.d
+
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
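Per the prototype comment above, the *_mad_sve kernels add one source block's contribution (index vec_i) into parity buffers that already hold a partial sum, instead of recomputing the full dot product. A hedged C sketch of the update they perform, using the same nibble-table multiply as the dot-product sketch earlier; the table for destination d sits at offset (d * vec + vec_i) * 32, matching the x_tbl arithmetic above, and ndest generalizes the 4 baked into the function name (names here are illustrative, not part of the import):

    /* dest[d][pos] ^= gf_mul(gftbls + (d * vec + vec_i) * 32, src[pos]) */
    static void gf_nvect_mad_ref(int len, int vec, int vec_i, int ndest,
                                 const unsigned char *gftbls,
                                 const unsigned char *src, unsigned char **dest)
    {
            for (int d = 0; d < ndest; d++) {
                    const unsigned char *tbl = gftbls + (d * vec + vec_i) * 32;
                    for (int pos = 0; pos < len; pos++)
                            dest[d][pos] ^= (unsigned char)(tbl[src[pos] & 0x0f] ^
                                                            tbl[16 + (src[pos] >> 4)]);
            }
    }

Calling this once per source vector (vec_i = 0 .. vec-1) reproduces the dot-product result incrementally, which is how the mad variants are typically used for updates.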
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S
new file mode 100644
index 0000000000..13166665d6
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S
@@ -0,0 +1,484 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_5vect_dot_prod_neon)
+#ifndef __APPLE__
+.type gf_5vect_dot_prod_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+x_tmp .req x8
+x_dest1 .req x9
+x_dest2 .req x10
+x_dest3 .req x11
+x_dest4 .req x12
+x_dest5 .req x13
+
+/* vectors */
+v_tmp1 .req v0
+q_tmp1 .req q0
+v_tmp2 .req v1
+q_tmp2 .req q1
+
+v_mask0f .req v_tmp1
+q_mask0f .req q_tmp1
+v_tmp_lo .req v_tmp1
+v_tmp_hi .req v_tmp2
+
+v_gft_lo .req v2
+v_gft_hi .req v3
+q_gft_lo .req q2
+q_gft_hi .req q3
+
+v_p1_0 .req v4
+v_p2_0 .req v5
+v_p3_0 .req v6
+v_p4_0 .req v7
+
+q_p1_0 .req q4
+q_p2_0 .req q5
+q_p3_0 .req q6
+q_p4_0 .req q7
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_data_0_lo .req v12
+v_data_1_lo .req v13
+v_data_2_lo .req v14
+v_data_3_lo .req v15
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+
+v_p5_0 .req v16
+v_p1_1 .req v17
+v_p2_1 .req v18
+v_p3_1 .req v19
+v_p4_1 .req v20
+v_p5_1 .req v21
+v_p1_2 .req v22
+v_p2_2 .req v23
+v_p3_2 .req v24
+v_p4_2 .req v25
+v_p5_2 .req v26
+v_p1_3 .req v27
+v_p2_3 .req v28
+v_p3_3 .req v29
+v_p4_3 .req v30
+v_p5_3 .req v31
+
+q_p5_0 .req q16
+q_p1_1 .req q17
+q_p2_1 .req q18
+q_p3_1 .req q19
+q_p4_1 .req q20
+q_p5_1 .req q21
+q_p1_2 .req q22
+q_p2_2 .req q23
+q_p3_2 .req q24
+q_p4_2 .req q25
+q_p5_2 .req q26
+q_p1_3 .req q27
+q_p2_3 .req q28
+q_p3_3 .req q29
+q_p4_3 .req q30
+q_p5_3 .req q31
+
+v_data .req v_p1_1
+q_data .req q_p1_1
+v_data_lo .req v_p2_1
+v_data_hi .req v_p3_1
+
+v_gft1_lo .req v_p4_1
+v_gft1_hi .req v_p5_1
+v_gft2_lo .req v_p1_2
+v_gft2_hi .req v_p2_2
+v_gft3_lo .req v_p3_2
+v_gft3_hi .req v_p4_2
+v_gft4_lo .req v_p5_2
+v_gft4_hi .req v_p1_3
+v_gft5_lo .req v_p2_3
+v_gft5_hi .req v_p3_3
+q_gft1_lo .req q_p4_1
+q_gft1_hi .req q_p5_1
+q_gft2_lo .req q_p1_2
+q_gft2_hi .req q_p2_2
+q_gft3_lo .req q_p3_2
+q_gft3_hi .req q_p4_2
+q_gft4_lo .req q_p5_2
+q_gft4_hi .req q_p1_3
+q_gft5_lo .req q_p2_3
+q_gft5_hi .req q_p3_3
+
+
+cdecl(gf_5vect_dot_prod_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+ ldr x_dest4, [x_dest, #8*3]
+ ldr x_dest5, [x_dest, #8*4]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_len, x_len, #64
+
+.Lloop64:
+ movi v_p1_0.16b, #0
+ movi v_p1_1.16b, #0
+ movi v_p1_2.16b, #0
+ movi v_p1_3.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p2_1.16b, #0
+ movi v_p2_2.16b, #0
+ movi v_p2_3.16b, #0
+ movi v_p3_0.16b, #0
+ movi v_p3_1.16b, #0
+ movi v_p3_2.16b, #0
+ movi v_p3_3.16b, #0
+ movi v_p4_0.16b, #0
+ movi v_p4_1.16b, #0
+ movi v_p4_2.16b, #0
+ movi v_p4_3.16b, #0
+ movi v_p5_0.16b, #0
+ movi v_p5_1.16b, #0
+ movi v_p5_2.16b, #0
+ movi v_p5_3.16b, #0
+ mov x_vec_i, #0
+
+.Lloop64_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ add x_ptr, x_ptr, x_pos
+
+ ldr q_data_0, [x_ptr], #16
+ ldr q_data_1, [x_ptr], #16
+ ldr q_data_2, [x_ptr], #16
+ ldr q_data_3, [x_ptr], #16
+ prfm pldl2keep, [x_ptr]
+
+ movi v_mask0f.16b, #0x0f
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+
+ /* v_p1_x */
+ add x_tmp, x_tbl, x_vec_i, lsl #2
+ add x_vec_i, x_vec_i, #8
+ ldp q_gft_lo, q_gft_hi, [x_tmp]
+ prfm pldl3keep, [x_tmp, #32]
+ add x_tmp, x_tmp, x_vec, lsl #2
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+ eor v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+ eor v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b
+ eor v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+ eor v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b
+ eor v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+ eor v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b
+ eor v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b
+
+ /* v_p2_x */
+ ldp q_gft_lo, q_gft_hi, [x_tmp]
+ prfm pldl3keep, [x_tmp, #32]
+ add x_tmp, x_tmp, x_vec, lsl #2
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+ eor v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+ eor v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b
+ eor v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+ eor v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b
+ eor v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+ eor v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b
+ eor v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b
+
+ /* v_p3_x */
+ ldp q_gft_lo, q_gft_hi, [x_tmp]
+ prfm pldl3keep, [x_tmp, #32]
+ add x_tmp, x_tmp, x_vec, lsl #2
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+ eor v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+ eor v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b
+ eor v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+ eor v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b
+ eor v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+ eor v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b
+ eor v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b
+
+ /* v_p4_x */
+ ldp q_gft_lo, q_gft_hi, [x_tmp]
+ prfm pldl3keep, [x_tmp, #32]
+ add x_tmp, x_tmp, x_vec, lsl #2
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+ eor v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b
+ eor v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+ eor v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b
+ eor v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+ eor v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b
+ eor v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+ eor v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b
+ eor v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b
+
+ /* v_p5_x */
+ ldp q_gft_lo, q_gft_hi, [x_tmp]
+ prfm pldl3keep, [x_tmp, #32]
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+ eor v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b
+ eor v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+ eor v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b
+ eor v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+ eor v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b
+ eor v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+ eor v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b
+ eor v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop64_vects
+
+.Lloop64_vects_end:
+ add x_ptr, x_dest1, x_pos
+ stp q_p1_0, q_p1_1, [x_ptr], #32
+ stp q_p1_2, q_p1_3, [x_ptr]
+
+ add x_ptr, x_dest2, x_pos
+ stp q_p2_0, q_p2_1, [x_ptr], #32
+ stp q_p2_2, q_p2_3, [x_ptr]
+
+ add x_ptr, x_dest3, x_pos
+ stp q_p3_0, q_p3_1, [x_ptr], #32
+ stp q_p3_2, q_p3_3, [x_ptr]
+
+ add x_ptr, x_dest4, x_pos
+ stp q_p4_0, q_p4_1, [x_ptr], #32
+ stp q_p4_2, q_p4_3, [x_ptr]
+
+ add x_ptr, x_dest5, x_pos
+ stp q_p5_0, q_p5_1, [x_ptr], #32
+ stp q_p5_2, q_p5_3, [x_ptr]
+
+ add x_pos, x_pos, #64
+ cmp x_pos, x_len
+ ble .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_len, x_len, #64
+ cmp x_pos, x_len
+ beq .return_pass
+
+.Lloop16_init:
+ sub x_len, x_len, #16
+ cmp x_pos, x_len
+ bgt .lessthan16_init
+
+.Lloop16:
+ movi v_p1_0.16b, #0
+ movi v_p2_0.16b, #0
+ movi v_p3_0.16b, #0
+ movi v_p4_0.16b, #0
+ movi v_p5_0.16b, #0
+ mov x_vec_i, #0
+
+.Lloop16_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ ldr q_data, [x_ptr, x_pos]
+
+ movi v_mask0f.16b, #0x0f
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ add x_tmp, x_tbl, x_vec_i, lsl #2
+ add x_vec_i, x_vec_i, #8
+ ldp q_gft1_lo, q_gft1_hi, [x_tmp]
+ add x_tmp, x_tmp, x_vec, lsl #2
+ ldp q_gft2_lo, q_gft2_hi, [x_tmp]
+ add x_tmp, x_tmp, x_vec, lsl #2
+ ldp q_gft3_lo, q_gft3_hi, [x_tmp]
+ add x_tmp, x_tmp, x_vec, lsl #2
+ ldp q_gft4_lo, q_gft4_hi, [x_tmp]
+ add x_tmp, x_tmp, x_vec, lsl #2
+ ldp q_gft5_lo, q_gft5_hi, [x_tmp]
+
+ tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ tbl v_gft5_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
+ tbl v_gft5_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
+
+ eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
+ eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
+ eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
+ eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
+ eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
+ eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
+ eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
+ eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
+ eor v_p5_0.16b, v_gft5_hi.16b, v_p5_0.16b
+ eor v_p5_0.16b, v_p5_0.16b, v_gft5_lo.16b
+
+ cmp x_vec_i, x_vec
+ bne .Lloop16_vects
+
+.Lloop16_vects_end:
+ str q_p1_0, [x_dest1, x_pos]
+ str q_p2_0, [x_dest2, x_pos]
+ str q_p3_0, [x_dest3, x_pos]
+ str q_p4_0, [x_dest4, x_pos]
+ str q_p5_0, [x_dest5, x_pos]
+ add x_pos, x_pos, #16
+ cmp x_pos, x_len
+ ble .Lloop16
+
+.Lloop16_end:
+ sub x_tmp, x_pos, x_len
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16_init:
+ mov x_pos, x_len
+ b .Lloop16
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S
new file mode 100644
index 0000000000..bb7cd0184e
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S
@@ -0,0 +1,237 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_5vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_5vect_dot_prod_sve, %function
+#endif
+/* void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+
+x_tbl1 .req x8
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_tbl4 .req x11
+x_tbl5 .req x12
+x_dest1 .req x13
+x_dest2 .req x14
+x_dest4 .req x15
+x_dest5 .req x_dest /* reused */
+
+/* r16,r17,r18,r29,r30: special role registers, avoided */
+/* r19..r29 and SP must be preserved */
+x_dest3 .req x19
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+
+cdecl(gf_5vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ /* save r19..r29 */
+ sub sp, sp, #16 /* alignment */
+ str x19, [sp]
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldp x_dest3, x_dest4, [x_dest, #8*2]
+ ldr x_dest5, [x_dest, #8*4]
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+ mov z_dest4.b, #0 /* clear z_dest4 */
+ mov z_dest5.b, #0 /* clear z_dest5 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
+ add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+ /* load gf_table's */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* calc for next */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ prfb pldl2keep, p0, [x_tbl3]
+ prfb pldl2keep, p0, [x_tbl4]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+ prfb pldl2keep, p0, [x_tbl5]
+
+ /* dest 4 */
+ tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_gft4_lo.d, z_dest4.d
+ eor z_dest4.d, z_dest4.d, z_gft4_hi.d
+
+ /* dest 5 */
+ tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_gft5_lo.d, z_dest5.d
+ eor z_dest5.d, z_dest5.d, z_gft5_hi.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ /* restore r19..r29 */
+ ldr x19, [sp]
+ add sp, sp, #16
+
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S
new file mode 100644
index 0000000000..473e4c5774
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S
@@ -0,0 +1,544 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_5vect_mad_neon)
+#ifndef __APPLE__
+.type gf_5vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_src_end .req x6
+x_dest1 .req x7
+x_dest2 .req x8
+x_dest3 .req x9
+x_dest4 .req x10
+x_dest5 .req x_dest
+x_tmp .req x11
+x_tbl1 .req x12
+x_tbl2 .req x13
+x_tbl3 .req x14
+x_tbl4 .req x15
+x_tbl5 .req x16
+x_const .req x17
+
+/* vectors */
+v_mask0f .req v0
+v_tmp_lo .req v1
+v_tmp_hi .req v2
+v_tmp .req v3
+q_tmp .req q3
+
+v_gft1_lo .req v4
+v_gft1_hi .req v5
+v_gft2_lo .req v6
+v_gft2_hi .req v7
+v_gft3_lo .req v16
+v_gft3_hi .req v17
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+q_gft2_lo .req q6
+q_gft2_hi .req q7
+q_gft3_lo .req q16
+q_gft3_hi .req q17
+
+v_gft4_lo .req v18
+v_gft4_hi .req v19
+q_gft4_lo .req q18
+q_gft4_hi .req q19
+v_gft5_lo .req v_gft2_lo
+v_gft5_hi .req v_gft2_hi
+q_gft5_lo .req q_gft2_lo
+q_gft5_hi .req q_gft2_hi
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_data_0_lo .req v12
+v_data_1_lo .req v13
+v_data_2_lo .req v14
+v_data_3_lo .req v15
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+
+v_d1_0 .req v20
+v_d1_1 .req v21
+v_d1_2 .req v22
+v_d1_3 .req v23
+v_d2_0 .req v24
+v_d2_1 .req v25
+v_d2_2 .req v26
+v_d2_3 .req v27
+v_d3_0 .req v28
+v_d3_1 .req v29
+v_d3_2 .req v30
+v_d3_3 .req v31
+q_d1_0 .req q20
+q_d1_1 .req q21
+q_d1_2 .req q22
+q_d1_3 .req q23
+q_d2_0 .req q24
+q_d2_1 .req q25
+q_d2_2 .req q26
+q_d2_3 .req q27
+q_d3_0 .req q28
+q_d3_1 .req q29
+q_d3_2 .req q30
+q_d3_3 .req q31
+
+v_d4_0 .req v_d1_0
+v_d4_1 .req v_d1_1
+v_d4_2 .req v_d1_2
+v_d4_3 .req v_d1_3
+q_d4_0 .req q_d1_0
+q_d4_1 .req q_d1_1
+q_d4_2 .req q_d1_2
+q_d4_3 .req q_d1_3
+v_d5_0 .req v_d2_0
+v_d5_1 .req v_d2_1
+v_d5_2 .req v_d2_2
+v_d5_3 .req v_d2_3
+q_d5_0 .req q_d2_0
+q_d5_1 .req q_d2_1
+q_d5_2 .req q_d2_2
+q_d5_3 .req q_d2_3
+
+v_data .req v21
+q_data .req q21
+v_data_lo .req v22
+v_data_hi .req v23
+
+cdecl(gf_5vect_mad_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ lsl x_vec_i, x_vec_i, #5
+ lsl x_vec, x_vec, #5
+ add x_tbl1, x_tbl, x_vec_i
+ add x_tbl2, x_tbl1, x_vec
+ add x_tbl3, x_tbl2, x_vec
+ add x_tbl4, x_tbl3, x_vec
+ add x_tbl5, x_tbl4, x_vec
+ add x_src_end, x_src, x_len
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+ ldr x_dest4, [x_dest, #8*3]
+ ldr x_dest5, [x_dest, #8*4]
+ ldr q_gft1_lo, [x_tbl1]
+ ldr q_gft1_hi, [x_tbl1, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+ ldr q_gft4_lo, [x_tbl4]
+ ldr q_gft4_hi, [x_tbl4, #16]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #64
+
+.Lloop64:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ add x_src, x_src, #64
+
+ ldr q_d1_0, [x_dest1, #16*0]
+ ldr q_d1_1, [x_dest1, #16*1]
+ ldr q_d1_2, [x_dest1, #16*2]
+ ldr q_d1_3, [x_dest1, #16*3]
+
+ ldr q_d2_0, [x_dest2, #16*0]
+ ldr q_d2_1, [x_dest2, #16*1]
+ ldr q_d2_2, [x_dest2, #16*2]
+ ldr q_d2_3, [x_dest2, #16*3]
+
+ ldr q_d3_0, [x_dest3, #16*0]
+ ldr q_d3_1, [x_dest3, #16*1]
+ ldr q_d3_2, [x_dest3, #16*2]
+ ldr q_d3_3, [x_dest3, #16*3]
+
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+
+ /* dest1 */
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
+ eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
+ eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
+ eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
+
+ /* dest2 */
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
+ eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
+ eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
+ eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
+ eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
+ eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
+ eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
+
+ /* dest3 */
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
+ eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
+ eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
+ eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
+ eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
+ eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
+ eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1, #16*0]
+ str q_d1_1, [x_dest1, #16*1]
+ str q_d1_2, [x_dest1, #16*2]
+ str q_d1_3, [x_dest1, #16*3]
+ add x_dest1, x_dest1, #64
+
+ str q_d2_0, [x_dest2, #16*0]
+ str q_d2_1, [x_dest2, #16*1]
+ str q_d2_2, [x_dest2, #16*2]
+ str q_d2_3, [x_dest2, #16*3]
+ add x_dest2, x_dest2, #64
+
+ str q_d3_0, [x_dest3, #16*0]
+ str q_d3_1, [x_dest3, #16*1]
+ str q_d3_2, [x_dest3, #16*2]
+ str q_d3_3, [x_dest3, #16*3]
+ add x_dest3, x_dest3, #64
+
+ ldr q_d4_0, [x_dest4, #16*0]
+ ldr q_d4_1, [x_dest4, #16*1]
+ ldr q_d4_2, [x_dest4, #16*2]
+ ldr q_d4_3, [x_dest4, #16*3]
+
+ ldr q_d5_0, [x_dest5, #16*0]
+ ldr q_d5_1, [x_dest5, #16*1]
+ ldr q_d5_2, [x_dest5, #16*2]
+ ldr q_d5_3, [x_dest5, #16*3]
+
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+
+ /* dest4 */
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
+ eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
+ eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
+ eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
+ eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
+ eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
+ eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
+
+ /* dest5 */
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
+ eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
+ eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b
+ eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
+ eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b
+ eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
+ eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b
+ eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4, #16*0]
+ str q_d4_1, [x_dest4, #16*1]
+ str q_d4_2, [x_dest4, #16*2]
+ str q_d4_3, [x_dest4, #16*3]
+ add x_dest4, x_dest4, #64
+
+ str q_d5_0, [x_dest5, #16*0]
+ str q_d5_1, [x_dest5, #16*1]
+ str q_d5_2, [x_dest5, #16*2]
+ str q_d5_3, [x_dest5, #16*3]
+ add x_dest5, x_dest5, #64
+
+ cmp x_src, x_src_end
+ bls .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #64
+
+.Lloop16_init:
+ sub x_src_end, x_src_end, #16
+ cmp x_src, x_src_end
+ bhi .lessthan16_init
+
+.Lloop16:
+ ldr q_data, [x_src]
+
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+ ldr q_d4_0, [x_dest4]
+ ldr q_d5_0, [x_dest5]
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
+ eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4]
+ str q_d5_0, [x_dest5]
+
+ add x_src, x_src, #16
+ add x_dest1, x_dest1, #16
+ add x_dest2, x_dest2, #16
+ add x_dest3, x_dest3, #16
+ add x_dest4, x_dest4, #16
+ add x_dest5, x_dest5, #16
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16:
+ mov x_src, x_src_end
+ sub x_dest1, x_dest1, x_tmp
+ sub x_dest2, x_dest2, x_tmp
+ sub x_dest3, x_dest3, x_tmp
+ sub x_dest4, x_dest4, x_tmp
+ sub x_dest5, x_dest5, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16]
+
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+ ldr q_d4_0, [x_dest4]
+ ldr q_d5_0, [x_dest5]
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4]
+ str q_d5_0, [x_dest5]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+ASM_DEF_RODATA
+.balign 8
+const_tbl:
+ .dword 0x0000000000000000, 0x0000000000000000
+ .dword 0xffffffffffffffff, 0xffffffffffffffff
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S
new file mode 100644
index 0000000000..ab374d365a
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S
@@ -0,0 +1,218 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_5vect_mad_sve)
+#ifndef __APPLE__
+.type gf_5vect_mad_sve, %function
+#endif
+
+/* gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x6
+x_dest2 .req x7
+x_dest3 .req x8
+x_dest4 .req x9
+x_dest5 .req x10
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_tmp_lo .req z4
+z_tmp_hi .req z5
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+
+cdecl(gf_5vect_mad_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ /* load table 1 */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+ /* Load with NEON instruction ldp */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+ /* load table 2 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+ /* load table 3 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl]
+ /* load table 4 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl]
+ /* load table 5 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl]
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+ ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */
+ ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */
+ ldr x_dest5, [x_dest, #8*4] /* pointer to dest5 */
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+
+ prfb pldl2strm, p0, [x_dest3, x_pos]
+ prfb pldl2strm, p0, [x_dest4, x_pos]
+
+ /* dest1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ ld1b z_dest3.b, p0/z, [x_dest3, x_pos]
+ ld1b z_dest4.b, p0/z, [x_dest4, x_pos]
+ prfb pldl2strm, p0, [x_dest5, x_pos]
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ ld1b z_dest5.b, p0/z, [x_dest5, x_pos]
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+
+ /* dest3 */
+ tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_tmp_lo.d, z_dest3.d
+ eor z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+ /* dest4 */
+ tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_tmp_lo.d, z_dest4.d
+ eor z_dest4.d, z_tmp_hi.d, z_dest4.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+
+ /* dest5 */
+ tbl z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_tmp_lo.d, z_dest5.d
+ eor z_dest5.d, z_tmp_hi.d, z_dest5.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
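In scalar terms, a *_mad ("multiply and add") kernel folds one source block into N existing parity blocks: for each destination j, dest[j][i] ^= c_j * src[i], where c_j lives in the 32-byte table at gftbls + (j * vec + vec_i) * 32. The SVE version above needs no scalar tail loop because whilelo builds a predicate that masks off lanes beyond len. A rough scalar model under that table-layout assumption (the function name is illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of gf_5vect_mad_*: accumulate one source block into 5 outputs. */
    static void gf_5vect_mad_ref(int len, int vec, int vec_i,
                                 const uint8_t *gftbls, const uint8_t *src,
                                 uint8_t **dest)
    {
        for (int j = 0; j < 5; j++) {
            const uint8_t *tbl = gftbls + (size_t)32 * ((size_t)j * vec + vec_i);
            for (int i = 0; i < len; i++)
                /* two-nibble table lookup, XOR-accumulated into the parity byte */
                dest[j][i] ^= tbl[src[i] & 0x0f] ^ tbl[16 + (src[i] >> 4)];
        }
    }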
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S
new file mode 100644
index 0000000000..acc98953b3
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S
@@ -0,0 +1,258 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_6vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_6vect_dot_prod_sve, %function
+#endif
+/* void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+
+x_tbl1 .req x8
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_tbl4 .req x11
+x_tbl5 .req x12
+x_tbl6 .req x13
+x_dest1 .req x14
+x_dest2 .req x15
+x_dest6 .req x_dest /* reused */
+
+/* x16, x17, x18, x29, x30: special-role registers, avoided */
+/* x19..x29 and SP must be preserved */
+x_dest3 .req x19
+x_dest4 .req x20
+x_dest5 .req x21
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_gft6_lo .req z25
+z_gft6_hi .req z26
+q_gft6_lo .req q25
+q_gft6_hi .req q26
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+z_dest6 .req z31
+
+cdecl(gf_6vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ /* save r19..r29 */
+ sub sp, sp, #32 /* alignment */
+ stp x19, x20, [sp]
+ str x21, [sp, #16]
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldp x_dest3, x_dest4, [x_dest, #8*2]
+ ldp x_dest5, x_dest6, [x_dest, #8*4] /* x_dest6 reuses x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+ mov z_dest4.b, #0 /* clear z_dest4 */
+ mov z_dest5.b, #0 /* clear z_dest5 */
+ mov z_dest6.b, #0 /* clear z_dest6 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
+ add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */
+ add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+	/* load gf tables */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is post-incremented by 32 for each source vector */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* calc for next and prefetch */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_dest1.d, z_gft1_hi.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ prfb pldl2keep, p0, [x_tbl3]
+ prfb pldl2keep, p0, [x_tbl4]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_dest2.d, z_gft2_hi.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_dest3.d, z_gft3_hi.d
+
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+ ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32
+ prfb pldl2keep, p0, [x_tbl5]
+ prfb pldl2keep, p0, [x_tbl6]
+
+ /* dest 4 */
+ tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_gft4_lo.d, z_dest4.d
+ eor z_dest4.d, z_dest4.d, z_gft4_hi.d
+
+ /* dest 5 */
+ tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_gft5_lo.d, z_dest5.d
+ eor z_dest5.d, z_dest5.d, z_gft5_hi.d
+
+ /* dest 6 */
+ tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b
+ tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b
+ eor z_dest6.d, z_gft6_lo.d, z_dest6.d
+ eor z_dest6.d, z_dest6.d, z_gft6_hi.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+ st1b z_dest6.b, p0, [x_dest6, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ /* restore r19..r29 */
+ ldr x21, [sp, #16]
+ ldp x19, x20, [sp]
+ add sp, sp, #32
+
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
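Where the mad kernels update existing parity from a single source block, the *_dot_prod kernels compute each destination from scratch as a GF(2^8) dot product over all vec source blocks; that is what the two nested loops above implement, with x_tbl1..x_tbl6 stepping through the 32-byte table entries in lock-step with the source pointers. A scalar model of the same computation (names are illustrative, not library API):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of gf_6vect_dot_prod_*:
     * dest[j][i] = XOR over v of (coefficient[j][v] * src[v][i]) in GF(2^8). */
    static void gf_6vect_dot_prod_ref(int len, int vec, const uint8_t *gftbls,
                                      uint8_t **src, uint8_t **dest)
    {
        for (int j = 0; j < 6; j++) {
            for (int i = 0; i < len; i++) {
                uint8_t acc = 0;
                for (int v = 0; v < vec; v++) {
                    /* table entry for (destination j, source v), 32 bytes each */
                    const uint8_t *tbl = gftbls + (size_t)32 * ((size_t)j * vec + v);
                    acc ^= tbl[src[v][i] & 0x0f] ^ tbl[16 + (src[v][i] >> 4)];
                }
                dest[j][i] = acc;
            }
        }
    }

The assembly inverts the loop order relative to this sketch, running the source loop innermost so all six accumulators stay resident in z registers and each source byte is loaded once per position.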
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S
new file mode 100644
index 0000000000..3b1b1b4b21
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S
@@ -0,0 +1,618 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+.global cdecl(gf_6vect_mad_neon)
+#ifndef __APPLE__
+.type gf_6vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_src_end .req x6
+x_dest1 .req x7
+x_dest2 .req x8
+x_dest3 .req x9
+x_dest4 .req x10
+x_dest5 .req x11
+x_dest6 .req x_dest
+x_tmp .req x12
+x_tbl1 .req x13
+x_tbl2 .req x14
+x_tbl3 .req x15
+x_tbl4 .req x16
+x_tbl5 .req x17
+x_tbl6 .req x_tbl
+x_const .req x18
+
+/* vectors */
+v_mask0f .req v0
+v_tmp_lo .req v1
+v_tmp_hi .req v2
+v_tmp .req v3
+q_tmp .req q3
+
+v_gft1_lo .req v4
+v_gft1_hi .req v5
+v_gft2_lo .req v6
+v_gft2_hi .req v7
+v_gft3_lo .req v16
+v_gft3_hi .req v17
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+q_gft2_lo .req q6
+q_gft2_hi .req q7
+q_gft3_lo .req q16
+q_gft3_hi .req q17
+
+v_gft4_lo .req v18
+v_gft4_hi .req v19
+q_gft4_lo .req q18
+q_gft4_hi .req q19
+v_gft5_lo .req v_gft2_lo
+v_gft5_hi .req v_gft2_hi
+q_gft5_lo .req q_gft2_lo
+q_gft5_hi .req q_gft2_hi
+v_gft6_lo .req v_gft3_lo
+v_gft6_hi .req v_gft3_hi
+q_gft6_lo .req q_gft3_lo
+q_gft6_hi .req q_gft3_hi
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+
+v_data_0_lo .req v12
+v_data_1_lo .req v13
+v_data_2_lo .req v14
+v_data_3_lo .req v15
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+
+v_d1_0 .req v20
+v_d1_1 .req v21
+v_d1_2 .req v22
+v_d1_3 .req v23
+v_d2_0 .req v24
+v_d2_1 .req v25
+v_d2_2 .req v26
+v_d2_3 .req v27
+v_d3_0 .req v28
+v_d3_1 .req v29
+v_d3_2 .req v30
+v_d3_3 .req v31
+q_d1_0 .req q20
+q_d1_1 .req q21
+q_d1_2 .req q22
+q_d1_3 .req q23
+q_d2_0 .req q24
+q_d2_1 .req q25
+q_d2_2 .req q26
+q_d2_3 .req q27
+q_d3_0 .req q28
+q_d3_1 .req q29
+q_d3_2 .req q30
+q_d3_3 .req q31
+
+v_d4_0 .req v_d1_0
+v_d4_1 .req v_d1_1
+v_d4_2 .req v_d1_2
+v_d4_3 .req v_d1_3
+q_d4_0 .req q_d1_0
+q_d4_1 .req q_d1_1
+q_d4_2 .req q_d1_2
+q_d4_3 .req q_d1_3
+v_d5_0 .req v_d2_0
+v_d5_1 .req v_d2_1
+v_d5_2 .req v_d2_2
+v_d5_3 .req v_d2_3
+q_d5_0 .req q_d2_0
+q_d5_1 .req q_d2_1
+q_d5_2 .req q_d2_2
+q_d5_3 .req q_d2_3
+v_d6_0 .req v_d3_0
+v_d6_1 .req v_d3_1
+v_d6_2 .req v_d3_2
+v_d6_3 .req v_d3_3
+q_d6_0 .req q_d3_0
+q_d6_1 .req q_d3_1
+q_d6_2 .req q_d3_2
+q_d6_3 .req q_d3_3
+
+v_data .req v21
+q_data .req q21
+v_data_lo .req v22
+v_data_hi .req v23
+
+cdecl(gf_6vect_mad_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ lsl x_vec_i, x_vec_i, #5
+ lsl x_vec, x_vec, #5
+ add x_tbl1, x_tbl, x_vec_i
+ add x_tbl2, x_tbl1, x_vec
+ add x_tbl3, x_tbl2, x_vec
+ add x_tbl4, x_tbl3, x_vec
+ add x_tbl5, x_tbl4, x_vec
+ add x_tbl6, x_tbl5, x_vec
+ add x_src_end, x_src, x_len
+ ldr x_dest1, [x_dest, #8*0]
+ ldr x_dest2, [x_dest, #8*1]
+ ldr x_dest3, [x_dest, #8*2]
+ ldr x_dest4, [x_dest, #8*3]
+ ldr x_dest5, [x_dest, #8*4]
+ ldr x_dest6, [x_dest, #8*5]
+ ldr q_gft1_lo, [x_tbl1]
+ ldr q_gft1_hi, [x_tbl1, #16]
+ ldr q_gft4_lo, [x_tbl4]
+ ldr q_gft4_hi, [x_tbl4, #16]
+
+.Lloop64_init:
+ /* less than 64 bytes, goto Lloop16_init */
+ cmp x_len, #64
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #64
+
+.Lloop64:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ add x_src, x_src, #64
+
+ ldr q_d1_0, [x_dest1, #16*0]
+ ldr q_d1_1, [x_dest1, #16*1]
+ ldr q_d1_2, [x_dest1, #16*2]
+ ldr q_d1_3, [x_dest1, #16*3]
+
+ ldr q_d2_0, [x_dest2, #16*0]
+ ldr q_d2_1, [x_dest2, #16*1]
+ ldr q_d2_2, [x_dest2, #16*2]
+ ldr q_d2_3, [x_dest2, #16*3]
+
+ ldr q_d3_0, [x_dest3, #16*0]
+ ldr q_d3_1, [x_dest3, #16*1]
+ ldr q_d3_2, [x_dest3, #16*2]
+ ldr q_d3_3, [x_dest3, #16*3]
+
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+
+ /* dest1 */
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
+ eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
+ eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
+ eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b
+
+ /* dest2 */
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
+ eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
+ eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
+ eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
+ eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
+ eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
+ eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b
+
+ /* dest3 */
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
+ eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
+ eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
+ eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
+ eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
+ eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
+ eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1, #16*0]
+ str q_d1_1, [x_dest1, #16*1]
+ str q_d1_2, [x_dest1, #16*2]
+ str q_d1_3, [x_dest1, #16*3]
+ add x_dest1, x_dest1, #64
+
+ str q_d2_0, [x_dest2, #16*0]
+ str q_d2_1, [x_dest2, #16*1]
+ str q_d2_2, [x_dest2, #16*2]
+ str q_d2_3, [x_dest2, #16*3]
+ add x_dest2, x_dest2, #64
+
+ str q_d3_0, [x_dest3, #16*0]
+ str q_d3_1, [x_dest3, #16*1]
+ str q_d3_2, [x_dest3, #16*2]
+ str q_d3_3, [x_dest3, #16*3]
+ add x_dest3, x_dest3, #64
+
+ ldr q_d4_0, [x_dest4, #16*0]
+ ldr q_d4_1, [x_dest4, #16*1]
+ ldr q_d4_2, [x_dest4, #16*2]
+ ldr q_d4_3, [x_dest4, #16*3]
+
+ ldr q_d5_0, [x_dest5, #16*0]
+ ldr q_d5_1, [x_dest5, #16*1]
+ ldr q_d5_2, [x_dest5, #16*2]
+ ldr q_d5_3, [x_dest5, #16*3]
+
+ ldr q_d6_0, [x_dest6, #16*0]
+ ldr q_d6_1, [x_dest6, #16*1]
+ ldr q_d6_2, [x_dest6, #16*2]
+ ldr q_d6_3, [x_dest6, #16*3]
+
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+ ldr q_gft6_lo, [x_tbl6]
+ ldr q_gft6_hi, [x_tbl6, #16]
+
+ /* dest4 */
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
+ eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
+ eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
+ eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
+ eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
+ eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
+ eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b
+
+ /* dest5 */
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
+ eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
+ eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b
+ eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
+ eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b
+ eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
+ eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b
+ eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b
+
+ /* dest6 */
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_0_hi.16b
+ eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b
+ eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_1_hi.16b
+ eor v_d6_1.16b, v_tmp_lo.16b, v_d6_1.16b
+ eor v_d6_1.16b, v_d6_1.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_2_hi.16b
+ eor v_d6_2.16b, v_tmp_lo.16b, v_d6_2.16b
+ eor v_d6_2.16b, v_d6_2.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_3_hi.16b
+ eor v_d6_3.16b, v_tmp_lo.16b, v_d6_3.16b
+ eor v_d6_3.16b, v_d6_3.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4, #16*0]
+ str q_d4_1, [x_dest4, #16*1]
+ str q_d4_2, [x_dest4, #16*2]
+ str q_d4_3, [x_dest4, #16*3]
+ add x_dest4, x_dest4, #64
+
+ str q_d5_0, [x_dest5, #16*0]
+ str q_d5_1, [x_dest5, #16*1]
+ str q_d5_2, [x_dest5, #16*2]
+ str q_d5_3, [x_dest5, #16*3]
+ add x_dest5, x_dest5, #64
+
+ str q_d6_0, [x_dest6, #16*0]
+ str q_d6_1, [x_dest6, #16*1]
+ str q_d6_2, [x_dest6, #16*2]
+ str q_d6_3, [x_dest6, #16*3]
+ add x_dest6, x_dest6, #64
+
+ cmp x_src, x_src_end
+ bls .Lloop64
+
+.Lloop64_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #64
+
+.Lloop16_init:
+ sub x_src_end, x_src_end, #16
+ cmp x_src, x_src_end
+ bhi .lessthan16_init
+
+.Lloop16:
+ ldr q_data, [x_src]
+
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+ ldr q_d4_0, [x_dest4]
+ ldr q_d5_0, [x_dest5]
+ ldr q_d6_0, [x_dest6]
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+ ldr q_gft6_lo, [x_tbl6]
+ ldr q_gft6_hi, [x_tbl6, #16]
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
+ eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
+ eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b
+ eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4]
+ str q_d5_0, [x_dest5]
+ str q_d6_0, [x_dest6]
+
+ add x_src, x_src, #16
+ add x_dest1, x_dest1, #16
+ add x_dest2, x_dest2, #16
+ add x_dest3, x_dest3, #16
+ add x_dest4, x_dest4, #16
+ add x_dest5, x_dest5, #16
+ add x_dest6, x_dest6, #16
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16:
+ mov x_src, x_src_end
+ sub x_dest1, x_dest1, x_tmp
+ sub x_dest2, x_dest2, x_tmp
+ sub x_dest3, x_dest3, x_tmp
+ sub x_dest4, x_dest4, x_tmp
+ sub x_dest5, x_dest5, x_tmp
+ sub x_dest6, x_dest6, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16]
+
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+ ldr q_d2_0, [x_dest2]
+ ldr q_d3_0, [x_dest3]
+ ldr q_gft2_lo, [x_tbl2]
+ ldr q_gft2_hi, [x_tbl2, #16]
+ ldr q_gft3_lo, [x_tbl3]
+ ldr q_gft3_hi, [x_tbl3, #16]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b
+
+ str q_d1_0, [x_dest1]
+ str q_d2_0, [x_dest2]
+ str q_d3_0, [x_dest3]
+
+ ldr q_d4_0, [x_dest4]
+ ldr q_d5_0, [x_dest5]
+ ldr q_d6_0, [x_dest6]
+ ldr q_gft5_lo, [x_tbl5]
+ ldr q_gft5_hi, [x_tbl5, #16]
+ ldr q_gft6_lo, [x_tbl6]
+ ldr q_gft6_hi, [x_tbl6, #16]
+
+ tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b
+
+ tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b
+ tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
+ eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
+ and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
+ eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b
+
+ str q_d4_0, [x_dest4]
+ str q_d5_0, [x_dest5]
+ str q_d6_0, [x_dest6]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+ASM_DEF_RODATA
+.balign 8
+const_tbl:
+ .dword 0x0000000000000000, 0x0000000000000000
+ .dword 0xffffffffffffffff, 0xffffffffffffffff
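The .lessthan16 path above is how the NEON mad kernels finish a length that is not a multiple of 16: they back up to the last full 16-byte window, which overlaps bytes the main loop already produced, and AND the freshly computed contribution with a mask taken from const_tbl (16 zero bytes followed by 16 0xff bytes), so only the not-yet-processed tail bytes actually change. A scalar sketch of that masked, overlapping update, assuming len >= 16 as the early return_fail check guarantees (names are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* Finish the last len % 16 bytes by re-reading the final 16-byte window. */
    static void mad_tail(const uint8_t tbl[32], const uint8_t *src,
                         uint8_t *dest, size_t len)
    {
        size_t done = len & ~(size_t)15;   /* bytes covered by the full-block loop */
        if (done == len)
            return;                        /* length was a multiple of 16 */
        size_t start = len - 16;           /* last window; overlaps [start, done) */
        for (size_t i = 0; i < 16; i++) {
            uint8_t prod = tbl[src[start + i] & 0x0f] ^ tbl[16 + (src[start + i] >> 4)];
            uint8_t mask = (start + i >= done) ? 0xff : 0x00;  /* role of const_tbl */
            dest[start + i] ^= prod & mask;   /* already-done bytes stay untouched */
        }
    }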
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S
new file mode 100644
index 0000000000..c4f372cd73
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S
@@ -0,0 +1,237 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_6vect_mad_sve)
+#ifndef __APPLE__
+.type gf_6vect_mad_sve, %function
+#endif
+
+/* gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest);
+ */
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x6
+x_dest2 .req x7
+x_dest3 .req x8
+x_dest4 .req x9
+x_dest5 .req x10
+x_dest6 .req x11
+x_dest1 .req x12
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+
+z_tmp_lo .req z4
+z_tmp_hi .req z5
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_gft6_lo .req z25
+z_gft6_hi .req z26
+q_gft6_lo .req q25
+q_gft6_hi .req q26
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+z_dest6 .req z31
+
+cdecl(gf_6vect_mad_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ /* load table 1 */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+ /* Load with NEON instruction ldp */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+ /* load table 2 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl]
+ /* load table 3 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl]
+ /* load table 4 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl]
+ /* load table 5 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl]
+ /* load table 6 */
+ add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */
+ ldp q_gft6_lo, q_gft6_hi, [x_tbl]
+
+ ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */
+ ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */
+ ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */
+ ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */
+ ldr x_dest5, [x_dest, #8*4] /* pointer to dest5 */
+ ldr x_dest6, [x_dest, #8*5] /* pointer to dest6 */
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ prfb pldl2strm, p0, [x_dest1, x_pos]
+ prfb pldl2strm, p0, [x_dest2, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest1.b, p0/z, [x_dest1, x_pos]
+ ld1b z_dest2.b, p0/z, [x_dest2, x_pos]
+
+ prfb pldl2strm, p0, [x_dest3, x_pos]
+ prfb pldl2strm, p0, [x_dest4, x_pos]
+
+ /* dest1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_tmp_lo.d, z_dest1.d
+ eor z_dest1.d, z_tmp_hi.d, z_dest1.d
+
+ ld1b z_dest3.b, p0/z, [x_dest3, x_pos]
+ ld1b z_dest4.b, p0/z, [x_dest4, x_pos]
+
+ prfb pldl2strm, p0, [x_dest5, x_pos]
+ prfb pldl2strm, p0, [x_dest6, x_pos]
+
+ /* dest2 */
+ tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_tmp_lo.d, z_dest2.d
+ eor z_dest2.d, z_tmp_hi.d, z_dest2.d
+
+ ld1b z_dest5.b, p0/z, [x_dest5, x_pos]
+ ld1b z_dest6.b, p0/z, [x_dest6, x_pos]
+
+ /* dest3 */
+ tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_tmp_lo.d, z_dest3.d
+ eor z_dest3.d, z_tmp_hi.d, z_dest3.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+
+ /* dest4 */
+ tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_tmp_lo.d, z_dest4.d
+ eor z_dest4.d, z_tmp_hi.d, z_dest4.d
+
+ /* dest5 */
+ tbl z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_tmp_lo.d, z_dest5.d
+ eor z_dest5.d, z_tmp_hi.d, z_dest5.d
+
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+
+ /* dest6 */
+ tbl z_tmp_lo.b, {z_gft6_lo.b}, z_src_lo.b
+ tbl z_tmp_hi.b, {z_gft6_hi.b}, z_src_hi.b
+ eor z_dest6.d, z_tmp_lo.d, z_dest6.d
+ eor z_dest6.d, z_tmp_hi.d, z_dest6.d
+
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+ st1b z_dest6.b, p0, [x_dest6, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
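The `add x_tbl, x_tbl, ..., LSL #5` steps in the SVE mad kernels encode the gftbls layout directly: each (destination, source) pair owns one 32-byte entry, the vec entries for a destination are contiguous, and the destinations follow one another. Advancing first by vec_i * 32 and then by vec * 32 per extra destination therefore lands on entry (j * vec + vec_i). The same arithmetic in C (an illustrative helper, not library code):

    #include <stddef.h>
    #include <stdint.h>

    /* 32-byte table entry for destination row j and source block vec_i;
     * mirrors "add x_tbl, x_tbl, x_vec_i, LSL #5" followed by j repetitions
     * of "add x_tbl, x_tbl, x_vec, LSL #5". */
    static inline const uint8_t *gf_tbl_entry(const uint8_t *gftbls,
                                              int vec, int vec_i, int j)
    {
        return gftbls + (size_t)32 * vec_i + (size_t)32 * vec * j;
    }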
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S
new file mode 100644
index 0000000000..0f74873de0
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S
@@ -0,0 +1,281 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_7vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_7vect_dot_prod_sve, %function
+#endif
+/* void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+
+x_tbl1 .req x8
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_tbl4 .req x11
+x_tbl5 .req x12
+x_tbl6 .req x13
+x_tbl7 .req x14
+
+x_dest1 .req x15
+
+/* x16, x17, x18, x29, x30: special-role registers, avoided */
+/* x19..x29 and SP must be preserved */
+x_dest2 .req x19
+x_dest3 .req x20
+x_dest4 .req x21
+x_dest5 .req x22
+x_dest6 .req x23
+x_dest7 .req x_dest /* reused */
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+z_gft7_lo .req z6
+z_gft7_hi .req z7
+q_gft7_lo .req q6
+q_gft7_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_dest7 .req z16
+
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_gft6_lo .req z25
+z_gft6_hi .req z26
+q_gft6_lo .req q25
+q_gft6_hi .req q26
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+z_dest6 .req z31
+
+cdecl(gf_7vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ /* save r19..r29 */
+ sub sp, sp, #48 /* alignment */
+ stp x19, x20, [sp]
+ stp x21, x22, [sp, #16]
+ str x23, [sp, #32]
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldp x_dest3, x_dest4, [x_dest, #8*2]
+ ldp x_dest5, x_dest6, [x_dest, #8*4]
+ ldr x_dest7, [x_dest, #8*6] /* x_dest7 reuses x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+ mov z_dest4.b, #0 /* clear z_dest4 */
+ mov z_dest5.b, #0 /* clear z_dest5 */
+ mov z_dest6.b, #0 /* clear z_dest6 */
+ mov z_dest7.b, #0 /* clear z_dest7 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
+ add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */
+ add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */
+ add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+	/* load gf tables */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is post-incremented by 32 for each source vector */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* calc for next and prefetch */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_gft1_hi.d, z_dest1.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ prfb pldl2keep, p0, [x_tbl3]
+ prfb pldl2keep, p0, [x_tbl4]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_gft2_hi.d, z_dest2.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_gft3_hi.d, z_dest3.d
+
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+ ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32
+ prfb pldl2keep, p0, [x_tbl5]
+ prfb pldl2keep, p0, [x_tbl6]
+
+ /* dest 4 */
+ tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_gft4_lo.d, z_dest4.d
+ eor z_dest4.d, z_gft4_hi.d, z_dest4.d
+
+ /* dest 5 */
+ tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_gft5_lo.d, z_dest5.d
+ eor z_dest5.d, z_gft5_hi.d, z_dest5.d
+
+ ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32
+ prfb pldl2keep, p0, [x_tbl7]
+
+ /* dest 6 */
+ tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b
+ tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b
+ eor z_dest6.d, z_gft6_lo.d, z_dest6.d
+ eor z_dest6.d, z_gft6_hi.d, z_dest6.d
+
+ /* dest 7 */
+ tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b
+ tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b
+ eor z_dest7.d, z_gft7_lo.d, z_dest7.d
+ eor z_dest7.d, z_gft7_hi.d, z_dest7.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+ st1b z_dest6.b, p0, [x_dest6, x_pos]
+ st1b z_dest7.b, p0, [x_dest7, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ /* restore r19..r29 */
+ ldr x23, [sp, #32]
+ ldp x21, x22, [sp, #16]
+ ldp x19, x20, [sp]
+ add sp, sp, #48
+
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
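One detail of the dot-product kernels that is easy to miss: `lsl x_vec, x_vec, #3` rescales vec once so the same register does double duty. As vec * 8 it is the byte size of the src pointer array, so the inner loop can advance x_vec_i by 8 (one 64-bit pointer) and compare it directly against x_vec; shifted left two more bits by the `LSL #2` in the x_tbl2..x_tbl7 setup it becomes vec * 32, the distance between consecutive per-destination table groups. The same bookkeeping restated in C (a sketch, not library code):

    #include <stddef.h>
    #include <stdint.h>

    static void gf_7vect_table_bases(const uint8_t *gftbls, size_t vec,
                                     const uint8_t *tbl[7])
    {
        size_t vec8 = vec << 3;     /* lsl x_vec, x_vec, #3: vec * 8 bytes of src pointers */
        tbl[0] = gftbls;            /* mov x_tbl1, x_tbl */
        for (int j = 1; j < 7; j++)
            tbl[j] = tbl[j - 1] + (vec8 << 2);   /* add ..., x_vec, LSL #2  ==  + vec * 32 */
        /* the source loop then runs: for (vec_i = 0; vec_i < vec8; vec_i += 8) */
    }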
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S
new file mode 100644
index 0000000000..20768f4889
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S
@@ -0,0 +1,307 @@
+/*************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_8vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_8vect_dot_prod_sve, %function
+#endif
+/* void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+ */
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2
+x_src .req x3
+x_dest .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+
+x_tbl1 .req x8
+x_tbl2 .req x9
+x_tbl3 .req x10
+x_tbl4 .req x11
+x_tbl5 .req x12
+x_tbl6 .req x13
+x_tbl7 .req x14
+
+x_dest1 .req x15
+
+/* x16, x17, x18, x29, x30: special-role registers, avoided */
+/* x19..x29 and SP must be preserved */
+x_dest2 .req x19
+x_dest3 .req x20
+x_dest4 .req x21
+x_dest5 .req x22
+x_dest6 .req x23
+x_dest7 .req x24
+x_dest8 .req x_dest /* reused */
+x_tbl8 .req x25
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest1 .req z3
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+z_gft7_lo .req z6
+z_gft7_hi .req z7
+q_gft7_lo .req q6
+q_gft7_hi .req q7
+
+/* bottom 64-bit of v8..v15 must be preserved if used */
+z_dest7 .req z8
+
+z_gft8_lo .req z9
+z_gft8_hi .req z10
+q_gft8_lo .req q9
+q_gft8_hi .req q10
+
+z_dest8 .req z16
+
+z_gft2_lo .req z17
+z_gft2_hi .req z18
+q_gft2_lo .req q17
+q_gft2_hi .req q18
+
+z_gft3_lo .req z19
+z_gft3_hi .req z20
+q_gft3_lo .req q19
+q_gft3_hi .req q20
+
+z_gft4_lo .req z21
+z_gft4_hi .req z22
+q_gft4_lo .req q21
+q_gft4_hi .req q22
+
+z_gft5_lo .req z23
+z_gft5_hi .req z24
+q_gft5_lo .req q23
+q_gft5_hi .req q24
+
+z_gft6_lo .req z25
+z_gft6_hi .req z26
+q_gft6_lo .req q25
+q_gft6_hi .req q26
+
+z_dest2 .req z27
+z_dest3 .req z28
+z_dest4 .req z29
+z_dest5 .req z30
+z_dest6 .req z31
+
+cdecl(gf_8vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ /* save r19..r29 */
+ sub sp, sp, #80 /* alignment */
+ stp x19, x20, [sp]
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+ stp d8, d9, [sp, #48]
+	str	d10, [sp, #64]		/* keep clear of d9, which sits at [sp, #56] */
+	str	x25, [sp, #72]
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+ lsl x_vec, x_vec, #3
+ ldp x_dest1, x_dest2, [x_dest, #8*0]
+ ldp x_dest3, x_dest4, [x_dest, #8*2]
+ ldp x_dest5, x_dest6, [x_dest, #8*4]
+ ldp x_dest7, x_dest8, [x_dest, #8*6] /* x_dest8 reuses x_dest */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ mov x_vec_i, #0 /* clear x_vec_i */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ mov z_dest1.b, #0 /* clear z_dest1 */
+ mov z_dest2.b, #0 /* clear z_dest2 */
+ mov z_dest3.b, #0 /* clear z_dest3 */
+ mov z_dest4.b, #0 /* clear z_dest4 */
+ mov z_dest5.b, #0 /* clear z_dest5 */
+ mov z_dest6.b, #0 /* clear z_dest6 */
+ mov z_dest7.b, #0 /* clear z_dest7 */
+ mov z_dest8.b, #0 /* clear z_dest8 */
+
+ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+ add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */
+ add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */
+ add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */
+ add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */
+ add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */
+ add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */
+ add x_tbl8, x_tbl7, x_vec, LSL #2 /* reset x_tbl8 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */
+	/* load gf tables */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 is post-incremented by 32 for each source vector */
+ ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
+
+ /* prefetch */
+ prfb pldl2keep, p0, [x_tbl1]
+ prfb pldl2keep, p0, [x_tbl2]
+
+ /* calc for next and prefetch */
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+
+ /* dest 1 */
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest1.d, z_gft1_lo.d, z_dest1.d
+ eor z_dest1.d, z_gft1_hi.d, z_dest1.d
+
+ ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
+ ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
+ prfb pldl2keep, p0, [x_tbl3]
+ prfb pldl2keep, p0, [x_tbl4]
+
+ /* dest 2 */
+ tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b
+ tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b
+ eor z_dest2.d, z_gft2_lo.d, z_dest2.d
+ eor z_dest2.d, z_gft2_hi.d, z_dest2.d
+
+ /* dest 3 */
+ tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b
+ tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b
+ eor z_dest3.d, z_gft3_lo.d, z_dest3.d
+ eor z_dest3.d, z_gft3_hi.d, z_dest3.d
+
+ ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32
+ ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32
+ prfb pldl2keep, p0, [x_tbl5]
+ prfb pldl2keep, p0, [x_tbl6]
+
+ /* dest 4 */
+ tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b
+ tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b
+ eor z_dest4.d, z_gft4_lo.d, z_dest4.d
+ eor z_dest4.d, z_gft4_hi.d, z_dest4.d
+
+ /* dest 5 */
+ tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b
+ tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b
+ eor z_dest5.d, z_gft5_lo.d, z_dest5.d
+ eor z_dest5.d, z_gft5_hi.d, z_dest5.d
+
+ ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32
+ ldp q_gft8_lo, q_gft8_hi, [x_tbl8], #32
+ prfb pldl2keep, p0, [x_tbl7]
+ prfb pldl2keep, p0, [x_tbl8]
+
+ /* dest 6 */
+ tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b
+ tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b
+ eor z_dest6.d, z_gft6_lo.d, z_dest6.d
+ eor z_dest6.d, z_gft6_hi.d, z_dest6.d
+
+ /* dest 7 */
+ tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b
+ tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b
+ eor z_dest7.d, z_gft7_lo.d, z_dest7.d
+ eor z_dest7.d, z_gft7_hi.d, z_dest7.d
+
+ /* dest 8 */
+ tbl z_gft8_lo.b, {z_gft8_lo.b}, z_src_lo.b
+ tbl z_gft8_hi.b, {z_gft8_hi.b}, z_src_hi.b
+ eor z_dest8.d, z_gft8_lo.d, z_dest8.d
+ eor z_dest8.d, z_gft8_hi.d, z_dest8.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+/* end of Loop 2 */
+
+ /* store dest data, governed by p0 */
+ st1b z_dest1.b, p0, [x_dest1, x_pos]
+ st1b z_dest2.b, p0, [x_dest2, x_pos]
+ st1b z_dest3.b, p0, [x_dest3, x_pos]
+ st1b z_dest4.b, p0, [x_dest4, x_pos]
+ st1b z_dest5.b, p0, [x_dest5, x_pos]
+ st1b z_dest6.b, p0, [x_dest6, x_pos]
+ st1b z_dest7.b, p0, [x_dest7, x_pos]
+ st1b z_dest8.b, p0, [x_dest8, x_pos]
+
+ /* increment one vector length */
+ incb x_pos
+ b .Lloopsve_vl
+/* end of Loop 1 */
+
+.return_pass:
+ /* restore r19..r29 */
+	ldr	x25, [sp, #72]
+	ldr	d10, [sp, #64]
+ ldp d8, d9, [sp, #48]
+ ldp x23, x24, [sp, #32]
+ ldp x21, x22, [sp, #16]
+ ldp x19, x20, [sp]
+ add sp, sp, #80
+
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
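For orientation, the call-site shape implied by the prototype in the comment block above: len counts bytes per block and must be at least 16 (shorter inputs take the return_fail path), src and dest are arrays of block pointers, and gftbls must provide 8 * vlen entries of 32 bytes each. The snippet below only demonstrates that layout with arbitrary table bytes; in real use the tables come from the library's table-initialization routine (ec_init_tables() upstream), which is assumed here rather than shown.

    #include <string.h>

    /* Prototype as given in the comment at the top of gf_8vect_dot_prod_sve.S. */
    void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
                               unsigned char **src, unsigned char **dest);

    enum { LEN = 4096, VLEN = 10, OUTS = 8 };

    static unsigned char gftbls[32 * VLEN * OUTS];
    static unsigned char srcbuf[VLEN][LEN], destbuf[OUTS][LEN];

    int main(void)
    {
        unsigned char *src[VLEN], *dest[OUTS];

        memset(gftbls, 1, sizeof(gftbls));   /* arbitrary contents: layout demo only */
        for (int v = 0; v < VLEN; v++) {
            memset(srcbuf[v], v + 1, LEN);
            src[v] = srcbuf[v];
        }
        for (int j = 0; j < OUTS; j++)
            dest[j] = destbuf[j];

        gf_8vect_dot_prod_sve(LEN, VLEN, gftbls, src, dest);   /* len >= 16 required */
        return 0;
    }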
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S
new file mode 100644
index 0000000000..4d17362894
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S
@@ -0,0 +1,303 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_vect_dot_prod_neon)
+#ifndef __APPLE__
+.type gf_vect_dot_prod_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_tbl .req x2
+x_src .req x3
+x_dest1 .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+x_tmp .req x8
+x_tbl1 .req x9
+
+/* vectors */
+v_gft1_lo .req v0
+v_gft1_hi .req v1
+q_gft1_lo .req q0
+q_gft1_hi .req q1
+v_mask0f .req v2
+q_mask0f .req q2
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+v_data_4 .req v12
+v_data_5 .req v13
+v_data_6 .req v14
+v_data_7 .req v15
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+q_data_4 .req q12
+q_data_5 .req q13
+q_data_6 .req q14
+q_data_7 .req q15
+
+v_data_0_lo .req v16
+v_data_1_lo .req v17
+v_data_2_lo .req v18
+v_data_3_lo .req v19
+v_data_4_lo .req v20
+v_data_5_lo .req v21
+v_data_6_lo .req v22
+v_data_7_lo .req v23
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+v_data_4_hi .req v_data_4
+v_data_5_hi .req v_data_5
+v_data_6_hi .req v_data_6
+v_data_7_hi .req v_data_7
+
+v_p0 .req v24
+v_p1 .req v25
+v_p2 .req v26
+v_p3 .req v27
+v_p4 .req v28
+v_p5 .req v29
+v_p6 .req v30
+v_p7 .req v31
+q_p0 .req q24
+q_p1 .req q25
+q_p2 .req q26
+q_p3 .req q27
+q_p4 .req q28
+q_p5 .req q29
+q_p6 .req q30
+q_p7 .req q31
+
+v_p .req v_p0
+q_p .req q_p0
+v_data .req v_p1
+q_data .req q_p1
+v_data_lo .req v_p2
+v_data_hi .req v_p3
+
+
+cdecl(gf_vect_dot_prod_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ mov x_pos, #0
+
+	lsl	x_vec, x_vec, #3	/* convert src count to byte offset */
+
+.Lloop128_init:
+ /* less than 128 bytes, goto Lloop16_init */
+ cmp x_len, #128
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_len, x_len, #128
+
+.Lloop128:
+ movi v_p0.16b, #0
+ movi v_p1.16b, #0
+ movi v_p2.16b, #0
+ movi v_p3.16b, #0
+ movi v_p4.16b, #0
+ movi v_p5.16b, #0
+ movi v_p6.16b, #0
+ movi v_p7.16b, #0
+
+ mov x_tbl1, x_tbl
+ mov x_vec_i, #0
+
+.Lloop128_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ add x_vec_i, x_vec_i, #8
+ add x_ptr, x_ptr, x_pos
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+
+ ldp q_data_0, q_data_1, [x_ptr], #32
+ ldp q_data_2, q_data_3, [x_ptr], #32
+ ldp q_data_4, q_data_5, [x_ptr], #32
+ ldp q_data_6, q_data_7, [x_ptr]
+
+ prfm pldl1keep, [x_tbl1]
+ prfm pldl1strm, [x_ptr]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+ and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
+ and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
+ and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
+ and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+ ushr v_data_4_hi.16b, v_data_4.16b, #4
+ ushr v_data_5_hi.16b, v_data_5.16b, #4
+ ushr v_data_6_hi.16b, v_data_6.16b, #4
+ ushr v_data_7_hi.16b, v_data_7.16b, #4
+
+ tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
+ tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
+ tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
+ tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
+
+ tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
+ tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
+ tbl v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
+ tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
+
+ eor v_p0.16b, v_data_0_lo.16b, v_p0.16b
+ eor v_p0.16b, v_p0.16b, v_data_0_hi.16b
+ eor v_p1.16b, v_data_1_lo.16b, v_p1.16b
+ eor v_p1.16b, v_p1.16b, v_data_1_hi.16b
+ eor v_p2.16b, v_data_2_lo.16b, v_p2.16b
+ eor v_p2.16b, v_p2.16b, v_data_2_hi.16b
+ eor v_p3.16b, v_data_3_lo.16b, v_p3.16b
+ eor v_p3.16b, v_p3.16b, v_data_3_hi.16b
+ eor v_p4.16b, v_data_4_lo.16b, v_p4.16b
+ eor v_p4.16b, v_p4.16b, v_data_4_hi.16b
+ eor v_p5.16b, v_data_5_lo.16b, v_p5.16b
+ eor v_p5.16b, v_p5.16b, v_data_5_hi.16b
+ eor v_p6.16b, v_data_6_lo.16b, v_p6.16b
+ eor v_p6.16b, v_p6.16b, v_data_6_hi.16b
+ eor v_p7.16b, v_data_7_lo.16b, v_p7.16b
+ eor v_p7.16b, v_p7.16b, v_data_7_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop128_vects
+
+.Lloop128_vects_end:
+ add x_ptr, x_dest1, x_pos
+ stp q_p0, q_p1, [x_ptr], #32
+ stp q_p2, q_p3, [x_ptr], #32
+ stp q_p4, q_p5, [x_ptr], #32
+ stp q_p6, q_p7, [x_ptr]
+
+ add x_pos, x_pos, #128
+ cmp x_pos, x_len
+ ble .Lloop128
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+
+ add x_len, x_len, #128
+ cmp x_pos, x_len
+ beq .return_pass
+
+.Lloop16_init:
+ sub x_len, x_len, #16
+ cmp x_pos, x_len
+ bgt .lessthan16_init
+
+.Lloop16:
+ movi v_p.16b, #0
+ mov x_tbl1, x_tbl
+ mov x_vec_i, #0
+
+.Lloop16_vects:
+ ldr x_ptr, [x_src, x_vec_i]
+ ldr q_data, [x_ptr, x_pos]
+ add x_vec_i, x_vec_i, #8
+
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_data_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_data_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_p.16b, v_data_lo.16b, v_p.16b
+ eor v_p.16b, v_p.16b, v_data_hi.16b
+
+ cmp x_vec_i, x_vec
+ blt .Lloop16_vects
+
+.Lloop16_vects_end:
+ str q_p, [x_dest1, x_pos]
+ add x_pos, x_pos, #16
+ cmp x_pos, x_len
+ ble .Lloop16
+
+.Lloop16_end:
+ sub x_tmp, x_pos, x_len
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16_init:
+ mov x_pos, x_len
+ b .Lloop16
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S
new file mode 100644
index 0000000000..48ce151fde
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S
@@ -0,0 +1,132 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_vect_dot_prod_sve)
+#ifndef __APPLE__
+.type gf_vect_dot_prod_sve, %function
+#endif
+/* void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+ */
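+
+/* Intended semantics, as a rough scalar sketch (reference only, assuming the
+ * usual 32-byte-per-source gftbls layout: 16 low-nibble products followed by
+ * 16 high-nibble products):
+ *
+ *	for (int i = 0; i < len; i++) {
+ *		unsigned char s = 0;
+ *		for (int k = 0; k < vlen; k++) {
+ *			unsigned char c = src[k][i];
+ *			const unsigned char *t = &gftbls[32 * k];
+ *			s ^= t[c & 0x0f] ^ t[16 + (c >> 4)];
+ *		}
+ *		dest[i] = s;
+ *	}
+ */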
+
+/* arguments */
+x_len .req x0 /* vector length */
+x_vec .req x1 /* number of source vectors (ie. data blocks) */
+x_tbl .req x2
+x_src .req x3
+x_dest1 .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_vec_i .req x5
+x_ptr .req x6
+x_pos .req x7
+x_tbl1 .req x8
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest .req z3
+
+z_gft1_lo .req z4
+z_gft1_hi .req z5
+q_gft1_lo .req q4
+q_gft1_hi .req q5
+
+cdecl(gf_vect_dot_prod_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+	lsl	x_vec, x_vec, #3	/* convert src count to byte offset */
+
+/* Loop 1: x_len, vector length */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ mov z_dest.b, #0 /* clear z_dest */
+ mov x_vec_i, #0 /* clear x_vec_i */
+ mov x_tbl1, x_tbl /* reset x_tbl1 */
+
+/* Loop 2: x_vec, number of source vectors (ie. data blocks) */
+.Lloopsve_vl_vects:
+ ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */
+
+ add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */
+
+ /* load gf_table */
+	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32	/* x_tbl1 advances by 32
+					bytes for each src vect */
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest.d, z_gft1_lo.d, z_dest.d
+ eor z_dest.d, z_gft1_hi.d, z_dest.d
+
+ cmp x_vec_i, x_vec
+ blt .Lloopsve_vl_vects
+
+ /* end of Loop 2 */
+ /* store dest data, governed by p0 */
+ st1b z_dest.b, p0, [x_dest1, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S
new file mode 100644
index 0000000000..bc2b957820
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S
@@ -0,0 +1,324 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_vect_mad_neon)
+#ifndef __APPLE__
+.type gf_vect_mad_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_src_end .req x6
+x_dest1 .req x_dest
+x_tmp .req x7
+x_const .req x8
+
+/* vectors */
+v_mask0f .req v0
+v_tmp .req v1
+q_tmp .req q1
+
+v_tmp1_lo .req v2
+v_tmp1_hi .req v3
+v_tmp2_lo .req v4
+v_tmp2_hi .req v5
+
+v_gft1_lo .req v6
+v_gft1_hi .req v7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+v_data_0 .req v8
+v_data_1 .req v9
+v_data_2 .req v10
+v_data_3 .req v11
+v_data_4 .req v12
+v_data_5 .req v13
+v_data_6 .req v14
+v_data_7 .req v15
+q_data_0 .req q8
+q_data_1 .req q9
+q_data_2 .req q10
+q_data_3 .req q11
+q_data_4 .req q12
+q_data_5 .req q13
+q_data_6 .req q14
+q_data_7 .req q15
+
+v_data_0_lo .req v16
+v_data_1_lo .req v17
+v_data_2_lo .req v18
+v_data_3_lo .req v19
+v_data_4_lo .req v20
+v_data_5_lo .req v21
+v_data_6_lo .req v22
+v_data_7_lo .req v23
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+v_data_4_hi .req v_data_4
+v_data_5_hi .req v_data_5
+v_data_6_hi .req v_data_6
+v_data_7_hi .req v_data_7
+
+v_d1_0 .req v24
+v_d1_1 .req v25
+v_d1_2 .req v26
+v_d1_3 .req v27
+v_d1_4 .req v28
+v_d1_5 .req v29
+v_d1_6 .req v30
+v_d1_7 .req v31
+q_d1_0 .req q24
+q_d1_1 .req q25
+q_d1_2 .req q26
+q_d1_3 .req q27
+q_d1_4 .req q28
+q_d1_5 .req q29
+q_d1_6 .req q30
+q_d1_7 .req q31
+
+v_data .req v_d1_1
+q_data .req q_d1_1
+v_data_lo .req v_d1_2
+v_data_hi .req v_d1_3
+
+
+cdecl(gf_vect_mad_neon):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ lsl x_vec_i, x_vec_i, #5
+ add x_tbl, x_tbl, x_vec_i
+ add x_src_end, x_src, x_len
+
+ ldr q_gft1_lo, [x_tbl]
+ ldr q_gft1_hi, [x_tbl, #16]
+
+.Lloop128_init:
+ /* less than 128 bytes, goto Lloop16_init */
+ cmp x_len, #128
+ blt .Lloop16_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #128
+
+.Lloop128:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ ldr q_data_4, [x_src, #16*4]
+ ldr q_data_5, [x_src, #16*5]
+ ldr q_data_6, [x_src, #16*6]
+ ldr q_data_7, [x_src, #16*7]
+
+ ldr q_d1_0, [x_dest1, #16*0]
+ ldr q_d1_1, [x_dest1, #16*1]
+ ldr q_d1_2, [x_dest1, #16*2]
+ ldr q_d1_3, [x_dest1, #16*3]
+ ldr q_d1_4, [x_dest1, #16*4]
+ ldr q_d1_5, [x_dest1, #16*5]
+ ldr q_d1_6, [x_dest1, #16*6]
+ ldr q_d1_7, [x_dest1, #16*7]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+ and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
+ and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
+ and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
+ and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+ ushr v_data_4_hi.16b, v_data_4.16b, #4
+ ushr v_data_5_hi.16b, v_data_5.16b, #4
+ ushr v_data_6_hi.16b, v_data_6.16b, #4
+ ushr v_data_7_hi.16b, v_data_7.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+
+ eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
+ eor v_d1_1.16b, v_tmp2_lo.16b, v_d1_1.16b
+ eor v_d1_1.16b, v_d1_1.16b, v_tmp2_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+
+ eor v_d1_2.16b, v_tmp1_lo.16b, v_d1_2.16b
+ eor v_d1_2.16b, v_d1_2.16b, v_tmp1_hi.16b
+ eor v_d1_3.16b, v_tmp2_lo.16b, v_d1_3.16b
+ eor v_d1_3.16b, v_d1_3.16b, v_tmp2_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
+ tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
+ tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
+
+ eor v_d1_4.16b, v_tmp1_lo.16b, v_d1_4.16b
+ eor v_d1_4.16b, v_d1_4.16b, v_tmp1_hi.16b
+ eor v_d1_5.16b, v_tmp2_lo.16b, v_d1_5.16b
+ eor v_d1_5.16b, v_d1_5.16b, v_tmp2_hi.16b
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
+ tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
+ tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
+
+ eor v_d1_6.16b, v_tmp1_lo.16b, v_d1_6.16b
+ eor v_d1_6.16b, v_d1_6.16b, v_tmp1_hi.16b
+ eor v_d1_7.16b, v_tmp2_lo.16b, v_d1_7.16b
+ eor v_d1_7.16b, v_d1_7.16b, v_tmp2_hi.16b
+
+ str q_d1_0, [x_dest1, #16*0]
+ str q_d1_1, [x_dest1, #16*1]
+ str q_d1_2, [x_dest1, #16*2]
+ str q_d1_3, [x_dest1, #16*3]
+ str q_d1_4, [x_dest1, #16*4]
+ str q_d1_5, [x_dest1, #16*5]
+ str q_d1_6, [x_dest1, #16*6]
+ str q_d1_7, [x_dest1, #16*7]
+
+ add x_src, x_src, #128
+ add x_dest1, x_dest1, #128
+ cmp x_src, x_src_end
+ bls .Lloop128
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #128
+
+.Lloop16_init:
+ sub x_src_end, x_src_end, #16
+ cmp x_src, x_src_end
+ bhi .lessthan16_init
+
+.Lloop16:
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
+
+ str q_d1_0, [x_dest1]
+
+ add x_dest1, x_dest1, #16
+ add x_src, x_src, #16
+ cmp x_src, x_src_end
+ bls .Lloop16
+
+.lessthan16_init:
+ sub x_tmp, x_src, x_src_end
+ cmp x_tmp, #16
+ beq .return_pass
+
+.lessthan16:
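+	/*
+	 * Tail handling: rewind src and dest so the last full 16-byte block
+	 * (ending exactly at the buffer end) is processed once more. The mask
+	 * loaded from const_tbl below is 0x00 for the x_tmp bytes already
+	 * written by .Lloop16 and 0xff for the remaining tail bytes, so the
+	 * xor only changes bytes that were not yet processed.
+	 */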
+ mov x_src, x_src_end
+ sub x_dest1, x_dest1, x_tmp
+
+#ifndef __APPLE__
+ adrp x_const, const_tbl
+ add x_const, x_const, :lo12:const_tbl
+#else
+ adrp x_const, const_tbl@PAGE
+ add x_const, x_const, const_tbl@PAGEOFF
+#endif
+ sub x_const, x_const, x_tmp
+ ldr q_tmp, [x_const, #16]
+
+ ldr q_data, [x_src]
+ ldr q_d1_0, [x_dest1]
+
+ and v_data_lo.16b, v_data.16b, v_mask0f.16b
+ ushr v_data_hi.16b, v_data.16b, #4
+
+ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
+ tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
+ eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
+ and v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp.16b
+ eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b
+
+ str q_d1_0, [x_dest1]
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
+
+ASM_DEF_RODATA
+.balign 8
+const_tbl:
+ .dword 0x0000000000000000, 0x0000000000000000
+ .dword 0xffffffffffffffff, 0xffffffffffffffff
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S
new file mode 100644
index 0000000000..41d6da9d9a
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S
@@ -0,0 +1,126 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_vect_mad_sve)
+#ifndef __APPLE__
+.type gf_vect_mad_sve, %function
+#endif
+
+/* gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char *dest);
+ */
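+
+/* Intended semantics, as a rough scalar sketch (reference only, assuming
+ * gftbls holds 32 bytes per source vector and only the slice for source
+ * vec_i is used; the product is accumulated into dest):
+ *
+ *	const unsigned char *t = &gftbls[32 * vec_i];
+ *	for (int i = 0; i < len; i++)
+ *		dest[i] ^= t[src[i] & 0x0f] ^ t[16 + (src[i] >> 4)];
+ */
+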
+/* arguments */
+x_len .req x0
+x_vec .req x1
+x_vec_i .req x2
+x_tbl .req x3
+x_src .req x4
+x_dest .req x5
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x6
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src
+
+z_dest .req z3
+
+z_tmp1_lo .req z4
+z_tmp1_hi .req z5
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+cdecl(gf_vect_mad_sve):
+ /* less than 16 bytes, return_fail */
+ cmp x_len, #16
+ blt .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */
+
+ /* Load with NEON instruction ldp */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+
+ mov x_pos, #0
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ /* prefetch dest data */
+ prfb pldl2strm, p0, [x_dest, x_pos]
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* load dest data, governed by p0 */
+ ld1b z_dest.b, p0/z, [x_dest, x_pos]
+
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest.d, z_tmp1_lo.d, z_dest.d
+ eor z_dest.d, z_tmp1_hi.d, z_dest.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest.b, p0, [x_dest, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S
new file mode 100644
index 0000000000..096b91dd29
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S
@@ -0,0 +1,240 @@
+/**************************************************************
+ Copyright (c) 2019 Huawei Technologies Co., Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "../include/aarch64_label.h"
+
+.text
+
+.global cdecl(gf_vect_mul_neon)
+#ifndef __APPLE__
+.type gf_vect_mul_neon, %function
+#endif
+
+/* arguments */
+x_len .req x0
+x_tbl .req x1
+x_src .req x2
+x_dest .req x3
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_dest1 .req x_dest
+x_src_end .req x4
+x_tmp .req x5
+
+/* vectors */
+v_mask0f .req v0
+
+v_gft1_lo .req v2
+v_gft1_hi .req v3
+q_gft1_lo .req q2
+q_gft1_hi .req q3
+
+v_data_0 .req v16
+v_data_1 .req v17
+v_data_2 .req v18
+v_data_3 .req v19
+v_data_4 .req v20
+v_data_5 .req v21
+v_data_6 .req v22
+v_data_7 .req v23
+q_data_0 .req q16
+q_data_1 .req q17
+q_data_2 .req q18
+q_data_3 .req q19
+q_data_4 .req q20
+q_data_5 .req q21
+q_data_6 .req q22
+q_data_7 .req q23
+
+v_data_0_lo .req v24
+v_data_1_lo .req v25
+v_data_2_lo .req v26
+v_data_3_lo .req v27
+v_data_4_lo .req v28
+v_data_5_lo .req v29
+v_data_6_lo .req v30
+v_data_7_lo .req v31
+v_data_0_hi .req v_data_0
+v_data_1_hi .req v_data_1
+v_data_2_hi .req v_data_2
+v_data_3_hi .req v_data_3
+v_data_4_hi .req v_data_4
+v_data_5_hi .req v_data_5
+v_data_6_hi .req v_data_6
+v_data_7_hi .req v_data_7
+
+
+cdecl(gf_vect_mul_neon):
+ /* less than 32 bytes, return_fail */
+ cmp x_len, #32
+ blt .return_fail
+
+ movi v_mask0f.16b, #0x0f
+ add x_src_end, x_src, x_len
+ ldr q_gft1_lo, [x_tbl]
+ ldr q_gft1_hi, [x_tbl, #16]
+
+
+.Lloop128_init:
+	/* less than 128 bytes, goto Lloop32_init */
+ cmp x_len, #128
+ blt .Lloop32_init
+
+ /* save d8 ~ d15 to stack */
+ sub sp, sp, #64
+ stp d8, d9, [sp]
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+
+ sub x_src_end, x_src_end, #128
+
+.Lloop128:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+ ldr q_data_2, [x_src, #16*2]
+ ldr q_data_3, [x_src, #16*3]
+ ldr q_data_4, [x_src, #16*4]
+ ldr q_data_5, [x_src, #16*5]
+ ldr q_data_6, [x_src, #16*6]
+ ldr q_data_7, [x_src, #16*7]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
+ and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
+ and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
+ and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
+ and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
+ and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b
+
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ ushr v_data_2_hi.16b, v_data_2.16b, #4
+ ushr v_data_3_hi.16b, v_data_3.16b, #4
+ ushr v_data_4_hi.16b, v_data_4.16b, #4
+ ushr v_data_5_hi.16b, v_data_5.16b, #4
+ ushr v_data_6_hi.16b, v_data_6.16b, #4
+ ushr v_data_7_hi.16b, v_data_7.16b, #4
+
+ tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
+ tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
+ tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
+ tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
+ tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
+ tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
+
+ tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
+ tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
+ tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
+ tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
+ tbl v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
+ tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
+
+ eor v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
+ eor v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
+ eor v_data_2.16b, v_data_2_hi.16b, v_data_2_lo.16b
+ eor v_data_3.16b, v_data_3_hi.16b, v_data_3_lo.16b
+ eor v_data_4.16b, v_data_4_hi.16b, v_data_4_lo.16b
+ eor v_data_5.16b, v_data_5_hi.16b, v_data_5_lo.16b
+ eor v_data_6.16b, v_data_6_hi.16b, v_data_6_lo.16b
+ eor v_data_7.16b, v_data_7_hi.16b, v_data_7_lo.16b
+
+ str q_data_0, [x_dest1, #16*0]
+ str q_data_1, [x_dest1, #16*1]
+ str q_data_2, [x_dest1, #16*2]
+ str q_data_3, [x_dest1, #16*3]
+ str q_data_4, [x_dest1, #16*4]
+ str q_data_5, [x_dest1, #16*5]
+ str q_data_6, [x_dest1, #16*6]
+ str q_data_7, [x_dest1, #16*7]
+
+ add x_src, x_src, #128
+ add x_dest1, x_dest1, #128
+ cmp x_src, x_src_end
+ bls .Lloop128
+
+.Lloop128_end:
+ /* restore d8 ~ d15 */
+ ldp d8, d9, [sp]
+ ldp d10, d11, [sp, #16]
+ ldp d12, d13, [sp, #32]
+ ldp d14, d15, [sp, #48]
+ add sp, sp, #64
+ add x_src_end, x_src_end, #128
+ cmp x_src, x_src_end
+ beq .return_pass
+
+.Lloop32_init:
+ sub x_src_end, x_src_end, #32
+ cmp x_src, x_src_end
+ bhi .return_fail
+
+.Lloop32:
+ ldr q_data_0, [x_src, #16*0]
+ ldr q_data_1, [x_src, #16*1]
+
+ and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
+ and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
+ ushr v_data_0_hi.16b, v_data_0.16b, #4
+ ushr v_data_1_hi.16b, v_data_1.16b, #4
+ tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
+ tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
+ tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
+ tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
+ eor v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b
+ eor v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b
+ str q_data_0, [x_dest1, #16*0]
+ str q_data_1, [x_dest1, #16*1]
+
+ add x_dest1, x_dest1, #32
+ add x_src, x_src, #32
+ cmp x_src, x_src_end
+ bls .Lloop32
+
+.Lloop32_end:
+ sub x_tmp, x_src, x_src_end
+ cmp x_tmp, #32
+ beq .return_pass
+ b .return_fail
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S
new file mode 100644
index 0000000000..d2219bf54c
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S
@@ -0,0 +1,123 @@
+/**************************************************************
+ Copyright (c) 2021 Linaro Ltd.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Huawei Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.text
+.align 6
+.arch armv8-a+sve
+
+#include "../include/aarch64_label.h"
+
+.global cdecl(gf_vect_mul_sve)
+#ifndef __APPLE__
+.type gf_vect_mul_sve, %function
+#endif
+
+/* Refer to include/gf_vect_mul.h
+ *
+ * @param len Length of vector in bytes. Must be aligned to 32B.
+ * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
+ * @param src Pointer to src data array. Must be aligned to 32B.
+ * @param dest Pointer to destination data array. Must be aligned to 32B.
+ * @returns 0 pass, other fail
+ *
+ * int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest);
+ */
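+
+/* Intended semantics, as a rough scalar sketch (reference only; gftbl is the
+ * 32-byte table for one constant: 16 low-nibble products followed by 16
+ * high-nibble products):
+ *
+ *	const unsigned char *s = src;
+ *	unsigned char *d = dest;
+ *	for (int i = 0; i < len; i++)
+ *		d[i] = gftbl[s[i] & 0x0f] ^ gftbl[16 + (s[i] >> 4)];
+ */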
+
+/* arguments */
+x_len .req x0
+x_tbl .req x1
+x_src .req x2
+x_dest .req x3
+x_tmp .req x4
+
+/* returns */
+w_ret .req w0
+
+/* local variables */
+x_pos .req x5
+
+/* vectors */
+z_mask0f .req z0
+
+z_src .req z1
+z_src_lo .req z2
+z_src_hi .req z_src /* reuse */
+
+z_dest .req z3
+z_tmp1_lo .req z4
+z_tmp1_hi .req z_dest /* reuse */
+
+z_gft1_lo .req z6
+z_gft1_hi .req z7
+q_gft1_lo .req q6
+q_gft1_hi .req q7
+
+cdecl(gf_vect_mul_sve):
+ /* len not aligned to 32B, return_fail */
+ and x_tmp, x_len, #0x1f
+ cmp x_tmp, #0
+ bne .return_fail
+
+ mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */
+ mov x_pos, #0
+
+ /* Load with NEON instruction ldp */
+ ldp q_gft1_lo, q_gft1_hi, [x_tbl]
+
+ /* vector length agnostic */
+.Lloopsve_vl:
+ whilelo p0.b, x_pos, x_len
+ b.none .return_pass
+
+ /* load src data, governed by p0 */
+ ld1b z_src.b, p0/z, [x_src, x_pos]
+
+ /* split 4-bit lo; 4-bit hi */
+ and z_src_lo.d, z_src.d, z_mask0f.d
+ lsr z_src_hi.b, z_src.b, #4
+
+ /* table indexing, ie. gf(2^8) multiplication */
+ tbl z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b
+ tbl z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b
+ /* exclusive or, ie. gf(2^8) add */
+ eor z_dest.d, z_tmp1_hi.d, z_tmp1_lo.d
+
+ /* store dest data, governed by p0 */
+ st1b z_dest.b, p0, [x_dest, x_pos]
+ /* increment one vector length */
+ incb x_pos
+
+ b .Lloopsve_vl
+
+.return_pass:
+ mov w_ret, #0
+ ret
+
+.return_fail:
+ mov w_ret, #1
+ ret
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ya.make b/contrib/libs/isa-l/erasure_code/aarch64/ya.make
new file mode 100644
index 0000000000..ba7f601cfa
--- /dev/null
+++ b/contrib/libs/isa-l/erasure_code/aarch64/ya.make
@@ -0,0 +1,51 @@
+LIBRARY()
+
+LICENSE(BSD-3-Clause)
+
+LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
+
+VERSION(2.31)
+
+NO_UTIL()
+
+NO_COMPILER_WARNINGS()
+
+ADDINCL(
+ contrib/libs/isa-l/include
+)
+
+IF(ARCH_AARCH64)
+CFLAGS(-D__ASSEMBLY__)
+SRCS(
+ ec_multibinary_arm.S
+ gf_2vect_dot_prod_neon.S
+ gf_2vect_dot_prod_sve.S
+ gf_2vect_mad_neon.S
+ gf_2vect_mad_sve.S
+ gf_3vect_dot_prod_neon.S
+ gf_3vect_dot_prod_sve.S
+ gf_3vect_mad_neon.S
+ gf_3vect_mad_sve.S
+ gf_4vect_dot_prod_neon.S
+ gf_4vect_dot_prod_sve.S
+ gf_4vect_mad_neon.S
+ gf_4vect_mad_sve.S
+ gf_5vect_dot_prod_neon.S
+ gf_5vect_dot_prod_sve.S
+ gf_5vect_mad_neon.S
+ gf_5vect_mad_sve.S
+ gf_6vect_dot_prod_sve.S
+ gf_6vect_mad_neon.S
+ gf_6vect_mad_sve.S
+ gf_7vect_dot_prod_sve.S
+ gf_8vect_dot_prod_sve.S
+ gf_vect_dot_prod_neon.S
+ gf_vect_dot_prod_sve.S
+ gf_vect_mad_neon.S
+ gf_vect_mad_sve.S
+ gf_vect_mul_neon.S
+ gf_vect_mul_sve.S
+)
+ENDIF()
+
+END()