author | Maxim Yurchuk <maxim-yurchuk@ydb.tech> | 2024-10-18 20:31:38 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-18 20:31:38 +0300 |
commit | 2a74bac2d2d3bccb4e10120f1ead805640ec9dd0 (patch) | |
tree | 047e4818ced5aaf73f58517629e5260b5291f9f0 /contrib/libs/isa-l/erasure_code/aarch64 | |
parent | 2d9656823e9521d8c29ea4c9a1d0eab78391abfc (diff) | |
parent | 3d834a1923bbf9403cd4a448e7f32b670aa4124f (diff) | |
download | ydb-2a74bac2d2d3bccb4e10120f1ead805640ec9dd0.tar.gz | |
Merge pull request #10502 from ydb-platform/mergelibs-241016-1210
Library import 241016-1210
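
This import adds the aarch64 backend of ISA-L's erasure-code module: NEON and SVE kernels for the gf_*vect_dot_prod and gf_*vect_mad primitives, a runtime dispatcher (ec_aarch64_dispatcher.c) that selects the SVE, NEON or base implementation from getauxval(AT_HWCAP), and the multibinary stubs (ec_multibinary_arm.S). The backend is not called directly; it is reached through ISA-L's public erasure-code API. The sketch below shows a typical caller, assuming the standard erasure_code.h interface; the block count, parity count and buffer length are illustrative, and encode_example is a hypothetical helper, not part of the library.

    /* Minimal caller-side sketch (K, P, LEN are illustrative values).
     * On aarch64 the multibinary layer in this diff routes ec_encode_data()
     * to the SVE, NEON or base implementation picked by the dispatcher. */
    #include <stdlib.h>
    #include <string.h>
    #include "erasure_code.h"

    enum { K = 10, P = 4, LEN = 4096 };

    int encode_example(void)
    {
            unsigned char *data[K], *coding[P];
            unsigned char encode_matrix[(K + P) * K];
            unsigned char g_tbls[K * P * 32];  /* 32 bytes per (parity row, data block) coefficient */

            for (int i = 0; i < K; i++) {
                    data[i] = malloc(LEN);
                    memset(data[i], i + 1, LEN);
            }
            for (int i = 0; i < P; i++)
                    coding[i] = malloc(LEN);

            gf_gen_cauchy1_matrix(encode_matrix, K + P, K);       /* rows K..K+P-1 are the parity rows */
            ec_init_tables(K, P, &encode_matrix[K * K], g_tbls);  /* expand coefficients into lookup tables */
            ec_encode_data(LEN, K, P, g_tbls, data, coding);      /* dispatched to *_sve / *_neon / base */

            for (int i = 0; i < K; i++) free(data[i]);
            for (int i = 0; i < P; i++) free(coding[i]);
            return 0;
    }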
Diffstat (limited to 'contrib/libs/isa-l/erasure_code/aarch64')
33 files changed, 8672 insertions, 0 deletions
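
The gf_*vect_dot_prod_* and gf_*vect_mad_* kernels in the diff below all implement GF(2^8) multiplication the way their own comments describe it: each source byte is split into a low and a high nibble, each nibble indexes a 16-entry table prepared by ec_init_tables (hence the 32-byte-per-coefficient `k * 32` strides in ec_aarch64_highlevel_func.c), and the two lookups are combined with XOR, which is addition in GF(2^8). The tbl/eor instruction sequences do this for 16 (NEON) or a vector length's worth (SVE) of bytes at once. A scalar model of the per-byte operation is sketched here for reference only; the function names are illustrative and do not exist in the library.

    /* Scalar model of what the NEON/SVE tbl+eor sequences compute per byte.
     * tbl32 is one 32-byte slice of g_tbls: bytes 0..15 hold c*x for the low
     * nibble of x, bytes 16..31 hold c*x for the high nibble of x. */
    static unsigned char gf_mul_byte(const unsigned char tbl32[32], unsigned char x)
    {
            unsigned char lo = tbl32[x & 0x0f];        /* low-nibble product  */
            unsigned char hi = tbl32[16 + (x >> 4)];   /* high-nibble product */
            return (unsigned char)(lo ^ hi);           /* GF(2^8) add == XOR  */
    }

    /* One output byte of a gf_vect_dot_prod kernel: accumulate the products
     * of all vlen source blocks at byte offset pos. */
    static unsigned char gf_dot_prod_byte(int vlen, const unsigned char *gftbls,
                                          unsigned char *const *src, int pos)
    {
            unsigned char acc = 0;
            for (int v = 0; v < vlen; v++)
                    acc ^= gf_mul_byte(gftbls + 32 * v, src[v][pos]);
            return acc;
    }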
diff --git a/contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt b/contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt new file mode 100644 index 0000000000..8f218b47cb --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/.yandex_meta/licenses.list.txt @@ -0,0 +1,164 @@ +====================BSD-3-Clause==================== + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +====================BSD-3-Clause==================== +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Arm Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +====================BSD-3-Clause==================== +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +====================BSD-3-Clause==================== +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +====================BSD-3-Clause==================== +ISA-L is licensed using a BSD 3-clause [license]. 
All code submitted to + + +====================BSD-3-Clause AND BSD-3-Clause AND BSD-3-Clause==================== + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +====================COPYRIGHT==================== + Copyright(c) 2011-2013 Intel Corporation All rights reserved. + + +====================COPYRIGHT==================== + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + +====================COPYRIGHT==================== +# Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + +====================COPYRIGHT==================== +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + +====================COPYRIGHT==================== +# Copyright(c) 2011-2018 Intel Corporation All rights reserved. + + +====================COPYRIGHT==================== +# Copyright(c) 2019 Arm Corporation All rights reserved. + + +====================COPYRIGHT==================== +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. diff --git a/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am b/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am new file mode 100644 index 0000000000..47bbf12d2b --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/Makefile.am @@ -0,0 +1,60 @@ +################################################################## +# Copyright (c) 2019 Huawei Technologies Co., Ltd. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Huawei Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc_aarch64 += \ + erasure_code/aarch64/ec_aarch64_highlevel_func.c \ + erasure_code/aarch64/ec_aarch64_dispatcher.c \ + erasure_code/aarch64/gf_vect_dot_prod_neon.S \ + erasure_code/aarch64/gf_2vect_dot_prod_neon.S \ + erasure_code/aarch64/gf_3vect_dot_prod_neon.S \ + erasure_code/aarch64/gf_4vect_dot_prod_neon.S \ + erasure_code/aarch64/gf_5vect_dot_prod_neon.S \ + erasure_code/aarch64/gf_vect_mad_neon.S \ + erasure_code/aarch64/gf_2vect_mad_neon.S \ + erasure_code/aarch64/gf_3vect_mad_neon.S \ + erasure_code/aarch64/gf_4vect_mad_neon.S \ + erasure_code/aarch64/gf_5vect_mad_neon.S \ + erasure_code/aarch64/gf_6vect_mad_neon.S \ + erasure_code/aarch64/gf_vect_mul_neon.S \ + erasure_code/aarch64/gf_vect_mad_sve.S \ + erasure_code/aarch64/gf_2vect_mad_sve.S \ + erasure_code/aarch64/gf_3vect_mad_sve.S \ + erasure_code/aarch64/gf_4vect_mad_sve.S \ + erasure_code/aarch64/gf_5vect_mad_sve.S \ + erasure_code/aarch64/gf_6vect_mad_sve.S \ + erasure_code/aarch64/gf_vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_2vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_3vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_4vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_5vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_6vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_7vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_8vect_dot_prod_sve.S \ + erasure_code/aarch64/gf_vect_mul_sve.S \ + erasure_code/aarch64/ec_multibinary_arm.S diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c new file mode 100644 index 0000000000..0a11604076 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_dispatcher.c @@ -0,0 +1,124 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <aarch64_multibinary.h> + +DEFINE_INTERFACE_DISPATCHER(gf_vect_dot_prod) +{ +#if defined(__linux__) + unsigned long auxval = getauxval(AT_HWCAP); + + if (auxval & HWCAP_SVE) + return PROVIDER_INFO(gf_vect_dot_prod_sve); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(gf_vect_dot_prod_neon); +#elif defined(__APPLE__) + if (sysctlEnabled(SYSCTL_SVE_KEY)) + return PROVIDER_INFO(gf_vect_dot_prod_sve); + return PROVIDER_INFO(gf_vect_dot_prod_neon); +#endif + return PROVIDER_BASIC(gf_vect_dot_prod); + +} + +DEFINE_INTERFACE_DISPATCHER(gf_vect_mad) +{ +#if defined(__linux__) + unsigned long auxval = getauxval(AT_HWCAP); + + if (auxval & HWCAP_SVE) + return PROVIDER_INFO(gf_vect_mad_sve); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(gf_vect_mad_neon); +#elif defined(__APPLE__) + if (sysctlEnabled(SYSCTL_SVE_KEY)) + return PROVIDER_INFO(gf_vect_mad_sve); + return PROVIDER_INFO(gf_vect_mad_neon); +#endif + return PROVIDER_BASIC(gf_vect_mad); + +} + +DEFINE_INTERFACE_DISPATCHER(ec_encode_data) +{ +#if defined(__linux__) + unsigned long auxval = getauxval(AT_HWCAP); + + if (auxval & HWCAP_SVE) + return PROVIDER_INFO(ec_encode_data_sve); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(ec_encode_data_neon); +#elif defined(__APPLE__) + if (sysctlEnabled(SYSCTL_SVE_KEY)) + return PROVIDER_INFO(ec_encode_data_sve); + return PROVIDER_INFO(ec_encode_data_neon); +#endif + return PROVIDER_BASIC(ec_encode_data); + +} + +DEFINE_INTERFACE_DISPATCHER(ec_encode_data_update) +{ +#if defined(__linux__) + unsigned long auxval = getauxval(AT_HWCAP); + + if (auxval & HWCAP_SVE) + return PROVIDER_INFO(ec_encode_data_update_sve); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(ec_encode_data_update_neon); +#elif defined(__APPLE__) + if (sysctlEnabled(SYSCTL_SVE_KEY)) + return PROVIDER_INFO(ec_encode_data_update_sve); + return PROVIDER_INFO(ec_encode_data_update_neon); +#endif + return PROVIDER_BASIC(ec_encode_data_update); + +} + +DEFINE_INTERFACE_DISPATCHER(gf_vect_mul) +{ +#if defined(__linux__) + unsigned long auxval = getauxval(AT_HWCAP); + + if (auxval & HWCAP_SVE) + return PROVIDER_INFO(gf_vect_mul_sve); + if (auxval & HWCAP_ASIMD) + return PROVIDER_INFO(gf_vect_mul_neon); +#elif defined(__APPLE__) + if (sysctlEnabled(SYSCTL_SVE_KEY)) + return PROVIDER_INFO(gf_vect_mul_sve); + return PROVIDER_INFO(gf_vect_mul_neon); +#endif + return PROVIDER_BASIC(gf_vect_mul); + +} + +DEFINE_INTERFACE_DISPATCHER(ec_init_tables) +{ + return PROVIDER_BASIC(ec_init_tables); +} diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c new file mode 100644 index 0000000000..e001fd72a0 
--- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_aarch64_highlevel_func.c @@ -0,0 +1,264 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include "erasure_code.h" + +/*external function*/ +extern void gf_vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); +extern void gf_2vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_3vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_4vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_5vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char *dest); +extern void gf_2vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_3vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_4vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_5vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_6vect_mad_neon(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + +void ec_encode_data_neon(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data, + unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_base(len, k, rows, g_tbls, data, coding); + return; + } + + while (rows > 5) { + gf_5vect_dot_prod_neon(len, k, g_tbls, data, coding); + g_tbls += 5 * k * 32; + coding += 5; + rows -= 5; + } + 
switch (rows) { + case 5: + gf_5vect_dot_prod_neon(len, k, g_tbls, data, coding); + break; + case 4: + gf_4vect_dot_prod_neon(len, k, g_tbls, data, coding); + break; + case 3: + gf_3vect_dot_prod_neon(len, k, g_tbls, data, coding); + break; + case 2: + gf_2vect_dot_prod_neon(len, k, g_tbls, data, coding); + break; + case 1: + gf_vect_dot_prod_neon(len, k, g_tbls, data, *coding); + break; + case 0: + break; + default: + break; + } +} + +void ec_encode_data_update_neon(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding); + return; + } + while (rows > 6) { + gf_6vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + switch (rows) { + case 6: + gf_6vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + break; + case 5: + gf_5vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + break; + case 4: + gf_4vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + break; + case 3: + gf_3vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + break; + case 2: + gf_2vect_mad_neon(len, k, vec_i, g_tbls, data, coding); + break; + case 1: + gf_vect_mad_neon(len, k, vec_i, g_tbls, data, *coding); + break; + case 0: + break; + } +} + +/* SVE */ +extern void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); +extern void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); +extern void gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char *dest); +extern void gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); +extern void gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + +void ec_encode_data_sve(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data, + unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_base(len, k, rows, g_tbls, data, coding); + return; + } + + while (rows > 11) { + gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + + switch (rows) { + case 11: + /* 7 + 4 */ + gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 7 * k 
* 32; + coding += 7; + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 10: + /* 6 + 4 */ + gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 9: + /* 5 + 4 */ + gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 5 * k * 32; + coding += 5; + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 8: + /* 4 + 4 */ + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + g_tbls += 4 * k * 32; + coding += 4; + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 7: + gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 6: + gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 5: + gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 4: + gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 3: + gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 2: + gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding); + break; + case 1: + gf_vect_dot_prod_sve(len, k, g_tbls, data, *coding); + break; + default: + break; + } +} + +void ec_encode_data_update_sve(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding); + return; + } + while (rows > 6) { + gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + switch (rows) { + case 6: + gf_6vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 5: + gf_5vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 4: + gf_4vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 3: + gf_3vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 2: + gf_2vect_mad_sve(len, k, vec_i, g_tbls, data, coding); + break; + case 1: + gf_vect_mad_sve(len, k, vec_i, g_tbls, data, *coding); + break; + default: + break; + } +} diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S b/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S new file mode 100644 index 0000000000..c276e63780 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/ec_multibinary_arm.S @@ -0,0 +1,37 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "aarch64_multibinary.h" + +mbin_interface ec_encode_data +mbin_interface gf_vect_mul +mbin_interface gf_vect_dot_prod +mbin_interface gf_vect_mad +mbin_interface ec_encode_data_update +mbin_interface ec_init_tables diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S new file mode 100644 index 0000000000..4ff7e7ce16 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_neon.S @@ -0,0 +1,402 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_2vect_dot_prod_neon) +#ifndef __APPLE__ +.type gf_2vect_dot_prod_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tmp .req x8 +x_tbl1 .req x9 +x_tbl2 .req x10 +x_dest1 .req x11 +x_dest2 .req x12 + +/* vectors */ +v_gft1_lo .req v0 +v_gft1_hi .req v1 +v_gft2_lo .req v2 +v_gft2_hi .req v3 +q_gft1_lo .req q0 +q_gft1_hi .req q1 +q_gft2_lo .req q2 +q_gft2_hi .req q3 + +v_mask0f .req v4 +q_mask0f .req q4 + +v_tmp1_lo .req v5 +v_tmp1_hi .req v6 +v_tmp1 .req v7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +v_data_4 .req v12 +v_data_5 .req v13 +v_data_6 .req v14 +v_data_7 .req v15 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 +q_data_4 .req q12 +q_data_5 .req q13 +q_data_6 .req q14 +q_data_7 .req q15 + +v_p1_0 .req v16 +v_p1_1 .req v17 +v_p1_2 .req v18 +v_p1_3 .req v19 +v_p1_4 .req v20 +v_p1_5 .req v21 +v_p1_6 .req v22 +v_p1_7 .req v23 +v_p2_0 .req v24 +v_p2_1 .req v25 +v_p2_2 .req v26 +v_p2_3 .req v27 +v_p2_4 .req v28 +v_p2_5 .req v29 +v_p2_6 .req v30 +v_p2_7 .req v31 + +q_p1_0 .req q16 +q_p1_1 .req q17 +q_p1_2 .req q18 +q_p1_3 .req q19 +q_p1_4 .req q20 +q_p1_5 .req q21 +q_p1_6 .req q22 +q_p1_7 .req q23 +q_p2_0 .req q24 +q_p2_1 .req q25 +q_p2_2 .req q26 +q_p2_3 .req q27 +q_p2_4 .req q28 +q_p2_5 .req q29 +q_p2_6 .req q30 +q_p2_7 .req q31 + +v_p1 .req v_p1_0 +q_p1 .req q_p1_0 +v_p2 .req v_p2_0 +q_p2 .req q_p2_0 +v_data .req v_p1_1 +q_data .req q_p1_1 +v_data_lo .req v_p1_2 +v_data_hi .req v_p1_3 + +cdecl(gf_2vect_dot_prod_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + +.Lloop128_init: + /* less than 128 bytes, goto Lloop16_init */ + cmp x_len, #128 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_len, x_len, #128 + +.Lloop128: + movi v_p1_0.16b, #0 + movi v_p1_1.16b, #0 + movi v_p1_2.16b, #0 + movi v_p1_3.16b, #0 + movi v_p1_4.16b, #0 + movi v_p1_5.16b, #0 + movi v_p1_6.16b, #0 + movi v_p1_7.16b, #0 + + movi v_p2_0.16b, #0 + movi v_p2_1.16b, #0 + movi v_p2_2.16b, #0 + movi v_p2_3.16b, #0 + movi v_p2_4.16b, #0 + movi v_p2_5.16b, #0 + movi v_p2_6.16b, #0 + movi v_p2_7.16b, #0 + + mov x_tbl1, x_tbl + add x_tbl2, x_tbl, x_vec, lsl #2 + mov x_vec_i, #0 + +.Lloop128_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + add x_ptr, x_ptr, x_pos + + ldp q_data_0, q_data_1, [x_ptr], #32 + ldp q_data_2, q_data_3, [x_ptr], #32 + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + ldp q_data_4, q_data_5, [x_ptr], #32 + ldp q_data_6, q_data_7, [x_ptr], #32 + prfm pldl1strm, [x_ptr] + prfm pldl1keep, [x_tbl1] + prfm pldl1keep, [x_tbl2] + + /* data_0 */ + and v_tmp1.16b, v_data_0.16b, v_mask0f.16b + ushr v_data_0.16b, v_data_0.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b + eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, 
v_data_0.16b + eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b + + /* data_1 */ + and v_tmp1.16b, v_data_1.16b, v_mask0f.16b + ushr v_data_1.16b, v_data_1.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b + eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b + eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b + eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b + eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b + + /* data_2 */ + and v_tmp1.16b, v_data_2.16b, v_mask0f.16b + ushr v_data_2.16b, v_data_2.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b + eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b + eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b + eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b + eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b + + /* data_3 */ + and v_tmp1.16b, v_data_3.16b, v_mask0f.16b + ushr v_data_3.16b, v_data_3.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b + eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b + eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b + eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b + eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b + + /* data_4 */ + and v_tmp1.16b, v_data_4.16b, v_mask0f.16b + ushr v_data_4.16b, v_data_4.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4.16b + eor v_p1_4.16b, v_tmp1_lo.16b, v_p1_4.16b + eor v_p1_4.16b, v_p1_4.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_4.16b + eor v_p2_4.16b, v_tmp1_lo.16b, v_p2_4.16b + eor v_p2_4.16b, v_p2_4.16b, v_tmp1_hi.16b + + /* data_5 */ + and v_tmp1.16b, v_data_5.16b, v_mask0f.16b + ushr v_data_5.16b, v_data_5.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_5.16b + eor v_p1_5.16b, v_tmp1_lo.16b, v_p1_5.16b + eor v_p1_5.16b, v_p1_5.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_5.16b + eor v_p2_5.16b, v_tmp1_lo.16b, v_p2_5.16b + eor v_p2_5.16b, v_p2_5.16b, v_tmp1_hi.16b + + /* data_6 */ + and v_tmp1.16b, v_data_6.16b, v_mask0f.16b + ushr v_data_6.16b, v_data_6.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6.16b + eor v_p1_6.16b, v_tmp1_lo.16b, v_p1_6.16b + eor v_p1_6.16b, v_p1_6.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_6.16b + eor v_p2_6.16b, v_tmp1_lo.16b, v_p2_6.16b + eor v_p2_6.16b, v_p2_6.16b, v_tmp1_hi.16b + + /* data_7 */ + and v_tmp1.16b, v_data_7.16b, v_mask0f.16b + ushr v_data_7.16b, v_data_7.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_7.16b + eor v_p1_7.16b, v_tmp1_lo.16b, v_p1_7.16b + eor v_p1_7.16b, v_p1_7.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_7.16b + eor v_p2_7.16b, v_tmp1_lo.16b, v_p2_7.16b + eor v_p2_7.16b, v_p2_7.16b, v_tmp1_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop128_vects + +.Lloop128_vects_end: + add x_ptr, x_dest1, 
x_pos + stp q_p1_0, q_p1_1, [x_ptr], #32 + stp q_p1_2, q_p1_3, [x_ptr], #32 + stp q_p1_4, q_p1_5, [x_ptr], #32 + stp q_p1_6, q_p1_7, [x_ptr] + + add x_ptr, x_dest2, x_pos + stp q_p2_0, q_p2_1, [x_ptr], #32 + stp q_p2_2, q_p2_3, [x_ptr], #32 + stp q_p2_4, q_p2_5, [x_ptr], #32 + stp q_p2_6, q_p2_7, [x_ptr] + + add x_pos, x_pos, #128 + cmp x_pos, x_len + ble .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_len, x_len, #128 + cmp x_pos, x_len + beq .return_pass + +.Lloop16_init: + sub x_len, x_len, #16 + cmp x_pos, x_len + bgt .lessthan16_init + +.Lloop16: + movi v_p1.16b, #0 + movi v_p2.16b, #0 + mov x_tbl1, x_tbl + add x_tbl2, x_tbl, x_vec, lsl #2 + mov x_vec_i, #0 + +.Lloop16_vects: + ldr x_ptr, [x_src, x_vec_i] + ldr q_data, [x_ptr, x_pos] + add x_vec_i, x_vec_i, #8 + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + eor v_p1.16b, v_tmp1_lo.16b, v_p1.16b + eor v_p1.16b, v_p1.16b, v_tmp1_hi.16b + + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + eor v_p2.16b, v_tmp1_lo.16b, v_p2.16b + eor v_p2.16b, v_p2.16b, v_tmp1_hi.16b + + cmp x_vec_i, x_vec + bne .Lloop16_vects + +.Lloop16_vects_end: + str q_p1, [x_dest1, x_pos] + str q_p2, [x_dest2, x_pos] + add x_pos, x_pos, #16 + cmp x_pos, x_len + ble .Lloop16 + +.Lloop16_end: + sub x_tmp, x_pos, x_len + cmp x_tmp, #16 + beq .return_pass + +.lessthan16_init: + mov x_pos, x_len + b .Lloop16 + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S new file mode 100644 index 0000000000..99b5f15cfb --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_dot_prod_sve.S @@ -0,0 +1,168 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_2vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_2vect_dot_prod_sve, %function +#endif +/* void gf_2vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_dest1 .req x10 +x_dest2 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_dest2 .req z27 + +cdecl(gf_2vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S new file mode 100644 index 0000000000..453524a221 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_neon.S @@ -0,0 +1,411 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_2vect_mad_neon) +#ifndef __APPLE__ +.type gf_2vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x7 +x_dest2 .req x8 +x_tmp .req x9 +x_tbl1 .req x10 +x_tbl2 .req x11 +x_const .req x12 + +/* vectors */ +v_mask0f .req v0 +v_tmp_lo .req v1 +v_tmp_hi .req v2 +v_tmp .req v3 +q_tmp .req q3 + +v_gft1_lo .req v4 +v_gft1_hi .req v5 +v_gft2_lo .req v6 +v_gft2_hi .req v7 +q_gft1_lo .req q4 +q_gft1_hi .req q5 +q_gft2_lo .req q6 +q_gft2_hi .req q7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +v_data_4 .req v12 +v_data_5 .req v13 +v_data_6 .req v14 +v_data_7 .req v15 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 +q_data_4 .req q12 +q_data_5 .req q13 +q_data_6 .req q14 +q_data_7 .req q15 + +v_data_0_lo .req v16 +v_data_1_lo .req v17 +v_data_2_lo .req v18 +v_data_3_lo .req v19 +v_data_4_lo .req v20 +v_data_5_lo .req v21 +v_data_6_lo .req v22 +v_data_7_lo .req v23 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 +v_data_4_hi .req v_data_4 +v_data_5_hi .req v_data_5 +v_data_6_hi .req v_data_6 +v_data_7_hi .req v_data_7 + +v_d0 .req v24 +v_d1 .req v25 +v_d2 .req v26 +v_d3 .req v27 +v_d4 .req v28 +v_d5 .req v29 +v_d6 .req v30 +v_d7 .req v31 +q_d0 .req q24 +q_d1 .req q25 +q_d2 .req q26 +q_d3 .req q27 +q_d4 .req q28 +q_d5 .req q29 +q_d6 .req q30 +q_d7 .req q31 + +v_data .req v16 +q_data .req q16 +v_data_lo .req v17 +v_data_hi .req v18 + + +cdecl(gf_2vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + lsl x_vec, x_vec, #5 + add x_tbl1, x_tbl, x_vec_i + add x_tbl2, x_tbl1, x_vec + add x_src_end, x_src, x_len + + ldr x_dest1, [x_dest] + ldr x_dest2, [x_dest, #8] + ldr q_gft1_lo, [x_tbl1] + ldr q_gft1_hi, [x_tbl1, #16] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + +.Lloop128_init: + /* less than 128 bytes, goto Lloop16_init */ + cmp x_len, #128 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #128 + +.Lloop128: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + ldr q_data_4, [x_src, #16*4] + ldr q_data_5, [x_src, #16*5] + ldr q_data_6, [x_src, #16*6] + ldr q_data_7, [x_src, #16*7] + + ldr q_d0, [x_dest1, #16*0] + ldr q_d1, [x_dest1, #16*1] + ldr q_d2, [x_dest1, #16*2] + ldr q_d3, [x_dest1, #16*3] + ldr q_d4, [x_dest1, #16*4] + ldr q_d5, [x_dest1, #16*5] + ldr q_d6, [x_dest1, #16*6] + ldr q_d7, [x_dest1, #16*7] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b + and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b + and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b + and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, 
v_data_3.16b, #4 + ushr v_data_4_hi.16b, v_data_4.16b, #4 + ushr v_data_5_hi.16b, v_data_5.16b, #4 + ushr v_data_6_hi.16b, v_data_6.16b, #4 + ushr v_data_7_hi.16b, v_data_7.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + eor v_d0.16b, v_tmp_lo.16b, v_d0.16b + eor v_d0.16b, v_d0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_d1.16b, v_tmp_lo.16b, v_d1.16b + eor v_d1.16b, v_d1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + eor v_d2.16b, v_tmp_lo.16b, v_d2.16b + eor v_d2.16b, v_d2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + eor v_d3.16b, v_tmp_lo.16b, v_d3.16b + eor v_d3.16b, v_d3.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b + eor v_d4.16b, v_tmp_lo.16b, v_d4.16b + eor v_d4.16b, v_d4.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b + eor v_d5.16b, v_tmp_lo.16b, v_d5.16b + eor v_d5.16b, v_d5.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b + eor v_d6.16b, v_tmp_lo.16b, v_d6.16b + eor v_d6.16b, v_d6.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b + eor v_d7.16b, v_tmp_lo.16b, v_d7.16b + eor v_d7.16b, v_d7.16b, v_tmp_hi.16b + + str q_d0, [x_dest1, #16*0] + str q_d1, [x_dest1, #16*1] + str q_d2, [x_dest1, #16*2] + str q_d3, [x_dest1, #16*3] + str q_d4, [x_dest1, #16*4] + str q_d5, [x_dest1, #16*5] + str q_d6, [x_dest1, #16*6] + str q_d7, [x_dest1, #16*7] + + ldr q_d0, [x_dest2, #16*0] + ldr q_d1, [x_dest2, #16*1] + ldr q_d2, [x_dest2, #16*2] + ldr q_d3, [x_dest2, #16*3] + ldr q_d4, [x_dest2, #16*4] + ldr q_d5, [x_dest2, #16*5] + ldr q_d6, [x_dest2, #16*6] + ldr q_d7, [x_dest2, #16*7] + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b + eor v_d0.16b, v_tmp_lo.16b, v_d0.16b + eor v_d0.16b, v_d0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b + eor v_d1.16b, v_tmp_lo.16b, v_d1.16b + eor v_d1.16b, v_d1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b + eor v_d2.16b, v_tmp_lo.16b, v_d2.16b + eor v_d2.16b, v_d2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b + eor v_d3.16b, v_tmp_lo.16b, v_d3.16b + eor v_d3.16b, v_d3.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_4_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_4_hi.16b + eor v_d4.16b, v_tmp_lo.16b, v_d4.16b + eor v_d4.16b, v_d4.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_5_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_5_hi.16b + eor v_d5.16b, v_tmp_lo.16b, v_d5.16b + eor v_d5.16b, v_d5.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_6_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_6_hi.16b + eor v_d6.16b, v_tmp_lo.16b, v_d6.16b + eor v_d6.16b, v_d6.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_7_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_7_hi.16b + eor 
v_d7.16b, v_tmp_lo.16b, v_d7.16b + eor v_d7.16b, v_d7.16b, v_tmp_hi.16b + + str q_d0, [x_dest2, #16*0] + str q_d1, [x_dest2, #16*1] + str q_d2, [x_dest2, #16*2] + str q_d3, [x_dest2, #16*3] + str q_d4, [x_dest2, #16*4] + str q_d5, [x_dest2, #16*5] + str q_d6, [x_dest2, #16*6] + str q_d7, [x_dest2, #16*7] + + add x_src, x_src, #128 + add x_dest1, x_dest1, #128 + add x_dest2, x_dest2, #128 + cmp x_src, x_src_end + bls .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #128 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + + ldr q_d0, [x_dest1] + ldr q_d1, [x_dest2] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d0.16b, v_tmp_lo.16b, v_d0.16b + eor v_d0.16b, v_d0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_d1.16b, v_tmp_lo.16b, v_d1.16b + eor v_d1.16b, v_d1.16b, v_tmp_hi.16b + + str q_d0, [x_dest1] + str q_d1, [x_dest2] + + add x_dest1, x_dest1, #16 + add x_dest2, x_dest2, #16 + add x_src, x_src, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + sub x_dest2, x_dest2, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d0, [x_dest1] + ldr q_d1, [x_dest2] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d0.16b, v_d0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d1.16b, v_d1.16b, v_tmp_hi.16b + + str q_d0, [x_dest1] + str q_d1, [x_dest2] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + +ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S new file mode 100644 index 0000000000..f0ddf01187 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_2vect_mad_sve.S @@ -0,0 +1,152 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_2vect_mad_sve) +#ifndef __APPLE__ +.type gf_2vect_mad_sve, %function +#endif + +/* gf_2vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_dest2 .req z27 + +cdecl(gf_2vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load table 1 with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* prefetch dest data */ + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S new file mode 100644 index 0000000000..cff34fc3dd --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_neon.S @@ -0,0 +1,361 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_3vect_dot_prod_neon) +#ifndef __APPLE__ +.type gf_3vect_dot_prod_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tmp .req x8 +x_dest1 .req x9 +x_tbl1 .req x10 +x_dest2 .req x11 +x_tbl2 .req x12 +x_dest3 .req x13 +x_tbl3 .req x14 + +/* vectors */ +v_gft1_lo .req v0 +v_gft1_hi .req v1 +v_gft2_lo .req v2 +v_gft2_hi .req v3 +v_gft3_lo .req v4 +v_gft3_hi .req v5 +q_gft1_lo .req q0 +q_gft1_hi .req q1 +q_gft2_lo .req q2 +q_gft2_hi .req q3 +q_gft3_lo .req q4 +q_gft3_hi .req q5 + +v_mask0f .req v6 +q_mask0f .req q6 +v_tmp1 .req v7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_tmp1_lo .req v12 +v_tmp1_hi .req v13 + +v_p1_0 .req v20 +v_p1_1 .req v21 +v_p1_2 .req v22 +v_p1_3 .req v23 +v_p2_0 .req v24 +v_p2_1 .req v25 +v_p2_2 .req v26 +v_p2_3 .req v27 +v_p3_0 .req v28 +v_p3_1 .req v29 +v_p3_2 .req v30 +v_p3_3 .req v31 + +q_p1_0 .req q20 +q_p1_1 .req q21 +q_p1_2 .req q22 +q_p1_3 .req q23 +q_p2_0 .req q24 +q_p2_1 .req q25 +q_p2_2 .req q26 +q_p2_3 .req q27 +q_p3_0 .req q28 +q_p3_1 .req q29 +q_p3_2 .req q30 +q_p3_3 .req q31 + +v_data .req v_p1_1 +q_data .req q_p1_1 +v_data_lo .req v_p1_2 +v_data_hi .req v_p1_3 + + +cdecl(gf_3vect_dot_prod_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_len, x_len, #64 + +.Lloop64: + movi v_p1_0.16b, #0 + movi v_p1_1.16b, #0 + movi v_p1_2.16b, #0 + movi v_p1_3.16b, #0 + movi v_p2_0.16b, #0 + movi v_p2_1.16b, #0 + movi v_p2_2.16b, #0 + movi v_p2_3.16b, #0 + movi v_p3_0.16b, #0 + movi v_p3_1.16b, #0 + movi v_p3_2.16b, #0 + movi v_p3_3.16b, #0 + + mov x_tbl1, x_tbl + add x_tbl2, x_tbl1, x_vec, lsl #2 + add x_tbl3, x_tbl2, x_vec, lsl #2 + mov x_vec_i, #0 + +.Lloop64_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + add x_ptr, x_ptr, x_pos + + ldr q_data_0, [x_ptr], #16 + ldr q_data_1, [x_ptr], #16 + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + + ldr q_data_2, [x_ptr], #16 + ldr q_data_3, [x_ptr], #16 + prfm pldl1strm, [x_ptr] + prfm pldl1keep, [x_tbl1] + prfm pldl1keep, [x_tbl2] + prfm pldl1keep, [x_tbl3] + + /* data_0 */ + and v_tmp1.16b, v_data_0.16b, v_mask0f.16b + ushr v_data_0.16b, v_data_0.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b + eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b + eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b + eor v_p3_0.16b, v_tmp1_lo.16b, 
v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b + + /* data_1 */ + and v_tmp1.16b, v_data_1.16b, v_mask0f.16b + ushr v_data_1.16b, v_data_1.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b + eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b + eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b + eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b + eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b + eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b + eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b + + /* data_2 */ + and v_tmp1.16b, v_data_2.16b, v_mask0f.16b + ushr v_data_2.16b, v_data_2.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b + eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b + eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b + eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b + eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b + eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b + eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b + + /* data_3 */ + and v_tmp1.16b, v_data_3.16b, v_mask0f.16b + ushr v_data_3.16b, v_data_3.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b + eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b + eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b + eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b + eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b + eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b + eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop64_vects + +.Lloop64_vects_end: + add x_ptr, x_dest1, x_pos + stp q_p1_0, q_p1_1, [x_ptr], #32 + stp q_p1_2, q_p1_3, [x_ptr] + + add x_ptr, x_dest2, x_pos + stp q_p2_0, q_p2_1, [x_ptr], #32 + stp q_p2_2, q_p2_3, [x_ptr] + + add x_ptr, x_dest3, x_pos + stp q_p3_0, q_p3_1, [x_ptr], #32 + stp q_p3_2, q_p3_3, [x_ptr] + + add x_pos, x_pos, #64 + cmp x_pos, x_len + ble .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_len, x_len, #64 + cmp x_pos, x_len + beq .return_pass + +.Lloop16_init: + sub x_len, x_len, #16 + cmp x_pos, x_len + bgt .lessthan16_init + +.Lloop16: + movi v_p1_0.16b, #0 + movi v_p2_0.16b, #0 + movi v_p3_0.16b, #0 + mov x_tbl1, x_tbl + add x_tbl2, x_tbl1, x_vec, lsl #2 + add x_tbl3, x_tbl2, x_vec, lsl #2 + mov x_vec_i, #0 + +.Lloop16_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + ldr q_data, [x_ptr, x_pos] + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl 
v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + + eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b + eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b + eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b + + cmp x_vec_i, x_vec + bne .Lloop16_vects + +.Lloop16_vects_end: + str q_p1_0, [x_dest1, x_pos] + str q_p2_0, [x_dest2, x_pos] + str q_p3_0, [x_dest3, x_pos] + add x_pos, x_pos, #16 + cmp x_pos, x_len + ble .Lloop16 + +.Lloop16_end: + sub x_tmp, x_pos, x_len + cmp x_tmp, #16 + beq .return_pass + +.lessthan16_init: + mov x_pos, x_len + b .Lloop16 + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S new file mode 100644 index 0000000000..8f6414ee52 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_dot_prod_sve.S @@ -0,0 +1,189 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_3vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_3vect_dot_prod_sve, %function +#endif +/* void gf_3vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_dest1 .req x11 +x_dest2 .req x12 +x_dest3 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_dest2 .req z27 +z_dest3 .req z28 + +cdecl(gf_3vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldr x_dest3, [x_dest, #8*2] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + prfb pldl2keep, p0, [x_tbl3] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S new file mode 100644 index 0000000000..fcfeec1e23 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_neon.S @@ -0,0 +1,391 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_3vect_mad_neon) +#ifndef __APPLE__ +.type gf_3vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x7 +x_dest2 .req x8 +x_dest3 .req x_dest +x_tmp .req x10 +x_tbl1 .req x11 +x_tbl2 .req x12 +x_tbl3 .req x13 +x_const .req x14 + +/* vectors */ +v_mask0f .req v0 +v_tmp_lo .req v1 +v_tmp_hi .req v2 +v_tmp .req v3 +q_tmp .req q3 + +v_gft1_lo .req v4 +v_gft1_hi .req v5 +v_gft2_lo .req v6 +v_gft2_hi .req v7 +v_gft3_lo .req v16 +v_gft3_hi .req v17 +q_gft1_lo .req q4 +q_gft1_hi .req q5 +q_gft2_lo .req q6 +q_gft2_hi .req q7 +q_gft3_lo .req q16 +q_gft3_hi .req q17 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_data_0_lo .req v12 +v_data_1_lo .req v13 +v_data_2_lo .req v14 +v_data_3_lo .req v15 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 + +v_d1_0 .req v20 +v_d1_1 .req v21 +v_d1_2 .req v22 +v_d1_3 .req v23 +v_d2_0 .req v24 +v_d2_1 .req v25 +v_d2_2 .req v26 +v_d2_3 .req v27 +v_d3_0 .req v28 +v_d3_1 .req v29 +v_d3_2 .req v30 +v_d3_3 .req v31 +q_d1_0 .req q20 +q_d1_1 .req q21 +q_d1_2 .req q22 +q_d1_3 .req q23 +q_d2_0 .req q24 +q_d2_1 .req q25 +q_d2_2 .req q26 +q_d2_3 .req q27 +q_d3_0 .req q28 +q_d3_1 .req q29 +q_d3_2 .req q30 +q_d3_3 .req q31 + +v_data .req v21 +q_data .req q21 +v_data_lo .req v22 +v_data_hi .req v23 + +cdecl(gf_3vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + lsl x_vec, x_vec, #5 + add x_tbl1, x_tbl, x_vec_i + add x_tbl2, x_tbl1, x_vec + add x_tbl3, x_tbl2, x_vec + add x_src_end, x_src, x_len + ldr x_dest1, [x_dest] + ldr x_dest2, [x_dest, #8] + ldr x_dest3, [x_dest, #16] + ldr q_gft1_lo, [x_tbl1] + ldr q_gft1_hi, [x_tbl1, #16] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #64 + +.Lloop64: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + add x_src, x_src, #64 + + ldr q_d1_0, [x_dest1, #16*0] + ldr q_d1_1, [x_dest1, #16*1] + ldr q_d1_2, [x_dest1, #16*2] + ldr q_d1_3, [x_dest1, #16*3] + + ldr q_d2_0, [x_dest2, #16*0] + ldr q_d2_1, [x_dest2, #16*1] + ldr q_d2_2, [x_dest2, #16*2] + ldr q_d2_3, [x_dest2, #16*3] + + ldr q_d3_0, [x_dest3, #16*0] + ldr q_d3_1, [x_dest3, #16*1] + ldr q_d3_2, [x_dest3, #16*2] + ldr q_d3_3, [x_dest3, #16*3] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, 
v_data_0_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b + eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b + eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b + eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b + eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b + eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b + eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b + eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b + eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b + eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b + eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b + eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b + eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b + eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b + eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b + eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1, #16*0] + str q_d1_1, [x_dest1, #16*1] + str q_d1_2, [x_dest1, #16*2] + str q_d1_3, [x_dest1, #16*3] + add x_dest1, x_dest1, #64 + + str q_d2_0, [x_dest2, #16*0] + str q_d2_1, [x_dest2, #16*1] + str q_d2_2, [x_dest2, #16*2] + str q_d2_3, [x_dest2, #16*3] + add x_dest2, x_dest2, #64 + + str q_d3_0, [x_dest3, #16*0] + str q_d3_1, [x_dest3, #16*1] + str q_d3_2, [x_dest3, #16*2] + str q_d3_3, [x_dest3, #16*3] + add x_dest3, x_dest3, #64 + + cmp x_src, x_src_end + bls .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #64 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b 
+ eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + + add x_src, x_src, #16 + add x_dest1, x_dest1, #16 + add x_dest2, x_dest2, #16 + add x_dest3, x_dest3, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + sub x_dest2, x_dest2, x_tmp + sub x_dest3, x_dest3, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + +ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S new file mode 100644 index 0000000000..9e0ca5c4b3 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_3vect_mad_sve.S @@ -0,0 +1,175 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_3vect_mad_sve) +#ifndef __APPLE__ +.type gf_3vect_mad_sve, %function +#endif + +/* gf_3vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_dest2 .req z27 +z_dest3 .req z28 + +cdecl(gf_3vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load table 1 with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* dest data prefetch */ + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + prfb pldl2strm, p0, [x_dest3, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* store dest data, governed by p0 */ + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S new file mode 100644 index 0000000000..6204102f68 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_neon.S @@ -0,0 +1,425 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_4vect_dot_prod_neon) +#ifndef __APPLE__ +.type gf_4vect_dot_prod_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tmp .req x8 +x_dest1 .req x9 +x_tbl1 .req x10 +x_dest2 .req x11 +x_tbl2 .req x12 +x_dest3 .req x13 +x_tbl3 .req x14 +x_dest4 .req x_dest +x_tbl4 .req x15 + +/* vectors */ +v_mask0f .req v0 +q_mask0f .req q0 +v_tmp1_lo .req v1 +v_tmp1_hi .req v2 +v_tmp1 .req v3 +q_tmp1 .req q3 + +v_p1_0 .req v4 +v_p2_0 .req v5 +v_p3_0 .req v6 +v_p4_0 .req v7 + +q_p1_0 .req q4 +q_p2_0 .req q5 +q_p3_0 .req q6 +q_p4_0 .req q7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_p1_3 .req v12 +v_p2_3 .req v13 +v_p3_3 .req v14 +v_p4_3 .req v15 +q_p1_3 .req q12 +q_p2_3 .req q13 +q_p3_3 .req q14 +q_p4_3 .req q15 + +v_gft1_lo .req v16 +v_gft1_hi .req v17 +v_gft2_lo .req v18 +v_gft2_hi .req v19 +v_gft3_lo .req v20 +v_gft3_hi .req v21 +v_gft4_lo .req v22 +v_gft4_hi .req v23 +q_gft1_lo .req q16 +q_gft1_hi .req q17 +q_gft2_lo .req q18 +q_gft2_hi .req q19 +q_gft3_lo .req q20 +q_gft3_hi .req q21 +q_gft4_lo .req q22 +q_gft4_hi .req q23 + +v_p1_1 .req v24 +v_p1_2 .req v25 +v_p2_1 .req v26 +v_p2_2 .req v27 +v_p3_1 .req v28 +v_p3_2 .req v29 +v_p4_1 .req v30 +v_p4_2 .req v31 + +q_p1_1 .req q24 +q_p1_2 .req q25 +q_p2_1 .req q26 +q_p2_2 .req q27 +q_p3_1 .req q28 +q_p3_2 .req q29 +q_p4_1 .req q30 +q_p4_2 .req q31 + +v_data .req v_tmp1 +q_data .req q_tmp1 +v_data_lo .req v_tmp1_lo +v_data_hi .req v_tmp1_hi + +cdecl(gf_4vect_dot_prod_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + ldr x_dest4, [x_dest, #8*3] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_len, x_len, #64 + +.Lloop64: + movi v_p1_0.16b, #0 + movi v_p1_1.16b, #0 + movi v_p1_2.16b, #0 + movi v_p1_3.16b, #0 + movi v_p2_0.16b, #0 + movi v_p2_1.16b, #0 + movi v_p2_2.16b, #0 + movi v_p2_3.16b, #0 + movi v_p3_0.16b, #0 + movi v_p3_1.16b, #0 + movi v_p3_2.16b, #0 + movi v_p3_3.16b, #0 + movi v_p4_0.16b, #0 + movi v_p4_1.16b, #0 + movi v_p4_2.16b, #0 + movi v_p4_3.16b, #0 + + mov x_tbl1, x_tbl + add x_tbl2, x_tbl1, x_vec, lsl #2 + add x_tbl3, x_tbl2, x_vec, lsl #2 + add x_tbl4, x_tbl3, x_vec, lsl #2 + mov x_vec_i, #0 + prfm pldl1keep, [x_tbl1] + prfm pldl1keep, [x_tbl2] + prfm pldl1keep, [x_tbl3] + prfm pldl1keep, [x_tbl4] + +.Lloop64_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + add x_ptr, x_ptr, x_pos + + ldr q_data_0, [x_ptr], #16 + ldr q_data_1, [x_ptr], #16 + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + ldr q_data_2, [x_ptr], #16 + ldr q_data_3, [x_ptr], #16 + + prfm pldl1strm, [x_ptr] + prfm pldl1keep, [x_tbl1] + prfm pldl1keep, [x_tbl2] + prfm pldl1keep, [x_tbl3] + prfm pldl1keep, [x_tbl4] + + /* data_0 
*/ + and v_tmp1.16b, v_data_0.16b, v_mask0f.16b + ushr v_data_0.16b, v_data_0.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b + eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b + eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b + eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b + eor v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b + eor v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b + + /* data_1 */ + and v_tmp1.16b, v_data_1.16b, v_mask0f.16b + ushr v_data_1.16b, v_data_1.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b + eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b + eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b + eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b + eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b + eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b + eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b + eor v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b + eor v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b + + /* data_2 */ + and v_tmp1.16b, v_data_2.16b, v_mask0f.16b + ushr v_data_2.16b, v_data_2.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b + eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b + eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b + eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b + eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b + eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b + eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b + eor v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b + eor v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b + + /* data_3 */ + and v_tmp1.16b, v_data_3.16b, v_mask0f.16b + ushr v_data_3.16b, v_data_3.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b + eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b + eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b + eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b + eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b + eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b + eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b + + tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b + eor v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b + eor v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop64_vects + +.Lloop64_vects_end: + add x_ptr, x_dest1, x_pos + stp q_p1_0, q_p1_1, [x_ptr], 
#32 + stp q_p1_2, q_p1_3, [x_ptr] + + add x_ptr, x_dest2, x_pos + stp q_p2_0, q_p2_1, [x_ptr], #32 + stp q_p2_2, q_p2_3, [x_ptr] + + add x_ptr, x_dest3, x_pos + stp q_p3_0, q_p3_1, [x_ptr], #32 + stp q_p3_2, q_p3_3, [x_ptr] + + add x_ptr, x_dest4, x_pos + stp q_p4_0, q_p4_1, [x_ptr], #32 + stp q_p4_2, q_p4_3, [x_ptr] + + add x_pos, x_pos, #64 + cmp x_pos, x_len + ble .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_len, x_len, #64 + cmp x_pos, x_len + beq .return_pass + +.Lloop16_init: + sub x_len, x_len, #16 + cmp x_pos, x_len + bgt .lessthan16_init + +.Lloop16: + movi v_p1_0.16b, #0 + movi v_p2_0.16b, #0 + movi v_p3_0.16b, #0 + movi v_p4_0.16b, #0 + mov x_tbl1, x_tbl + add x_tbl2, x_tbl1, x_vec, lsl #2 + add x_tbl3, x_tbl2, x_vec, lsl #2 + add x_tbl4, x_tbl3, x_vec, lsl #2 + mov x_vec_i, #0 + +.Lloop16_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + ldr q_data, [x_ptr, x_pos] + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + + prfm pldl1keep, [x_tbl1] + prfm pldl1keep, [x_tbl2] + prfm pldl1keep, [x_tbl3] + prfm pldl1keep, [x_tbl4] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + + eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b + eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b + eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b + eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b + eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b + + cmp x_vec_i, x_vec + bne .Lloop16_vects + +.Lloop16_vects_end: + str q_p1_0, [x_dest1, x_pos] + str q_p2_0, [x_dest2, x_pos] + str q_p3_0, [x_dest3, x_pos] + str q_p4_0, [x_dest4, x_pos] + add x_pos, x_pos, #16 + cmp x_pos, x_len + ble .Lloop16 + +.Lloop16_end: + sub x_tmp, x_pos, x_len + cmp x_tmp, #16 + beq .return_pass + +.lessthan16_init: + mov x_pos, x_len + b .Lloop16 + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S new file mode 100644 index 0000000000..eb354279f8 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_dot_prod_sve.S @@ -0,0 +1,208 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_4vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_4vect_dot_prod_sve, %function +#endif +/* void gf_4vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_dest1 .req x12 +x_dest2 .req x13 +x_dest3 .req x14 +x_dest4 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 + +cdecl(gf_4vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + +/* Loop 2: x_vec, number of source vectors (ie. 
data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_dest4.d, z_gft4_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S new file mode 100644 index 0000000000..ebf82e7ffe --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_neon.S @@ -0,0 +1,464 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_4vect_mad_neon) +#ifndef __APPLE__ +.type gf_4vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x7 +x_dest2 .req x8 +x_dest3 .req x9 +x_dest4 .req x_dest +x_tmp .req x10 +x_tbl1 .req x11 +x_tbl2 .req x12 +x_tbl3 .req x13 +x_tbl4 .req x14 +x_const .req x15 + +/* vectors */ +v_mask0f .req v0 +v_tmp_lo .req v1 +v_tmp_hi .req v2 +v_tmp .req v3 +q_tmp .req q3 + +v_gft1_lo .req v4 +v_gft1_hi .req v5 +v_gft2_lo .req v6 +v_gft2_hi .req v7 +v_gft3_lo .req v16 +v_gft3_hi .req v17 +v_gft4_lo .req v18 +v_gft4_hi .req v19 +q_gft1_lo .req q4 +q_gft1_hi .req q5 +q_gft2_lo .req q6 +q_gft2_hi .req q7 +q_gft3_lo .req q16 +q_gft3_hi .req q17 +q_gft4_lo .req q18 +q_gft4_hi .req q19 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_data_0_lo .req v12 +v_data_1_lo .req v13 +v_data_2_lo .req v14 +v_data_3_lo .req v15 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 + +v_d1_0 .req v20 +v_d1_1 .req v21 +v_d1_2 .req v22 +v_d1_3 .req v23 +v_d2_0 .req v24 +v_d2_1 .req v25 +v_d2_2 .req v26 +v_d2_3 .req v27 +v_d3_0 .req v28 +v_d3_1 .req v29 +v_d3_2 .req v30 +v_d3_3 .req v31 +q_d1_0 .req q20 +q_d1_1 .req q21 +q_d1_2 .req q22 +q_d1_3 .req q23 +q_d2_0 .req q24 +q_d2_1 .req q25 +q_d2_2 .req q26 +q_d2_3 .req q27 +q_d3_0 .req q28 +q_d3_1 .req q29 +q_d3_2 .req q30 +q_d3_3 .req q31 + +v_d4_0 .req v_d1_0 +v_d4_1 .req v_d1_1 +v_d4_2 .req v_d1_2 +v_d4_3 .req v_d1_3 +q_d4_0 .req q_d1_0 +q_d4_1 .req q_d1_1 +q_d4_2 .req q_d1_2 +q_d4_3 .req q_d1_3 + +v_data .req v21 +q_data .req q21 +v_data_lo .req v22 +v_data_hi .req v23 + +cdecl(gf_4vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + lsl x_vec, x_vec, #5 + add x_tbl1, x_tbl, x_vec_i + add x_tbl2, x_tbl1, x_vec + add x_tbl3, x_tbl2, x_vec + add x_tbl4, x_tbl3, x_vec + add x_src_end, x_src, x_len + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + ldr x_dest4, [x_dest, #8*3] + ldr q_gft1_lo, [x_tbl1] + ldr q_gft1_hi, [x_tbl1, #16] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + ldr q_gft4_lo, [x_tbl4] + ldr q_gft4_hi, [x_tbl4, #16] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #64 + +.Lloop64: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, 
#16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + add x_src, x_src, #64 + + ldr q_d1_0, [x_dest1, #16*0] + ldr q_d1_1, [x_dest1, #16*1] + ldr q_d1_2, [x_dest1, #16*2] + ldr q_d1_3, [x_dest1, #16*3] + + ldr q_d2_0, [x_dest2, #16*0] + ldr q_d2_1, [x_dest2, #16*1] + ldr q_d2_2, [x_dest2, #16*2] + ldr q_d2_3, [x_dest2, #16*3] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + + /* dest1 */ + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b + eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b + eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b + eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b + + /* dest2 */ + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b + eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b + eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b + eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b + eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b + eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b + eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1, #16*0] + str q_d1_1, [x_dest1, #16*1] + str q_d1_2, [x_dest1, #16*2] + str q_d1_3, [x_dest1, #16*3] + add x_dest1, x_dest1, #64 + + str q_d2_0, [x_dest2, #16*0] + str q_d2_1, [x_dest2, #16*1] + str q_d2_2, [x_dest2, #16*2] + str q_d2_3, [x_dest2, #16*3] + add x_dest2, x_dest2, #64 + + ldr q_d3_0, [x_dest3, #16*0] + ldr q_d3_1, [x_dest3, #16*1] + ldr q_d3_2, [x_dest3, #16*2] + ldr q_d3_3, [x_dest3, #16*3] + + ldr q_d4_0, [x_dest4, #16*0] + ldr q_d4_1, [x_dest4, #16*1] + ldr q_d4_2, [x_dest4, #16*2] + ldr q_d4_3, [x_dest4, #16*3] + + /* dest3 */ + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b + eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b + eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b + eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b + eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, 
v_data_3_hi.16b + eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b + eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b + + /* dest4 */ + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b + eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b + eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b + eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b + eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b + eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b + eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b + + str q_d3_0, [x_dest3, #16*0] + str q_d3_1, [x_dest3, #16*1] + str q_d3_2, [x_dest3, #16*2] + str q_d3_3, [x_dest3, #16*3] + add x_dest3, x_dest3, #64 + + str q_d4_0, [x_dest4, #16*0] + str q_d4_1, [x_dest4, #16*1] + str q_d4_2, [x_dest4, #16*2] + str q_d4_3, [x_dest4, #16*3] + add x_dest4, x_dest4, #64 + + cmp x_src, x_src_end + bls .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #64 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_d4_0, [x_dest4] + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + str q_d3_0, [x_dest3] + str q_d4_0, [x_dest4] + + add x_src, x_src, #16 + add x_dest1, x_dest1, #16 + add x_dest2, x_dest2, #16 + add x_dest3, x_dest3, #16 + add x_dest4, x_dest4, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + sub x_dest2, x_dest2, x_tmp + sub x_dest3, x_dest3, x_tmp + sub x_dest4, x_dest4, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, 
v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_d4_0, [x_dest4] + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + str q_d3_0, [x_dest3] + str q_d4_0, [x_dest4] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + +ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S new file mode 100644 index 0000000000..89ec89f5c6 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_4vect_mad_sve.S @@ -0,0 +1,194 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_4vect_mad_sve) +#ifndef __APPLE__ +.type gf_4vect_mad_sve, %function +#endif + +/* gf_4vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest4 .req x9 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 + +cdecl(gf_4vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load table 1 with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + /* load table 4 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft4_lo, q_gft4_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + prfb pldl2strm, p0, [x_dest3, x_pos] + prfb pldl2strm, p0, [x_dest4, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + ld1b z_dest4.b, p0/z, [x_dest4, x_pos] + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + + /* dest4 */ + tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_tmp_lo.d, z_dest4.d + eor z_dest4.d, z_tmp_hi.d, z_dest4.d + + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S new file mode 100644 index 0000000000..13166665d6 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_neon.S @@ -0,0 +1,484 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_5vect_dot_prod_neon) +#ifndef __APPLE__ +.type gf_5vect_dot_prod_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tmp .req x8 +x_dest1 .req x9 +x_dest2 .req x10 +x_dest3 .req x11 +x_dest4 .req x12 +x_dest5 .req x13 + +/* vectors */ +v_tmp1 .req v0 +q_tmp1 .req q0 +v_tmp2 .req v1 +q_tmp2 .req q1 + +v_mask0f .req v_tmp1 +q_mask0f .req q_tmp1 +v_tmp_lo .req v_tmp1 +v_tmp_hi .req v_tmp2 + +v_gft_lo .req v2 +v_gft_hi .req v3 +q_gft_lo .req q2 +q_gft_hi .req q3 + +v_p1_0 .req v4 +v_p2_0 .req v5 +v_p3_0 .req v6 +v_p4_0 .req v7 + +q_p1_0 .req q4 +q_p2_0 .req q5 +q_p3_0 .req q6 +q_p4_0 .req q7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_data_0_lo .req v12 +v_data_1_lo .req v13 +v_data_2_lo .req v14 +v_data_3_lo .req v15 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 + +v_p5_0 .req v16 +v_p1_1 .req v17 +v_p2_1 .req v18 +v_p3_1 .req v19 +v_p4_1 .req v20 +v_p5_1 .req v21 +v_p1_2 .req v22 +v_p2_2 .req v23 +v_p3_2 .req v24 +v_p4_2 .req v25 +v_p5_2 .req v26 +v_p1_3 .req v27 +v_p2_3 .req v28 +v_p3_3 .req v29 +v_p4_3 .req v30 +v_p5_3 .req v31 + +q_p5_0 .req q16 +q_p1_1 .req q17 +q_p2_1 .req q18 +q_p3_1 .req q19 +q_p4_1 .req q20 +q_p5_1 .req q21 +q_p1_2 .req q22 +q_p2_2 .req q23 +q_p3_2 .req q24 +q_p4_2 .req q25 +q_p5_2 .req q26 +q_p1_3 .req q27 +q_p2_3 .req q28 +q_p3_3 .req q29 +q_p4_3 .req q30 +q_p5_3 .req q31 + +v_data .req v_p1_1 +q_data .req q_p1_1 +v_data_lo .req v_p2_1 +v_data_hi .req v_p3_1 + +v_gft1_lo .req v_p4_1 +v_gft1_hi .req v_p5_1 +v_gft2_lo .req v_p1_2 +v_gft2_hi .req v_p2_2 +v_gft3_lo .req v_p3_2 +v_gft3_hi .req v_p4_2 +v_gft4_lo .req v_p5_2 +v_gft4_hi .req v_p1_3 +v_gft5_lo .req v_p2_3 +v_gft5_hi .req v_p3_3 +q_gft1_lo .req q_p4_1 +q_gft1_hi .req q_p5_1 +q_gft2_lo .req q_p1_2 +q_gft2_hi .req q_p2_2 +q_gft3_lo .req q_p3_2 +q_gft3_hi .req q_p4_2 +q_gft4_lo .req q_p5_2 +q_gft4_hi .req q_p1_3 +q_gft5_lo .req q_p2_3 +q_gft5_hi .req q_p3_3 + + +cdecl(gf_5vect_dot_prod_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + ldr x_dest4, [x_dest, #8*3] + ldr x_dest5, [x_dest, #8*4] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_len, x_len, #64 + +.Lloop64: + movi v_p1_0.16b, #0 + movi v_p1_1.16b, #0 + movi v_p1_2.16b, #0 + movi v_p1_3.16b, #0 + movi v_p2_0.16b, #0 + movi v_p2_1.16b, #0 + movi v_p2_2.16b, #0 + movi v_p2_3.16b, #0 + movi v_p3_0.16b, #0 + movi v_p3_1.16b, #0 + movi v_p3_2.16b, #0 + movi v_p3_3.16b, #0 + movi v_p4_0.16b, #0 + movi v_p4_1.16b, #0 + movi v_p4_2.16b, #0 + movi v_p4_3.16b, #0 + movi v_p5_0.16b, #0 + movi v_p5_1.16b, #0 + movi v_p5_2.16b, #0 + movi v_p5_3.16b, #0 + mov x_vec_i, #0 + +.Lloop64_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_ptr, x_ptr, x_pos + + ldr q_data_0, [x_ptr], #16 + ldr q_data_1, [x_ptr], #16 + ldr 
q_data_2, [x_ptr], #16 + ldr q_data_3, [x_ptr], #16 + prfm pldl2keep, [x_ptr] + + movi v_mask0f.16b, #0x0f + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + + /* v_p1_x */ + add x_tmp, x_tbl, x_vec_i, lsl #2 + add x_vec_i, x_vec_i, #8 + ldp q_gft_lo, q_gft_hi, [x_tmp] + prfm pldl3keep, [x_tmp, #32] + add x_tmp, x_tmp, x_vec, lsl #2 + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + eor v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b + eor v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b + eor v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + eor v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b + eor v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b + eor v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b + + /* v_p2_x */ + ldp q_gft_lo, q_gft_hi, [x_tmp] + prfm pldl3keep, [x_tmp, #32] + add x_tmp, x_tmp, x_vec, lsl #2 + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + eor v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b + eor v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + eor v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b + eor v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b + eor v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b + + /* v_p3_x */ + ldp q_gft_lo, q_gft_hi, [x_tmp] + prfm pldl3keep, [x_tmp, #32] + add x_tmp, x_tmp, x_vec, lsl #2 + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + eor v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b + eor v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + eor v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b + eor v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b + eor v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b + + /* v_p4_x */ + ldp q_gft_lo, q_gft_hi, [x_tmp] + prfm pldl3keep, [x_tmp, #32] + add x_tmp, x_tmp, x_vec, lsl #2 + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + eor v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b + eor v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl 
v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b + eor v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + eor v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b + eor v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b + eor v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b + + /* v_p5_x */ + ldp q_gft_lo, q_gft_hi, [x_tmp] + prfm pldl3keep, [x_tmp, #32] + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + eor v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b + eor v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b + eor v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + eor v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b + eor v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b + eor v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop64_vects + +.Lloop64_vects_end: + add x_ptr, x_dest1, x_pos + stp q_p1_0, q_p1_1, [x_ptr], #32 + stp q_p1_2, q_p1_3, [x_ptr] + + add x_ptr, x_dest2, x_pos + stp q_p2_0, q_p2_1, [x_ptr], #32 + stp q_p2_2, q_p2_3, [x_ptr] + + add x_ptr, x_dest3, x_pos + stp q_p3_0, q_p3_1, [x_ptr], #32 + stp q_p3_2, q_p3_3, [x_ptr] + + add x_ptr, x_dest4, x_pos + stp q_p4_0, q_p4_1, [x_ptr], #32 + stp q_p4_2, q_p4_3, [x_ptr] + + add x_ptr, x_dest5, x_pos + stp q_p5_0, q_p5_1, [x_ptr], #32 + stp q_p5_2, q_p5_3, [x_ptr] + + add x_pos, x_pos, #64 + cmp x_pos, x_len + ble .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_len, x_len, #64 + cmp x_pos, x_len + beq .return_pass + +.Lloop16_init: + sub x_len, x_len, #16 + cmp x_pos, x_len + bgt .lessthan16_init + +.Lloop16: + movi v_p1_0.16b, #0 + movi v_p2_0.16b, #0 + movi v_p3_0.16b, #0 + movi v_p4_0.16b, #0 + movi v_p5_0.16b, #0 + mov x_vec_i, #0 + +.Lloop16_vects: + ldr x_ptr, [x_src, x_vec_i] + ldr q_data, [x_ptr, x_pos] + + movi v_mask0f.16b, #0x0f + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + add x_tmp, x_tbl, x_vec_i, lsl #2 + add x_vec_i, x_vec_i, #8 + ldp q_gft1_lo, q_gft1_hi, [x_tmp] + add x_tmp, x_tmp, x_vec, lsl #2 + ldp q_gft2_lo, q_gft2_hi, [x_tmp] + add x_tmp, x_tmp, x_vec, lsl #2 + ldp q_gft3_lo, q_gft3_hi, [x_tmp] + add x_tmp, x_tmp, x_vec, lsl #2 + ldp q_gft4_lo, q_gft4_hi, [x_tmp] + add x_tmp, x_tmp, x_vec, lsl #2 + ldp q_gft5_lo, q_gft5_hi, [x_tmp] + + tbl v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + tbl v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + tbl v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + tbl v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + tbl v_gft5_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b + tbl v_gft5_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b + + eor v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b + 
eor v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b + eor v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b + eor v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b + eor v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b + eor v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b + eor v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b + eor v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b + eor v_p5_0.16b, v_gft5_hi.16b, v_p5_0.16b + eor v_p5_0.16b, v_p5_0.16b, v_gft5_lo.16b + + cmp x_vec_i, x_vec + bne .Lloop16_vects + +.Lloop16_vects_end: + str q_p1_0, [x_dest1, x_pos] + str q_p2_0, [x_dest2, x_pos] + str q_p3_0, [x_dest3, x_pos] + str q_p4_0, [x_dest4, x_pos] + str q_p5_0, [x_dest5, x_pos] + add x_pos, x_pos, #16 + cmp x_pos, x_len + ble .Lloop16 + +.Lloop16_end: + sub x_tmp, x_pos, x_len + cmp x_tmp, #16 + beq .return_pass + +.lessthan16_init: + mov x_pos, x_len + b .Lloop16 + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S new file mode 100644 index 0000000000..bb7cd0184e --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_dot_prod_sve.S @@ -0,0 +1,237 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_5vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_5vect_dot_prod_sve, %function +#endif +/* void gf_5vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_dest1 .req x13 +x_dest2 .req x14 +x_dest4 .req x15 +x_dest5 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest3 .req x19 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 + +cdecl(gf_5vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #16 /* alignment */ + str x19, [sp] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldr x_dest5, [x_dest, #8*4] + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + prfb pldl2keep, p0, [x_tbl5] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_dest4.d, z_gft4_hi.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_dest5.d, z_gft5_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x19, [sp] + add sp, sp, #16 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S new file mode 100644 index 0000000000..473e4c5774 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_neon.S @@ -0,0 +1,544 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_5vect_mad_neon) +#ifndef __APPLE__ +.type gf_5vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x7 +x_dest2 .req x8 +x_dest3 .req x9 +x_dest4 .req x10 +x_dest5 .req x_dest +x_tmp .req x11 +x_tbl1 .req x12 +x_tbl2 .req x13 +x_tbl3 .req x14 +x_tbl4 .req x15 +x_tbl5 .req x16 +x_const .req x17 + +/* vectors */ +v_mask0f .req v0 +v_tmp_lo .req v1 +v_tmp_hi .req v2 +v_tmp .req v3 +q_tmp .req q3 + +v_gft1_lo .req v4 +v_gft1_hi .req v5 +v_gft2_lo .req v6 +v_gft2_hi .req v7 +v_gft3_lo .req v16 +v_gft3_hi .req v17 +q_gft1_lo .req q4 +q_gft1_hi .req q5 +q_gft2_lo .req q6 +q_gft2_hi .req q7 +q_gft3_lo .req q16 +q_gft3_hi .req q17 + +v_gft4_lo .req v18 +v_gft4_hi .req v19 +q_gft4_lo .req q18 +q_gft4_hi .req q19 +v_gft5_lo .req v_gft2_lo +v_gft5_hi .req v_gft2_hi +q_gft5_lo .req q_gft2_lo +q_gft5_hi .req q_gft2_hi + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_data_0_lo .req v12 +v_data_1_lo .req v13 +v_data_2_lo .req v14 +v_data_3_lo .req v15 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 + +v_d1_0 .req v20 +v_d1_1 .req v21 +v_d1_2 .req v22 +v_d1_3 .req v23 +v_d2_0 .req v24 +v_d2_1 .req v25 +v_d2_2 .req v26 +v_d2_3 .req v27 +v_d3_0 .req v28 +v_d3_1 .req v29 +v_d3_2 .req v30 +v_d3_3 .req v31 +q_d1_0 .req q20 +q_d1_1 .req q21 +q_d1_2 .req q22 +q_d1_3 .req q23 +q_d2_0 .req q24 +q_d2_1 .req q25 +q_d2_2 .req q26 +q_d2_3 .req q27 +q_d3_0 .req q28 +q_d3_1 .req q29 +q_d3_2 .req q30 +q_d3_3 .req q31 + +v_d4_0 .req v_d1_0 +v_d4_1 .req v_d1_1 +v_d4_2 .req v_d1_2 +v_d4_3 .req v_d1_3 +q_d4_0 .req q_d1_0 +q_d4_1 .req q_d1_1 +q_d4_2 .req q_d1_2 +q_d4_3 .req q_d1_3 +v_d5_0 .req v_d2_0 +v_d5_1 .req v_d2_1 +v_d5_2 .req v_d2_2 +v_d5_3 .req v_d2_3 +q_d5_0 .req q_d2_0 +q_d5_1 .req q_d2_1 +q_d5_2 .req q_d2_2 +q_d5_3 .req q_d2_3 + +v_data .req v21 +q_data .req q21 +v_data_lo .req v22 +v_data_hi .req v23 + +cdecl(gf_5vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + lsl x_vec, x_vec, #5 + add x_tbl1, x_tbl, x_vec_i + add x_tbl2, x_tbl1, x_vec + add x_tbl3, x_tbl2, x_vec + add x_tbl4, x_tbl3, x_vec + add x_tbl5, x_tbl4, x_vec + add x_src_end, x_src, x_len + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + ldr x_dest4, [x_dest, #8*3] + ldr x_dest5, [x_dest, #8*4] + ldr q_gft1_lo, [x_tbl1] + ldr q_gft1_hi, [x_tbl1, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + ldr q_gft4_lo, [x_tbl4] + ldr q_gft4_hi, [x_tbl4, #16] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #64 + +.Lloop64: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + add x_src, x_src, #64 + + ldr q_d1_0, [x_dest1, #16*0] + ldr q_d1_1, [x_dest1, #16*1] + ldr q_d1_2, [x_dest1, #16*2] + ldr q_d1_3, [x_dest1, #16*3] + + ldr 
q_d2_0, [x_dest2, #16*0] + ldr q_d2_1, [x_dest2, #16*1] + ldr q_d2_2, [x_dest2, #16*2] + ldr q_d2_3, [x_dest2, #16*3] + + ldr q_d3_0, [x_dest3, #16*0] + ldr q_d3_1, [x_dest3, #16*1] + ldr q_d3_2, [x_dest3, #16*2] + ldr q_d3_3, [x_dest3, #16*3] + + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + + /* dest1 */ + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b + eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b + eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b + eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b + + /* dest2 */ + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b + eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b + eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b + eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b + eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b + eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b + eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b + + /* dest3 */ + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b + eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b + eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b + eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b + eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b + eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b + eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1, #16*0] + str q_d1_1, [x_dest1, #16*1] + str q_d1_2, [x_dest1, #16*2] + str q_d1_3, [x_dest1, #16*3] + add x_dest1, x_dest1, #64 + + str q_d2_0, [x_dest2, #16*0] + str q_d2_1, [x_dest2, #16*1] + str q_d2_2, [x_dest2, #16*2] + str q_d2_3, [x_dest2, #16*3] + add x_dest2, x_dest2, #64 + + str q_d3_0, [x_dest3, #16*0] + str q_d3_1, [x_dest3, #16*1] + str q_d3_2, [x_dest3, #16*2] + str q_d3_3, [x_dest3, #16*3] + add x_dest3, x_dest3, #64 + + ldr q_d4_0, [x_dest4, #16*0] + ldr q_d4_1, 
[x_dest4, #16*1] + ldr q_d4_2, [x_dest4, #16*2] + ldr q_d4_3, [x_dest4, #16*3] + + ldr q_d5_0, [x_dest5, #16*0] + ldr q_d5_1, [x_dest5, #16*1] + ldr q_d5_2, [x_dest5, #16*2] + ldr q_d5_3, [x_dest5, #16*3] + + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + + /* dest4 */ + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b + eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b + eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b + eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b + eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b + eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b + eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b + + /* dest5 */ + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b + eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b + eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b + eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b + eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b + eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b + eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b + eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4, #16*0] + str q_d4_1, [x_dest4, #16*1] + str q_d4_2, [x_dest4, #16*2] + str q_d4_3, [x_dest4, #16*3] + add x_dest4, x_dest4, #64 + + str q_d5_0, [x_dest5, #16*0] + str q_d5_1, [x_dest5, #16*1] + str q_d5_2, [x_dest5, #16*2] + str q_d5_3, [x_dest5, #16*3] + add x_dest5, x_dest5, #64 + + cmp x_src, x_src_end + bls .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #64 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + + ldr q_d4_0, [x_dest4] + ldr q_d5_0, [x_dest5] + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, 
{v_gft4_hi.16b}, v_data_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b + eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4] + str q_d5_0, [x_dest5] + + add x_src, x_src, #16 + add x_dest1, x_dest1, #16 + add x_dest2, x_dest2, #16 + add x_dest3, x_dest3, #16 + add x_dest4, x_dest4, #16 + add x_dest5, x_dest5, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + sub x_dest2, x_dest2, x_tmp + sub x_dest3, x_dest3, x_tmp + sub x_dest4, x_dest4, x_tmp + sub x_dest5, x_dest5, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + + ldr q_d4_0, [x_dest4] + ldr q_d5_0, [x_dest5] + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4] + str q_d5_0, [x_dest5] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + +ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S new file mode 100644 index 0000000000..ab374d365a --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_5vect_mad_sve.S @@ -0,0 +1,218 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_5vect_mad_sve) +#ifndef __APPLE__ +.type gf_5vect_mad_sve, %function +#endif + +/* gf_5vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest4 .req x9 +x_dest5 .req x10 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 + +cdecl(gf_5vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + /* load table 4 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft4_lo, q_gft4_hi, [x_tbl] + /* load table 5 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + 
ldp q_gft5_lo, q_gft5_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */ + ldr x_dest5, [x_dest, #8*4] /* pointer to dest5 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + prfb pldl2strm, p0, [x_dest3, x_pos] + prfb pldl2strm, p0, [x_dest4, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + ld1b z_dest4.b, p0/z, [x_dest4, x_pos] + prfb pldl2strm, p0, [x_dest5, x_pos] + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + ld1b z_dest5.b, p0/z, [x_dest5, x_pos] + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* dest4 */ + tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_tmp_lo.d, z_dest4.d + eor z_dest4.d, z_tmp_hi.d, z_dest4.d + + /* store dest data, governed by p0 */ + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + + /* dest5 */ + tbl z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_tmp_lo.d, z_dest5.d + eor z_dest5.d, z_tmp_hi.d, z_dest5.d + + /* store dest data, governed by p0 */ + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S new file mode 100644 index 0000000000..acc98953b3 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_dot_prod_sve.S @@ -0,0 +1,258 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_6vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_6vect_dot_prod_sve, %function +#endif +/* void gf_6vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_tbl6 .req x13 +x_dest1 .req x14 +x_dest2 .req x15 +x_dest6 .req x_dest /* reused */ + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest3 .req x19 +x_dest4 .req x20 +x_dest5 .req x21 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +cdecl(gf_6vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #32 /* alignment */ + stp x19, x20, [sp] + str x21, [sp, #16] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldp x_dest5, x_dest6, [x_dest, #8*4] /* x_dest6 reuses x_dest */ + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. 
*/ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + mov z_dest6.b, #0 /* clear z_dest6 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next and prefetch */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_dest1.d, z_gft1_hi.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_dest2.d, z_gft2_hi.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_dest3.d, z_gft3_hi.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 + prfb pldl2keep, p0, [x_tbl5] + prfb pldl2keep, p0, [x_tbl6] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_dest4.d, z_gft4_hi.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_dest5.d, z_gft5_hi.d + + /* dest 6 */ + tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_gft6_lo.d, z_dest6.d + eor z_dest6.d, z_dest6.d, z_gft6_hi.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x21, [sp, #16] + ldp x19, x20, [sp] + add sp, sp, #32 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git 
a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S new file mode 100644 index 0000000000..3b1b1b4b21 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_neon.S @@ -0,0 +1,618 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text +.global cdecl(gf_6vect_mad_neon) +#ifndef __APPLE__ +.type gf_6vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x7 +x_dest2 .req x8 +x_dest3 .req x9 +x_dest4 .req x10 +x_dest5 .req x11 +x_dest6 .req x_dest +x_tmp .req x12 +x_tbl1 .req x13 +x_tbl2 .req x14 +x_tbl3 .req x15 +x_tbl4 .req x16 +x_tbl5 .req x17 +x_tbl6 .req x_tbl +x_const .req x18 + +/* vectors */ +v_mask0f .req v0 +v_tmp_lo .req v1 +v_tmp_hi .req v2 +v_tmp .req v3 +q_tmp .req q3 + +v_gft1_lo .req v4 +v_gft1_hi .req v5 +v_gft2_lo .req v6 +v_gft2_hi .req v7 +v_gft3_lo .req v16 +v_gft3_hi .req v17 +q_gft1_lo .req q4 +q_gft1_hi .req q5 +q_gft2_lo .req q6 +q_gft2_hi .req q7 +q_gft3_lo .req q16 +q_gft3_hi .req q17 + +v_gft4_lo .req v18 +v_gft4_hi .req v19 +q_gft4_lo .req q18 +q_gft4_hi .req q19 +v_gft5_lo .req v_gft2_lo +v_gft5_hi .req v_gft2_hi +q_gft5_lo .req q_gft2_lo +q_gft5_hi .req q_gft2_hi +v_gft6_lo .req v_gft3_lo +v_gft6_hi .req v_gft3_hi +q_gft6_lo .req q_gft3_lo +q_gft6_hi .req q_gft3_hi + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 + +v_data_0_lo .req v12 +v_data_1_lo .req v13 +v_data_2_lo .req v14 +v_data_3_lo .req v15 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 + +v_d1_0 .req v20 +v_d1_1 .req v21 +v_d1_2 .req v22 +v_d1_3 .req v23 +v_d2_0 .req v24 +v_d2_1 .req v25 +v_d2_2 .req v26 +v_d2_3 .req v27 +v_d3_0 .req v28 +v_d3_1 .req v29 +v_d3_2 .req v30 +v_d3_3 .req v31 +q_d1_0 .req q20 +q_d1_1 .req q21 +q_d1_2 .req q22 +q_d1_3 .req q23 +q_d2_0 .req q24 +q_d2_1 .req q25 +q_d2_2 .req q26 +q_d2_3 .req q27 +q_d3_0 .req q28 +q_d3_1 .req q29 +q_d3_2 .req q30 +q_d3_3 .req q31 + +v_d4_0 .req v_d1_0 +v_d4_1 .req v_d1_1 +v_d4_2 .req v_d1_2 +v_d4_3 .req v_d1_3 +q_d4_0 .req q_d1_0 +q_d4_1 .req q_d1_1 +q_d4_2 .req q_d1_2 +q_d4_3 .req q_d1_3 +v_d5_0 .req v_d2_0 +v_d5_1 .req v_d2_1 +v_d5_2 .req v_d2_2 +v_d5_3 .req v_d2_3 +q_d5_0 .req q_d2_0 +q_d5_1 .req q_d2_1 +q_d5_2 .req q_d2_2 +q_d5_3 .req q_d2_3 +v_d6_0 .req v_d3_0 +v_d6_1 .req v_d3_1 +v_d6_2 .req v_d3_2 +v_d6_3 .req v_d3_3 +q_d6_0 .req q_d3_0 +q_d6_1 .req q_d3_1 +q_d6_2 .req q_d3_2 +q_d6_3 .req q_d3_3 + +v_data .req v21 +q_data .req q21 +v_data_lo .req v22 +v_data_hi .req v23 + +cdecl(gf_6vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + lsl x_vec, x_vec, #5 + add x_tbl1, x_tbl, x_vec_i + add x_tbl2, x_tbl1, x_vec + add x_tbl3, x_tbl2, x_vec + add x_tbl4, x_tbl3, x_vec + add x_tbl5, x_tbl4, x_vec + add x_tbl6, x_tbl5, x_vec + add x_src_end, x_src, x_len + ldr x_dest1, [x_dest, #8*0] + ldr x_dest2, [x_dest, #8*1] + ldr x_dest3, [x_dest, #8*2] + ldr x_dest4, [x_dest, #8*3] + ldr x_dest5, [x_dest, #8*4] + ldr x_dest6, [x_dest, #8*5] + ldr q_gft1_lo, [x_tbl1] + ldr q_gft1_hi, [x_tbl1, #16] + ldr q_gft4_lo, [x_tbl4] + ldr q_gft4_hi, [x_tbl4, #16] + +.Lloop64_init: + /* less than 64 bytes, goto Lloop16_init */ + cmp x_len, #64 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #64 
+ +.Lloop64: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + add x_src, x_src, #64 + + ldr q_d1_0, [x_dest1, #16*0] + ldr q_d1_1, [x_dest1, #16*1] + ldr q_d1_2, [x_dest1, #16*2] + ldr q_d1_3, [x_dest1, #16*3] + + ldr q_d2_0, [x_dest2, #16*0] + ldr q_d2_1, [x_dest2, #16*1] + ldr q_d2_2, [x_dest2, #16*2] + ldr q_d2_3, [x_dest2, #16*3] + + ldr q_d3_0, [x_dest3, #16*0] + ldr q_d3_1, [x_dest3, #16*1] + ldr q_d3_2, [x_dest3, #16*2] + ldr q_d3_3, [x_dest3, #16*3] + + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + + /* dest1 */ + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b + eor v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + eor v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b + eor v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + eor v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b + eor v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b + + /* dest2 */ + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b + eor v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b + eor v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b + eor v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b + eor v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b + eor v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b + eor v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b + + /* dest3 */ + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b + eor v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b + eor v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b + eor v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b + eor v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b + eor v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b + eor v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1, #16*0] + str q_d1_1, [x_dest1, #16*1] + str q_d1_2, [x_dest1, #16*2] + str q_d1_3, [x_dest1, #16*3] + add x_dest1, x_dest1, #64 + + str 
q_d2_0, [x_dest2, #16*0] + str q_d2_1, [x_dest2, #16*1] + str q_d2_2, [x_dest2, #16*2] + str q_d2_3, [x_dest2, #16*3] + add x_dest2, x_dest2, #64 + + str q_d3_0, [x_dest3, #16*0] + str q_d3_1, [x_dest3, #16*1] + str q_d3_2, [x_dest3, #16*2] + str q_d3_3, [x_dest3, #16*3] + add x_dest3, x_dest3, #64 + + ldr q_d4_0, [x_dest4, #16*0] + ldr q_d4_1, [x_dest4, #16*1] + ldr q_d4_2, [x_dest4, #16*2] + ldr q_d4_3, [x_dest4, #16*3] + + ldr q_d5_0, [x_dest5, #16*0] + ldr q_d5_1, [x_dest5, #16*1] + ldr q_d5_2, [x_dest5, #16*2] + ldr q_d5_3, [x_dest5, #16*3] + + ldr q_d6_0, [x_dest6, #16*0] + ldr q_d6_1, [x_dest6, #16*1] + ldr q_d6_2, [x_dest6, #16*2] + ldr q_d6_3, [x_dest6, #16*3] + + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + ldr q_gft6_lo, [x_tbl6] + ldr q_gft6_hi, [x_tbl6, #16] + + /* dest4 */ + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b + eor v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b + eor v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b + eor v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b + eor v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b + eor v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b + eor v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b + + /* dest5 */ + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b + eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b + eor v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b + eor v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b + eor v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b + eor v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b + eor v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b + eor v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b + + /* dest6 */ + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_0_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_0_hi.16b + eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b + eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_1_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_1_hi.16b + eor v_d6_1.16b, v_tmp_lo.16b, v_d6_1.16b + eor v_d6_1.16b, v_d6_1.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_2_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_2_hi.16b + eor v_d6_2.16b, v_tmp_lo.16b, v_d6_2.16b + eor v_d6_2.16b, v_d6_2.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_3_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_3_hi.16b + eor v_d6_3.16b, v_tmp_lo.16b, v_d6_3.16b + eor v_d6_3.16b, v_d6_3.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4, #16*0] + str q_d4_1, [x_dest4, #16*1] + str q_d4_2, [x_dest4, #16*2] + str q_d4_3, [x_dest4, #16*3] + add x_dest4, x_dest4, #64 + + str q_d5_0, [x_dest5, #16*0] + str q_d5_1, [x_dest5, #16*1] + str q_d5_2, [x_dest5, #16*2] + str q_d5_3, [x_dest5, #16*3] + add x_dest5, x_dest5, #64 + + str q_d6_0, [x_dest6, #16*0] + str q_d6_1, [x_dest6, #16*1] + str q_d6_2, 
[x_dest6, #16*2] + str q_d6_3, [x_dest6, #16*3] + add x_dest6, x_dest6, #64 + + cmp x_src, x_src_end + bls .Lloop64 + +.Lloop64_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #64 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + + ldr q_d4_0, [x_dest4] + ldr q_d5_0, [x_dest5] + ldr q_d6_0, [x_dest6] + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + ldr q_gft6_lo, [x_tbl6] + ldr q_gft6_hi, [x_tbl6, #16] + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + eor v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b + eor v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b + eor v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b + eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4] + str q_d5_0, [x_dest5] + str q_d6_0, [x_dest6] + + add x_src, x_src, #16 + add x_dest1, x_dest1, #16 + add x_dest2, x_dest2, #16 + add x_dest3, x_dest3, #16 + add x_dest4, x_dest4, #16 + add x_dest5, x_dest5, #16 + add x_dest6, x_dest6, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + sub x_dest2, x_dest2, x_tmp + sub x_dest3, x_dest3, x_tmp + sub x_dest4, x_dest4, x_tmp + sub x_dest5, x_dest5, x_tmp + sub x_dest6, x_dest6, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + ldr q_d2_0, [x_dest2] + ldr q_d3_0, [x_dest3] + ldr q_gft2_lo, [x_tbl2] + ldr q_gft2_hi, [x_tbl2, #16] + ldr q_gft3_lo, [x_tbl3] + ldr q_gft3_hi, [x_tbl3, #16] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b + tbl 
v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b + + str q_d1_0, [x_dest1] + str q_d2_0, [x_dest2] + str q_d3_0, [x_dest3] + + ldr q_d4_0, [x_dest4] + ldr q_d5_0, [x_dest5] + ldr q_d6_0, [x_dest6] + ldr q_gft5_lo, [x_tbl5] + ldr q_gft5_hi, [x_tbl5, #16] + ldr q_gft6_lo, [x_tbl6] + ldr q_gft6_hi, [x_tbl6, #16] + + tbl v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b + + tbl v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b + tbl v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b + eor v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b + and v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b + eor v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b + + str q_d4_0, [x_dest4] + str q_d5_0, [x_dest5] + str q_d6_0, [x_dest6] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + +ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S new file mode 100644 index 0000000000..c4f372cd73 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_6vect_mad_sve.S @@ -0,0 +1,237 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_6vect_mad_sve) +#ifndef __APPLE__ +.type gf_6vect_mad_sve, %function +#endif + +/* gf_6vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char **dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 +x_dest2 .req x7 +x_dest3 .req x8 +x_dest4 .req x9 +x_dest5 .req x10 +x_dest6 .req x11 +x_dest1 .req x12 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 + +z_tmp_lo .req z4 +z_tmp_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +cdecl(gf_6vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + /* load table 1 */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + /* load table 2 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl] + /* load table 3 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft3_lo, q_gft3_hi, [x_tbl] + /* load table 4 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft4_lo, q_gft4_hi, [x_tbl] + /* load table 5 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft5_lo, q_gft5_hi, [x_tbl] + /* load table 6 */ + add x_tbl, x_tbl, x_vec, LSL #5 /* x_tbl += x_vec * 2^5 */ + ldp q_gft6_lo, q_gft6_hi, [x_tbl] + + ldr x_dest1, [x_dest, #8*0] /* pointer to dest1 */ + ldr x_dest2, [x_dest, #8*1] /* pointer to dest2 */ + ldr x_dest3, [x_dest, #8*2] /* pointer to dest3 */ + ldr x_dest4, [x_dest, #8*3] /* pointer to dest4 */ + ldr x_dest5, [x_dest, #8*4] /* pointer to dest5 */ + ldr x_dest6, [x_dest, #8*5] /* pointer to dest6 */ + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + prfb pldl2strm, p0, [x_dest1, x_pos] + prfb pldl2strm, p0, [x_dest2, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest1.b, p0/z, [x_dest1, x_pos] + ld1b z_dest2.b, p0/z, [x_dest2, x_pos] + + prfb pldl2strm, p0, [x_dest3, x_pos] + prfb pldl2strm, p0, [x_dest4, x_pos] + + /* dest1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest1.d, z_tmp_lo.d, z_dest1.d + eor z_dest1.d, z_tmp_hi.d, z_dest1.d + + ld1b z_dest3.b, p0/z, [x_dest3, x_pos] + ld1b z_dest4.b, p0/z, [x_dest4, x_pos] + + prfb pldl2strm, p0, [x_dest5, x_pos] + prfb pldl2strm, p0, [x_dest6, x_pos] + + /* dest2 */ + tbl z_tmp_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_tmp_lo.d, z_dest2.d + eor z_dest2.d, z_tmp_hi.d, z_dest2.d + + ld1b z_dest5.b, p0/z, [x_dest5, x_pos] + ld1b z_dest6.b, p0/z, [x_dest6, x_pos] + + /* dest3 */ + tbl z_tmp_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_tmp_lo.d, z_dest3.d + eor z_dest3.d, z_tmp_hi.d, z_dest3.d + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + + /* dest4 */ + tbl z_tmp_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_tmp_lo.d, z_dest4.d + eor z_dest4.d, z_tmp_hi.d, z_dest4.d + + /* dest5 */ + tbl z_tmp_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_tmp_lo.d, z_dest5.d + eor z_dest5.d, z_tmp_hi.d, z_dest5.d + + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + + /* dest6 */ + tbl z_tmp_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_tmp_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_tmp_lo.d, z_dest6.d + eor z_dest6.d, z_tmp_hi.d, z_dest6.d + + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S new file mode 100644 index 0000000000..0f74873de0 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_7vect_dot_prod_sve.S @@ -0,0 +1,281 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_7vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_7vect_dot_prod_sve, %function +#endif +/* void gf_7vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_tbl6 .req x13 +x_tbl7 .req x14 + +x_dest1 .req x15 + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest2 .req x19 +x_dest3 .req x20 +x_dest4 .req x21 +x_dest5 .req x22 +x_dest6 .req x23 +x_dest7 .req x_dest /* reused */ + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +z_gft7_lo .req z6 +z_gft7_hi .req z7 +q_gft7_lo .req q6 +q_gft7_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_dest7 .req z16 + +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +cdecl(gf_7vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #48 /* alignment */ + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + str x23, [sp, #32] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldp x_dest5, x_dest6, [x_dest, #8*4] + ldr x_dest7, [x_dest, #8*6] /* x_dest7 reuses x_dest */ + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + mov z_dest6.b, #0 /* clear z_dest6 */ + mov z_dest7.b, #0 /* clear z_dest7 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ + add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */ + +/* Loop 2: x_vec, number of source vectors (ie. 
data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next and prefetch */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_gft1_hi.d, z_dest1.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_gft2_hi.d, z_dest2.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_gft3_hi.d, z_dest3.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 + prfb pldl2keep, p0, [x_tbl5] + prfb pldl2keep, p0, [x_tbl6] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_gft4_hi.d, z_dest4.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_gft5_hi.d, z_dest5.d + + ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32 + prfb pldl2keep, p0, [x_tbl7] + + /* dest 6 */ + tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_gft6_lo.d, z_dest6.d + eor z_dest6.d, z_gft6_hi.d, z_dest6.d + + /* dest 7 */ + tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b + tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b + eor z_dest7.d, z_gft7_lo.d, z_dest7.d + eor z_dest7.d, z_gft7_hi.d, z_dest7.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + st1b z_dest7.b, p0, [x_dest7, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x23, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp] + add sp, sp, #48 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S new file mode 100644 index 0000000000..20768f4889 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_8vect_dot_prod_sve.S @@ -0,0 +1,307 @@ +/************************************************************* + Copyright (c) 2021 Linaro Ltd. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_8vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_8vect_dot_prod_sve, %function +#endif +/* void gf_8vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char **dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. 
data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 + +x_tbl1 .req x8 +x_tbl2 .req x9 +x_tbl3 .req x10 +x_tbl4 .req x11 +x_tbl5 .req x12 +x_tbl6 .req x13 +x_tbl7 .req x14 + +x_dest1 .req x15 + +/* r16,r17,r18,r29,r30: special role registers, avoided */ +/* r19..r29 and SP must be preserved */ +x_dest2 .req x19 +x_dest3 .req x20 +x_dest4 .req x21 +x_dest5 .req x22 +x_dest6 .req x23 +x_dest7 .req x24 +x_dest8 .req x_dest /* reused */ +x_tbl8 .req x25 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest1 .req z3 +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +z_gft7_lo .req z6 +z_gft7_hi .req z7 +q_gft7_lo .req q6 +q_gft7_hi .req q7 + +/* bottom 64-bit of v8..v15 must be preserved if used */ +z_dest7 .req z8 + +z_gft8_lo .req z9 +z_gft8_hi .req z10 +q_gft8_lo .req q9 +q_gft8_hi .req q10 + +z_dest8 .req z16 + +z_gft2_lo .req z17 +z_gft2_hi .req z18 +q_gft2_lo .req q17 +q_gft2_hi .req q18 + +z_gft3_lo .req z19 +z_gft3_hi .req z20 +q_gft3_lo .req q19 +q_gft3_hi .req q20 + +z_gft4_lo .req z21 +z_gft4_hi .req z22 +q_gft4_lo .req q21 +q_gft4_hi .req q22 + +z_gft5_lo .req z23 +z_gft5_hi .req z24 +q_gft5_lo .req q23 +q_gft5_hi .req q24 + +z_gft6_lo .req z25 +z_gft6_hi .req z26 +q_gft6_lo .req q25 +q_gft6_hi .req q26 + +z_dest2 .req z27 +z_dest3 .req z28 +z_dest4 .req z29 +z_dest5 .req z30 +z_dest6 .req z31 + +cdecl(gf_8vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + /* save r19..r29 */ + sub sp, sp, #80 /* alignment */ + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + str d10, [sp, #56] + str x25, [sp, #64] + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + ldp x_dest1, x_dest2, [x_dest, #8*0] + ldp x_dest3, x_dest4, [x_dest, #8*2] + ldp x_dest5, x_dest6, [x_dest, #8*4] + ldp x_dest7, x_dest8, [x_dest, #8*6] /* x_dest8 reuses x_dest */ + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov x_vec_i, #0 /* clear x_vec_i */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + mov z_dest1.b, #0 /* clear z_dest1 */ + mov z_dest2.b, #0 /* clear z_dest2 */ + mov z_dest3.b, #0 /* clear z_dest3 */ + mov z_dest4.b, #0 /* clear z_dest4 */ + mov z_dest5.b, #0 /* clear z_dest5 */ + mov z_dest6.b, #0 /* clear z_dest6 */ + mov z_dest7.b, #0 /* clear z_dest7 */ + mov z_dest8.b, #0 /* clear z_dest8 */ + + /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + add x_tbl2, x_tbl1, x_vec, LSL #2 /* reset x_tbl2 */ + add x_tbl3, x_tbl2, x_vec, LSL #2 /* reset x_tbl3 */ + add x_tbl4, x_tbl3, x_vec, LSL #2 /* reset x_tbl4 */ + add x_tbl5, x_tbl4, x_vec, LSL #2 /* reset x_tbl5 */ + add x_tbl6, x_tbl5, x_vec, LSL #2 /* reset x_tbl6 */ + add x_tbl7, x_tbl6, x_vec, LSL #2 /* reset x_tbl7 */ + add x_tbl8, x_tbl7, x_vec, LSL #2 /* reset x_tbl8 */ + +/* Loop 2: x_vec, number of source vectors (ie. 
data blocks) */ +.Lloopsve_vl_vects: + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* gf_tbl addr: (x_tbl + dest_idx * x_vec * 32) + src_vec_idx * 32 */ + /* load gf_table's */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is post-added by #32 for each src vect */ + ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 + + /* prefetch */ + prfb pldl2keep, p0, [x_tbl1] + prfb pldl2keep, p0, [x_tbl2] + + /* calc for next and prefetch */ + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + + /* dest 1 */ + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. gf(2^8) add */ + eor z_dest1.d, z_gft1_lo.d, z_dest1.d + eor z_dest1.d, z_gft1_hi.d, z_dest1.d + + ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 + ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 + prfb pldl2keep, p0, [x_tbl3] + prfb pldl2keep, p0, [x_tbl4] + + /* dest 2 */ + tbl z_gft2_lo.b, {z_gft2_lo.b}, z_src_lo.b + tbl z_gft2_hi.b, {z_gft2_hi.b}, z_src_hi.b + eor z_dest2.d, z_gft2_lo.d, z_dest2.d + eor z_dest2.d, z_gft2_hi.d, z_dest2.d + + /* dest 3 */ + tbl z_gft3_lo.b, {z_gft3_lo.b}, z_src_lo.b + tbl z_gft3_hi.b, {z_gft3_hi.b}, z_src_hi.b + eor z_dest3.d, z_gft3_lo.d, z_dest3.d + eor z_dest3.d, z_gft3_hi.d, z_dest3.d + + ldp q_gft5_lo, q_gft5_hi, [x_tbl5], #32 + ldp q_gft6_lo, q_gft6_hi, [x_tbl6], #32 + prfb pldl2keep, p0, [x_tbl5] + prfb pldl2keep, p0, [x_tbl6] + + /* dest 4 */ + tbl z_gft4_lo.b, {z_gft4_lo.b}, z_src_lo.b + tbl z_gft4_hi.b, {z_gft4_hi.b}, z_src_hi.b + eor z_dest4.d, z_gft4_lo.d, z_dest4.d + eor z_dest4.d, z_gft4_hi.d, z_dest4.d + + /* dest 5 */ + tbl z_gft5_lo.b, {z_gft5_lo.b}, z_src_lo.b + tbl z_gft5_hi.b, {z_gft5_hi.b}, z_src_hi.b + eor z_dest5.d, z_gft5_lo.d, z_dest5.d + eor z_dest5.d, z_gft5_hi.d, z_dest5.d + + ldp q_gft7_lo, q_gft7_hi, [x_tbl7], #32 + ldp q_gft8_lo, q_gft8_hi, [x_tbl8], #32 + prfb pldl2keep, p0, [x_tbl7] + prfb pldl2keep, p0, [x_tbl8] + + /* dest 6 */ + tbl z_gft6_lo.b, {z_gft6_lo.b}, z_src_lo.b + tbl z_gft6_hi.b, {z_gft6_hi.b}, z_src_hi.b + eor z_dest6.d, z_gft6_lo.d, z_dest6.d + eor z_dest6.d, z_gft6_hi.d, z_dest6.d + + /* dest 7 */ + tbl z_gft7_lo.b, {z_gft7_lo.b}, z_src_lo.b + tbl z_gft7_hi.b, {z_gft7_hi.b}, z_src_hi.b + eor z_dest7.d, z_gft7_lo.d, z_dest7.d + eor z_dest7.d, z_gft7_hi.d, z_dest7.d + + /* dest 8 */ + tbl z_gft8_lo.b, {z_gft8_lo.b}, z_src_lo.b + tbl z_gft8_hi.b, {z_gft8_hi.b}, z_src_hi.b + eor z_dest8.d, z_gft8_lo.d, z_dest8.d + eor z_dest8.d, z_gft8_hi.d, z_dest8.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects +/* end of Loop 2 */ + + /* store dest data, governed by p0 */ + st1b z_dest1.b, p0, [x_dest1, x_pos] + st1b z_dest2.b, p0, [x_dest2, x_pos] + st1b z_dest3.b, p0, [x_dest3, x_pos] + st1b z_dest4.b, p0, [x_dest4, x_pos] + st1b z_dest5.b, p0, [x_dest5, x_pos] + st1b z_dest6.b, p0, [x_dest6, x_pos] + st1b z_dest7.b, p0, [x_dest7, x_pos] + st1b z_dest8.b, p0, [x_dest8, x_pos] + + /* increment one vector length */ + incb x_pos + b .Lloopsve_vl +/* end of Loop 1 */ + +.return_pass: + /* restore r19..r29 */ + ldr x25, [sp, #64] + ldr d10, [sp, #56] + ldp d8, d9, [sp, #48] + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp] + add sp, sp, #80 + + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git 
a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S new file mode 100644 index 0000000000..4d17362894 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_neon.S @@ -0,0 +1,303 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_vect_dot_prod_neon) +#ifndef __APPLE__ +.type gf_vect_dot_prod_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_tbl .req x2 +x_src .req x3 +x_dest1 .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tmp .req x8 +x_tbl1 .req x9 + +/* vectors */ +v_gft1_lo .req v0 +v_gft1_hi .req v1 +q_gft1_lo .req q0 +q_gft1_hi .req q1 +v_mask0f .req v2 +q_mask0f .req q2 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +v_data_4 .req v12 +v_data_5 .req v13 +v_data_6 .req v14 +v_data_7 .req v15 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 +q_data_4 .req q12 +q_data_5 .req q13 +q_data_6 .req q14 +q_data_7 .req q15 + +v_data_0_lo .req v16 +v_data_1_lo .req v17 +v_data_2_lo .req v18 +v_data_3_lo .req v19 +v_data_4_lo .req v20 +v_data_5_lo .req v21 +v_data_6_lo .req v22 +v_data_7_lo .req v23 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 +v_data_4_hi .req v_data_4 +v_data_5_hi .req v_data_5 +v_data_6_hi .req v_data_6 +v_data_7_hi .req v_data_7 + +v_p0 .req v24 +v_p1 .req v25 +v_p2 .req v26 +v_p3 .req v27 +v_p4 .req v28 +v_p5 .req v29 +v_p6 .req v30 +v_p7 .req v31 +q_p0 .req q24 +q_p1 .req q25 +q_p2 .req q26 +q_p3 .req q27 +q_p4 .req q28 +q_p5 .req q29 +q_p6 .req q30 +q_p7 .req q31 + +v_p .req v_p0 +q_p .req q_p0 +v_data .req v_p1 +q_data .req q_p1 +v_data_lo .req v_p2 +v_data_hi .req v_p3 + + +cdecl(gf_vect_dot_prod_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + mov x_pos, #0 + + lsl x_vec, x_vec, #3 + +.Lloop128_init: + /* less than 128 bytes, goto Lloop16_init */ + cmp x_len, #128 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_len, x_len, #128 + +.Lloop128: + movi v_p0.16b, #0 + movi v_p1.16b, #0 + movi v_p2.16b, #0 + movi v_p3.16b, #0 + movi v_p4.16b, #0 + movi v_p5.16b, #0 + movi v_p6.16b, #0 + movi v_p7.16b, #0 + + mov x_tbl1, x_tbl + mov x_vec_i, #0 + +.Lloop128_vects: + ldr x_ptr, [x_src, x_vec_i] + add x_vec_i, x_vec_i, #8 + add x_ptr, x_ptr, x_pos + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + + ldp q_data_0, q_data_1, [x_ptr], #32 + ldp q_data_2, q_data_3, [x_ptr], #32 + ldp q_data_4, q_data_5, [x_ptr], #32 + ldp q_data_6, q_data_7, [x_ptr] + + prfm pldl1keep, [x_tbl1] + prfm pldl1strm, [x_ptr] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b + and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b + and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b + and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + ushr v_data_4_hi.16b, v_data_4.16b, #4 + ushr v_data_5_hi.16b, v_data_5.16b, #4 + ushr v_data_6_hi.16b, v_data_6.16b, #4 + ushr v_data_7_hi.16b, v_data_7.16b, #4 + + tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_data_2_lo.16b, {v_gft1_lo.16b}, 
v_data_2_lo.16b + tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b + tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b + tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b + tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b + + tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b + tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b + tbl v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b + tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b + + eor v_p0.16b, v_data_0_lo.16b, v_p0.16b + eor v_p0.16b, v_p0.16b, v_data_0_hi.16b + eor v_p1.16b, v_data_1_lo.16b, v_p1.16b + eor v_p1.16b, v_p1.16b, v_data_1_hi.16b + eor v_p2.16b, v_data_2_lo.16b, v_p2.16b + eor v_p2.16b, v_p2.16b, v_data_2_hi.16b + eor v_p3.16b, v_data_3_lo.16b, v_p3.16b + eor v_p3.16b, v_p3.16b, v_data_3_hi.16b + eor v_p4.16b, v_data_4_lo.16b, v_p4.16b + eor v_p4.16b, v_p4.16b, v_data_4_hi.16b + eor v_p5.16b, v_data_5_lo.16b, v_p5.16b + eor v_p5.16b, v_p5.16b, v_data_5_hi.16b + eor v_p6.16b, v_data_6_lo.16b, v_p6.16b + eor v_p6.16b, v_p6.16b, v_data_6_hi.16b + eor v_p7.16b, v_data_7_lo.16b, v_p7.16b + eor v_p7.16b, v_p7.16b, v_data_7_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop128_vects + +.Lloop128_vects_end: + add x_ptr, x_dest1, x_pos + stp q_p0, q_p1, [x_ptr], #32 + stp q_p2, q_p3, [x_ptr], #32 + stp q_p4, q_p5, [x_ptr], #32 + stp q_p6, q_p7, [x_ptr] + + add x_pos, x_pos, #128 + cmp x_pos, x_len + ble .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + + add x_len, x_len, #128 + cmp x_pos, x_len + beq .return_pass + +.Lloop16_init: + sub x_len, x_len, #16 + cmp x_pos, x_len + bgt .lessthan16_init + +.Lloop16: + movi v_p.16b, #0 + mov x_tbl1, x_tbl + mov x_vec_i, #0 + +.Lloop16_vects: + ldr x_ptr, [x_src, x_vec_i] + ldr q_data, [x_ptr, x_pos] + add x_vec_i, x_vec_i, #8 + + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_data_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_data_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_p.16b, v_data_lo.16b, v_p.16b + eor v_p.16b, v_p.16b, v_data_hi.16b + + cmp x_vec_i, x_vec + blt .Lloop16_vects + +.Lloop16_vects_end: + str q_p, [x_dest1, x_pos] + add x_pos, x_pos, #16 + cmp x_pos, x_len + ble .Lloop16 + +.Lloop16_end: + sub x_tmp, x_pos, x_len + cmp x_tmp, #16 + beq .return_pass + +.lessthan16_init: + mov x_pos, x_len + b .Lloop16 + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S new file mode 100644 index 0000000000..48ce151fde --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_dot_prod_sve.S @@ -0,0 +1,132 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_vect_dot_prod_sve) +#ifndef __APPLE__ +.type gf_vect_dot_prod_sve, %function +#endif +/* void gf_vect_dot_prod_sve(int len, int vlen, unsigned char *gftbls, + unsigned char **src, unsigned char *dest); + */ + +/* arguments */ +x_len .req x0 /* vector length */ +x_vec .req x1 /* number of source vectors (ie. data blocks) */ +x_tbl .req x2 +x_src .req x3 +x_dest1 .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_vec_i .req x5 +x_ptr .req x6 +x_pos .req x7 +x_tbl1 .req x8 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest .req z3 + +z_gft1_lo .req z4 +z_gft1_hi .req z5 +q_gft1_lo .req q4 +q_gft1_hi .req q5 + +cdecl(gf_vect_dot_prod_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + lsl x_vec, x_vec, #3 + +/* Loop 1: x_len, vector length */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + mov z_dest.b, #0 /* clear z_dest */ + mov x_vec_i, #0 /* clear x_vec_i */ + mov x_tbl1, x_tbl /* reset x_tbl1 */ + +/* Loop 2: x_vec, number of source vectors (ie. data blocks) */ +.Lloopsve_vl_vects: + ldr x_ptr, [x_src, x_vec_i] /* x_ptr: src base addr. */ + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_ptr, x_pos] /* load from: src base + pos offset */ + + add x_vec_i, x_vec_i, #8 /* move x_vec_i to next */ + + /* load gf_table */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 /* x_tbl1 is added by #32 + for each src vect */ + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_gft1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_gft1_hi.b, {z_gft1_hi.b}, z_src_hi.b + + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest.d, z_gft1_lo.d, z_dest.d + eor z_dest.d, z_gft1_hi.d, z_dest.d + + cmp x_vec_i, x_vec + blt .Lloopsve_vl_vects + + /* end of Loop 2 */ + /* store dest data, governed by p0 */ + st1b z_dest.b, p0, [x_dest1, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S new file mode 100644 index 0000000000..bc2b957820 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_neon.S @@ -0,0 +1,324 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_vect_mad_neon) +#ifndef __APPLE__ +.type gf_vect_mad_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_src_end .req x6 +x_dest1 .req x_dest +x_tmp .req x7 +x_const .req x8 + +/* vectors */ +v_mask0f .req v0 +v_tmp .req v1 +q_tmp .req q1 + +v_tmp1_lo .req v2 +v_tmp1_hi .req v3 +v_tmp2_lo .req v4 +v_tmp2_hi .req v5 + +v_gft1_lo .req v6 +v_gft1_hi .req v7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +v_data_0 .req v8 +v_data_1 .req v9 +v_data_2 .req v10 +v_data_3 .req v11 +v_data_4 .req v12 +v_data_5 .req v13 +v_data_6 .req v14 +v_data_7 .req v15 +q_data_0 .req q8 +q_data_1 .req q9 +q_data_2 .req q10 +q_data_3 .req q11 +q_data_4 .req q12 +q_data_5 .req q13 +q_data_6 .req q14 +q_data_7 .req q15 + +v_data_0_lo .req v16 +v_data_1_lo .req v17 +v_data_2_lo .req v18 +v_data_3_lo .req v19 +v_data_4_lo .req v20 +v_data_5_lo .req v21 +v_data_6_lo .req v22 +v_data_7_lo .req v23 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 +v_data_4_hi .req v_data_4 +v_data_5_hi .req v_data_5 +v_data_6_hi .req v_data_6 +v_data_7_hi .req v_data_7 + +v_d1_0 .req v24 +v_d1_1 .req v25 +v_d1_2 .req v26 +v_d1_3 .req v27 +v_d1_4 .req v28 +v_d1_5 .req v29 +v_d1_6 .req v30 +v_d1_7 .req v31 +q_d1_0 .req q24 +q_d1_1 .req q25 +q_d1_2 .req q26 +q_d1_3 .req q27 +q_d1_4 .req q28 +q_d1_5 .req q29 +q_d1_6 .req q30 +q_d1_7 .req q31 + +v_data .req v_d1_1 +q_data .req q_d1_1 +v_data_lo .req v_d1_2 +v_data_hi .req v_d1_3 + + +cdecl(gf_vect_mad_neon): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + movi v_mask0f.16b, #0x0f + lsl x_vec_i, x_vec_i, #5 + add x_tbl, x_tbl, x_vec_i + add x_src_end, x_src, x_len + + ldr q_gft1_lo, [x_tbl] + ldr q_gft1_hi, [x_tbl, #16] + +.Lloop128_init: + /* less than 128 bytes, goto Lloop16_init */ + cmp x_len, #128 + blt .Lloop16_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #128 + +.Lloop128: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + ldr q_data_4, [x_src, #16*4] + ldr q_data_5, [x_src, #16*5] + ldr q_data_6, [x_src, #16*6] + ldr q_data_7, [x_src, #16*7] + + ldr q_d1_0, [x_dest1, #16*0] + ldr q_d1_1, [x_dest1, #16*1] + ldr q_d1_2, [x_dest1, #16*2] + ldr q_d1_3, [x_dest1, #16*3] + ldr q_d1_4, [x_dest1, #16*4] + ldr q_d1_5, [x_dest1, #16*5] + ldr q_d1_6, [x_dest1, #16*6] + ldr q_d1_7, [x_dest1, #16*7] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b + and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b + and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b + and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + ushr v_data_4_hi.16b, v_data_4.16b, #4 + ushr v_data_5_hi.16b, v_data_5.16b, #4 + ushr v_data_6_hi.16b, v_data_6.16b, #4 + ushr v_data_7_hi.16b, v_data_7.16b, #4 + + 
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + + eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b + eor v_d1_1.16b, v_tmp2_lo.16b, v_d1_1.16b + eor v_d1_1.16b, v_d1_1.16b, v_tmp2_hi.16b + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + + eor v_d1_2.16b, v_tmp1_lo.16b, v_d1_2.16b + eor v_d1_2.16b, v_d1_2.16b, v_tmp1_hi.16b + eor v_d1_3.16b, v_tmp2_lo.16b, v_d1_3.16b + eor v_d1_3.16b, v_d1_3.16b, v_tmp2_hi.16b + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b + tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b + tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b + + eor v_d1_4.16b, v_tmp1_lo.16b, v_d1_4.16b + eor v_d1_4.16b, v_d1_4.16b, v_tmp1_hi.16b + eor v_d1_5.16b, v_tmp2_lo.16b, v_d1_5.16b + eor v_d1_5.16b, v_d1_5.16b, v_tmp2_hi.16b + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b + tbl v_tmp2_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b + tbl v_tmp2_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b + + eor v_d1_6.16b, v_tmp1_lo.16b, v_d1_6.16b + eor v_d1_6.16b, v_d1_6.16b, v_tmp1_hi.16b + eor v_d1_7.16b, v_tmp2_lo.16b, v_d1_7.16b + eor v_d1_7.16b, v_d1_7.16b, v_tmp2_hi.16b + + str q_d1_0, [x_dest1, #16*0] + str q_d1_1, [x_dest1, #16*1] + str q_d1_2, [x_dest1, #16*2] + str q_d1_3, [x_dest1, #16*3] + str q_d1_4, [x_dest1, #16*4] + str q_d1_5, [x_dest1, #16*5] + str q_d1_6, [x_dest1, #16*6] + str q_d1_7, [x_dest1, #16*7] + + add x_src, x_src, #128 + add x_dest1, x_dest1, #128 + cmp x_src, x_src_end + bls .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #128 + +.Lloop16_init: + sub x_src_end, x_src_end, #16 + cmp x_src, x_src_end + bhi .lessthan16_init + +.Lloop16: + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_d1_0.16b, v_tmp1_lo.16b, v_d1_0.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b + + str q_d1_0, [x_dest1] + + add x_dest1, x_dest1, #16 + add x_src, x_src, #16 + cmp x_src, x_src_end + bls .Lloop16 + +.lessthan16_init: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #16 + beq .return_pass + +.lessthan16: + mov x_src, x_src_end + sub x_dest1, x_dest1, x_tmp + +#ifndef __APPLE__ + adrp x_const, const_tbl + add x_const, x_const, :lo12:const_tbl +#else + adrp x_const, const_tbl@PAGE + add x_const, x_const, const_tbl@PAGEOFF +#endif + sub x_const, x_const, x_tmp + ldr q_tmp, [x_const, #16] + + ldr q_data, [x_src] + ldr q_d1_0, [x_dest1] + + and v_data_lo.16b, v_data.16b, v_mask0f.16b + ushr v_data_hi.16b, v_data.16b, #4 + + tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b + tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b + eor v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b + and v_tmp1_hi.16b, v_tmp1_hi.16b, v_tmp.16b + eor v_d1_0.16b, v_d1_0.16b, v_tmp1_hi.16b + + str q_d1_0, [x_dest1] + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret + 
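+/* const_tbl below: 16 bytes of 0x00 followed by 16 bytes of 0xff. The .lessthan16
+ * path loads a 16-byte mask from inside it so that, when the trailing partial block
+ * is re-read as a full overlapping 16-byte block, only the dest bytes that have not
+ * been written yet receive the update. */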
+ASM_DEF_RODATA +.balign 8 +const_tbl: + .dword 0x0000000000000000, 0x0000000000000000 + .dword 0xffffffffffffffff, 0xffffffffffffffff diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S new file mode 100644 index 0000000000..41d6da9d9a --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mad_sve.S @@ -0,0 +1,126 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_vect_mad_sve) +#ifndef __APPLE__ +.type gf_vect_mad_sve, %function +#endif + +/* gf_vect_mad_sve(int len, int vec, int vec_i, unsigned char *gftbls, + unsigned char *src, unsigned char *dest); + */ +/* arguments */ +x_len .req x0 +x_vec .req x1 +x_vec_i .req x2 +x_tbl .req x3 +x_src .req x4 +x_dest .req x5 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x6 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src + +z_dest .req z3 + +z_tmp1_lo .req z4 +z_tmp1_hi .req z5 + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +cdecl(gf_vect_mad_sve): + /* less than 16 bytes, return_fail */ + cmp x_len, #16 + blt .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + add x_tbl, x_tbl, x_vec_i, LSL #5 /* x_tbl += x_vec_i * 2^5 */ + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + + mov x_pos, #0 + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* prefetch dest data */ + prfb pldl2strm, p0, [x_dest, x_pos] + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* load dest data, governed by p0 */ + ld1b z_dest.b, p0/z, [x_dest, x_pos] + + /* table indexing, ie. 
gf(2^8) multiplication */ + tbl z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b + + /* exclusive or, ie. gf(2^8) add */ + eor z_dest.d, z_tmp1_lo.d, z_dest.d + eor z_dest.d, z_tmp1_hi.d, z_dest.d + + /* store dest data, governed by p0 */ + st1b z_dest.b, p0, [x_dest, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S new file mode 100644 index 0000000000..096b91dd29 --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_neon.S @@ -0,0 +1,240 @@ +/************************************************************** + Copyright (c) 2019 Huawei Technologies Co., Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../include/aarch64_label.h" + +.text + +.global cdecl(gf_vect_mul_neon) +#ifndef __APPLE__ +.type gf_vect_mul_neon, %function +#endif + +/* arguments */ +x_len .req x0 +x_tbl .req x1 +x_src .req x2 +x_dest .req x3 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_dest1 .req x_dest +x_src_end .req x4 +x_tmp .req x5 + +/* vectors */ +v_mask0f .req v0 + +v_gft1_lo .req v2 +v_gft1_hi .req v3 +q_gft1_lo .req q2 +q_gft1_hi .req q3 + +v_data_0 .req v16 +v_data_1 .req v17 +v_data_2 .req v18 +v_data_3 .req v19 +v_data_4 .req v20 +v_data_5 .req v21 +v_data_6 .req v22 +v_data_7 .req v23 +q_data_0 .req q16 +q_data_1 .req q17 +q_data_2 .req q18 +q_data_3 .req q19 +q_data_4 .req q20 +q_data_5 .req q21 +q_data_6 .req q22 +q_data_7 .req q23 + +v_data_0_lo .req v24 +v_data_1_lo .req v25 +v_data_2_lo .req v26 +v_data_3_lo .req v27 +v_data_4_lo .req v28 +v_data_5_lo .req v29 +v_data_6_lo .req v30 +v_data_7_lo .req v31 +v_data_0_hi .req v_data_0 +v_data_1_hi .req v_data_1 +v_data_2_hi .req v_data_2 +v_data_3_hi .req v_data_3 +v_data_4_hi .req v_data_4 +v_data_5_hi .req v_data_5 +v_data_6_hi .req v_data_6 +v_data_7_hi .req v_data_7 + + +cdecl(gf_vect_mul_neon): + /* less than 32 bytes, return_fail */ + cmp x_len, #32 + blt .return_fail + + movi v_mask0f.16b, #0x0f + add x_src_end, x_src, x_len + ldr q_gft1_lo, [x_tbl] + ldr q_gft1_hi, [x_tbl, #16] + + +.Lloop128_init: + /* less than 128 bytes, goto Lloop16_init */ + cmp x_len, #128 + blt .Lloop32_init + + /* save d8 ~ d15 to stack */ + sub sp, sp, #64 + stp d8, d9, [sp] + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + + sub x_src_end, x_src_end, #128 + +.Lloop128: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + ldr q_data_2, [x_src, #16*2] + ldr q_data_3, [x_src, #16*3] + ldr q_data_4, [x_src, #16*4] + ldr q_data_5, [x_src, #16*5] + ldr q_data_6, [x_src, #16*6] + ldr q_data_7, [x_src, #16*7] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b + and v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b + and v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b + and v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b + and v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b + and v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b + + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + ushr v_data_2_hi.16b, v_data_2.16b, #4 + ushr v_data_3_hi.16b, v_data_3.16b, #4 + ushr v_data_4_hi.16b, v_data_4.16b, #4 + ushr v_data_5_hi.16b, v_data_5.16b, #4 + ushr v_data_6_hi.16b, v_data_6.16b, #4 + ushr v_data_7_hi.16b, v_data_7.16b, #4 + + tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b + tbl v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b + tbl v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b + tbl v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b + tbl v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b + tbl v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b + + tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + tbl v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b + tbl v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b + tbl v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b + tbl v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b + tbl v_data_6_hi.16b, 
{v_gft1_hi.16b}, v_data_6_hi.16b + tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b + + eor v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b + eor v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b + eor v_data_2.16b, v_data_2_hi.16b, v_data_2_lo.16b + eor v_data_3.16b, v_data_3_hi.16b, v_data_3_lo.16b + eor v_data_4.16b, v_data_4_hi.16b, v_data_4_lo.16b + eor v_data_5.16b, v_data_5_hi.16b, v_data_5_lo.16b + eor v_data_6.16b, v_data_6_hi.16b, v_data_6_lo.16b + eor v_data_7.16b, v_data_7_hi.16b, v_data_7_lo.16b + + str q_data_0, [x_dest1, #16*0] + str q_data_1, [x_dest1, #16*1] + str q_data_2, [x_dest1, #16*2] + str q_data_3, [x_dest1, #16*3] + str q_data_4, [x_dest1, #16*4] + str q_data_5, [x_dest1, #16*5] + str q_data_6, [x_dest1, #16*6] + str q_data_7, [x_dest1, #16*7] + + add x_src, x_src, #128 + add x_dest1, x_dest1, #128 + cmp x_src, x_src_end + bls .Lloop128 + +.Lloop128_end: + /* restore d8 ~ d15 */ + ldp d8, d9, [sp] + ldp d10, d11, [sp, #16] + ldp d12, d13, [sp, #32] + ldp d14, d15, [sp, #48] + add sp, sp, #64 + add x_src_end, x_src_end, #128 + cmp x_src, x_src_end + beq .return_pass + +.Lloop32_init: + sub x_src_end, x_src_end, #32 + cmp x_src, x_src_end + bhi .return_fail + +.Lloop32: + ldr q_data_0, [x_src, #16*0] + ldr q_data_1, [x_src, #16*1] + + and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b + and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b + ushr v_data_0_hi.16b, v_data_0.16b, #4 + ushr v_data_1_hi.16b, v_data_1.16b, #4 + tbl v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b + tbl v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b + tbl v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b + tbl v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b + eor v_data_0.16b, v_data_0_hi.16b, v_data_0_lo.16b + eor v_data_1.16b, v_data_1_hi.16b, v_data_1_lo.16b + str q_data_0, [x_dest1, #16*0] + str q_data_1, [x_dest1, #16*1] + + add x_dest1, x_dest1, #32 + add x_src, x_src, #32 + cmp x_src, x_src_end + bls .Lloop32 + +.Lloop32_end: + sub x_tmp, x_src, x_src_end + cmp x_tmp, #32 + beq .return_pass + b .return_fail + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S new file mode 100644 index 0000000000..d2219bf54c --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/gf_vect_mul_sve.S @@ -0,0 +1,123 @@ +/************************************************************** + Copyright (c) 2021 Linaro Ltd. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Huawei Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.text +.align 6 +.arch armv8-a+sve + +#include "../include/aarch64_label.h" + +.global cdecl(gf_vect_mul_sve) +#ifndef __APPLE__ +.type gf_vect_mul_sve, %function +#endif + +/* Refer to include/gf_vect_mul.h + * + * @param len Length of vector in bytes. Must be aligned to 32B. + * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. + * @param src Pointer to src data array. Must be aligned to 32B. + * @param dest Pointer to destination data array. Must be aligned to 32B. + * @returns 0 pass, other fail + * + * int gf_vect_mul(int len, unsigned char *gftbl, void *src, void *dest); + */ + +/* arguments */ +x_len .req x0 +x_tbl .req x1 +x_src .req x2 +x_dest .req x3 +x_tmp .req x4 + +/* returns */ +w_ret .req w0 + +/* local variables */ +x_pos .req x5 + +/* vectors */ +z_mask0f .req z0 + +z_src .req z1 +z_src_lo .req z2 +z_src_hi .req z_src /* reuse */ + +z_dest .req z3 +z_tmp1_lo .req z4 +z_tmp1_hi .req z_dest /* reuse */ + +z_gft1_lo .req z6 +z_gft1_hi .req z7 +q_gft1_lo .req q6 +q_gft1_hi .req q7 + +cdecl(gf_vect_mul_sve): + /* len not aligned to 32B, return_fail */ + and x_tmp, x_len, #0x1f + cmp x_tmp, #0 + bne .return_fail + + mov z_mask0f.b, #0x0f /* z_mask0f = 0x0F0F...0F */ + mov x_pos, #0 + + /* Load with NEON instruction ldp */ + ldp q_gft1_lo, q_gft1_hi, [x_tbl] + + /* vector length agnostic */ +.Lloopsve_vl: + whilelo p0.b, x_pos, x_len + b.none .return_pass + + /* load src data, governed by p0 */ + ld1b z_src.b, p0/z, [x_src, x_pos] + + /* split 4-bit lo; 4-bit hi */ + and z_src_lo.d, z_src.d, z_mask0f.d + lsr z_src_hi.b, z_src.b, #4 + + /* table indexing, ie. gf(2^8) multiplication */ + tbl z_tmp1_lo.b, {z_gft1_lo.b}, z_src_lo.b + tbl z_tmp1_hi.b, {z_gft1_hi.b}, z_src_hi.b + /* exclusive or, ie. 
gf(2^8) add */ + eor z_dest.d, z_tmp1_hi.d, z_tmp1_lo.d + + /* store dest data, governed by p0 */ + st1b z_dest.b, p0, [x_dest, x_pos] + /* increment one vector length */ + incb x_pos + + b .Lloopsve_vl + +.return_pass: + mov w_ret, #0 + ret + +.return_fail: + mov w_ret, #1 + ret diff --git a/contrib/libs/isa-l/erasure_code/aarch64/ya.make b/contrib/libs/isa-l/erasure_code/aarch64/ya.make new file mode 100644 index 0000000000..ba7f601cfa --- /dev/null +++ b/contrib/libs/isa-l/erasure_code/aarch64/ya.make @@ -0,0 +1,51 @@ +LIBRARY() + +LICENSE(BSD-3-Clause) + +LICENSE_TEXTS(.yandex_meta/licenses.list.txt) + +VERSION(2.31) + +NO_UTIL() + +NO_COMPILER_WARNINGS() + +ADDINCL( + contrib/libs/isa-l/include +) + +IF(ARCH_AARCH64) +CFLAGS(-D__ASSEMBLY__) +SRCS( + ec_multibinary_arm.S + gf_2vect_dot_prod_neon.S + gf_2vect_dot_prod_sve.S + gf_2vect_mad_neon.S + gf_2vect_mad_sve.S + gf_3vect_dot_prod_neon.S + gf_3vect_dot_prod_sve.S + gf_3vect_mad_neon.S + gf_3vect_mad_sve.S + gf_4vect_dot_prod_neon.S + gf_4vect_dot_prod_sve.S + gf_4vect_mad_neon.S + gf_4vect_mad_sve.S + gf_5vect_dot_prod_neon.S + gf_5vect_dot_prod_sve.S + gf_5vect_mad_neon.S + gf_5vect_mad_sve.S + gf_6vect_dot_prod_sve.S + gf_6vect_mad_neon.S + gf_6vect_mad_sve.S + gf_7vect_dot_prod_sve.S + gf_8vect_dot_prod_sve.S + gf_vect_dot_prod_neon.S + gf_vect_dot_prod_sve.S + gf_vect_mad_neon.S + gf_vect_mad_sve.S + gf_vect_mul_neon.S + gf_vect_mul_sve.S +) +ENDIF() + +END() |
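
For reference, every kernel in this directory implements the same nibble-table GF(2^8) arithmetic: each input byte is split into its low and high 4-bit halves, each half indexes a 16-byte table of precomputed products (the tbl instructions on NEON/SVE), and the two lookups are XORed together and into the accumulator. The scalar sketch below is not part of the patch; it assumes the 32-bytes-per-coefficient layout produced by ec_init_tables() (16 low-nibble products followed by 16 high-nibble products) and isa-l's 0x1d reduction polynomial, and the helper names gf_mul_scalar, gf_expand_tbl and gf_vect_dot_prod_scalar are illustrative only.

/* Scalar model of the nibble-table GF(2^8) kernels above (illustrative only). */
#include <stdio.h>

/* GF(2^8) multiply, reducing by x^8 + x^4 + x^3 + x^2 + 1 (0x1d). */
static unsigned char gf_mul_scalar(unsigned char a, unsigned char b)
{
    unsigned char p = 0;
    for (int i = 0; i < 8; i++) {
        if (b & 1)
            p ^= a;
        unsigned char carry = a & 0x80;
        a <<= 1;
        if (carry)
            a ^= 0x1d;
        b >>= 1;
    }
    return p;
}

/* Expand coefficient c into the 32-byte table the tbl instructions consume:
 * tbl[i] = c*i for the low nibble, tbl[16+i] = c*(i<<4) for the high nibble. */
static void gf_expand_tbl(unsigned char c, unsigned char tbl[32])
{
    for (int i = 0; i < 16; i++) {
        tbl[i]      = gf_mul_scalar(c, (unsigned char)i);
        tbl[16 + i] = gf_mul_scalar(c, (unsigned char)(i << 4));
    }
}

/* Scalar equivalent of gf_vect_dot_prod_*(): dest[pos] is the GF(2^8) dot
 * product of column pos across vlen source buffers.  Because multiplication
 * distributes over XOR, c*b == tbl[b & 0x0f] ^ tbl[16 + (b >> 4)]. */
static void gf_vect_dot_prod_scalar(int len, int vlen,
                                    const unsigned char *gftbls,
                                    unsigned char **src, unsigned char *dest)
{
    for (int pos = 0; pos < len; pos++) {
        unsigned char s = 0;
        for (int j = 0; j < vlen; j++) {
            const unsigned char *tbl = gftbls + 32 * j;
            unsigned char b = src[j][pos];
            s ^= tbl[b & 0x0f] ^ tbl[16 + (b >> 4)];
        }
        dest[pos] = s;
    }
}

int main(void)
{
    unsigned char a[16], b[16], dest[16], tbls[64];
    unsigned char *src[2] = { a, b };
    for (int i = 0; i < 16; i++) {
        a[i] = (unsigned char)i;
        b[i] = (unsigned char)(0xf0 | i);
    }
    gf_expand_tbl(0x02, tbls);       /* coefficient for src[0] */
    gf_expand_tbl(0x1b, tbls + 32);  /* coefficient for src[1] */
    gf_vect_dot_prod_scalar(16, 2, tbls, src, dest);
    printf("dest[0] = 0x%02x\n", dest[0]);
    return 0;
}

The gf_vect_mad_* and gf_vect_mul_* kernels follow the same pattern with a single coefficient, the former XORing the products into an existing dest buffer and the latter overwriting it. The NEON variants process 16-byte (or 128-byte unrolled) blocks, while the SVE variants use whilelo/ld1b/st1b predication so one loop handles any vector length and the tail without special cases.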